Fixed macOS build.

Fix compile warnings.
Add renaming ASM codes & update from upstream.
62 changed files with 8677 additions and 1261 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -1,3 +1,12 @@
+# v0.9.0
+- **[#753](https://github.com/xmrig/xmrig/issues/753) Added new algorithm [CryptoNight variant 2](https://github.com/xmrig/xmrig/issues/753) for Monero fork, thanks [@SChernykh](https://github.com/SChernykh).**
+  - Added option `--asm`, possible values `--asm auto`, `--asm none`, `--asm intel` and `--asm ryzen`.
+- Added support for new style long and short algorithm names, possible values: `cryptonight`, `cryptonight/0`, `cryptonight/1`, `cryptonight/2`, `cryptonight-lite`, `cryptonight-lite/0`, `cryptonight-lite/1` and short equivalents `cn/2` etc. 
+- Added `--variant`, example `--algo cn --variant 2`, by default the miner automatically detects the proper variant for Monero by block version.  
+- Added CryptoNight-Lite variant 1.
+- Added xmrig-proxy autodetection, nicehash will be enabled automatically. 
+- Added workaround for xmrig-proxy [bug](https://github.com/xmrig/xmrig-proxy/commit/dfa1960fe3eeb13f80717b7dbfcc7c6e9f222d89).
+
 # v0.8.2
 - Fixed L2 cache size detection for AMD CPUs (Bulldozer/Piledriver/Steamroller/Excavator architecture).
 - Fixed gcc 7.1 support.
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -3,22 +3,26 @@ project(xmrig C)

 option(WITH_LIBCPUID "Use Libcpuid" ON)
 option(WITH_AEON     "CryptoNight-Lite support" ON)
+option(WITH_ASM      "Enable ASM PoW implementations" ON)

 set(HEADERS
-    compat.h
    algo/cryptonight/cryptonight.h
    algo/cryptonight/cryptonight_aesni.h
+    algo/cryptonight/cryptonight_monero.h
    algo/cryptonight/cryptonight_softaes.h
+    algo/cryptonight/cryptonight_test.h
+    algo/cryptonight/variant4_random_math.h
+    compat.h
+    cpu.h
+    donate.h
    elist.h
-    xmrig.h
-    version.h
    options.h
-    cpu.h
    persistent_memory.h
-    stratum.h
    stats.h
+    stratum.h
    util.h
-    donate.h
+    version.h
+    xmrig.h
   )

 set(HEADERS_CRYPTO
@ -26,6 +30,7 @@ set(HEADERS_CRYPTO
    crypto/c_blake256.h
    crypto/c_jh.h
    crypto/c_skein.h
+    crypto/soft_aes.h
   )

 set(HEADERS_COMPAT
@ -41,10 +46,14 @@ set(HEADERS_UTILS
 set(SOURCES
    xmrig.c
    algo/cryptonight/cryptonight.c
-    algo/cryptonight/cryptonight_av1_aesni.c
-    algo/cryptonight/cryptonight_av2_aesni_double.c
-    algo/cryptonight/cryptonight_av3_softaes.c
-    algo/cryptonight/cryptonight_av4_softaes_double.c
+    algo/cryptonight/cryptonight_av1.c
+    algo/cryptonight/cryptonight_av2.c
+    algo/cryptonight/cryptonight_av3.c
+    algo/cryptonight/cryptonight_av4.c
+    algo/cryptonight/cryptonight_r_av1.c
+    algo/cryptonight/cryptonight_r_av2.c
+    algo/cryptonight/cryptonight_r_av3.c
+    algo/cryptonight/cryptonight_r_av4.c
    util.c
    options.c
    stratum.c
@ -58,7 +67,6 @@ set(SOURCES_CRYPTO
    crypto/c_blake256.c
    crypto/c_jh.c
    crypto/c_skein.c
-    crypto/soft_aes.c
   )

 set(SOURCES_UTILS
@ -68,13 +76,13 @@ set(SOURCES_UTILS

 if (WIN32)
    set(SOURCES_OS win/cpu_win.c win/memory_win.c win/xmrig_win.c win/app.rc compat/winansi.c)
-    set(EXTRA_LIBS ws2_32)
+    set(EXTRA_LIBS ws2_32 crypt32)
    add_definitions(/D_WIN32_WINNT=0x600)
 elseif (APPLE)
    set(SOURCES_OS mac/cpu_mac.c mac/memory_mac.c mac/xmrig_mac.c)
 else()
    set(SOURCES_OS unix/cpu_unix.c unix/memory_unix.c unix/xmrig_unix.c)
-    set(EXTRA_LIBS pthread)
+    set(EXTRA_LIBS pthread rt m)
 endif()

 include_directories(.)
@ -89,9 +97,9 @@ endif()
 set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -maes -Wno-pointer-to-int-cast")

 if (CMAKE_C_COMPILER_ID MATCHES "Clang")
-    set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} -Ofast -funroll-loops -fvariable-expansion-in-unroller -fmerge-all-constants")
+    set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} -Ofast -s -funroll-loops -fvariable-expansion-in-unroller -fmerge-all-constants")
 else()
-    set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} -Ofast -funroll-loops -fvariable-expansion-in-unroller -ftree-loop-if-convert-stores -fmerge-all-constants -fbranch-target-load-optimize2")
+    set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} -Ofast -s -funroll-loops -fvariable-expansion-in-unroller -ftree-loop-if-convert-stores -fmerge-all-constants -fbranch-target-load-optimize2")
 endif()

 #set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} -gdwarf-2")
@ -123,12 +131,14 @@ else()
    set(SOURCES_CPUID cpu_stub.c)
 endif()

+include(cmake/asm.cmake)
+
 if (WITH_AEON)
    set(SOURCES_AEON
-    algo/cryptonight-lite/cryptonight_lite_av1_aesni.c
-    algo/cryptonight-lite/cryptonight_lite_av2_aesni_double.c
-    algo/cryptonight-lite/cryptonight_lite_av3_softaes.c
-    algo/cryptonight-lite/cryptonight_lite_av4_softaes_double.c
+    algo/cryptonight-lite/cryptonight_lite_av1.c
+    algo/cryptonight-lite/cryptonight_lite_av2.c
+    algo/cryptonight-lite/cryptonight_lite_av3.c
+    algo/cryptonight-lite/cryptonight_lite_av4.c
    algo/cryptonight-lite/cryptonight_lite_aesni.h
    algo/cryptonight-lite/cryptonight_lite_softaes.h
    )
@ -137,10 +147,10 @@ else()
 endif()

 if (CMAKE_SIZEOF_VOID_P EQUAL 8)
-    add_executable(xmrig ${HEADERS} ${HEADERS_CRYPTO} ${SOURCES} ${SOURCES_CRYPTO} ${HEADERS_UTILS} ${SOURCES_UTILS} ${HEADERS_COMPAT} ${SOURCES_COMPAT} ${SOURCES_OS} ${SOURCES_CPUID} ${SOURCES_AEON})
-    target_link_libraries(xmrig jansson curl ${CPUID_LIB} ${EXTRA_LIBS})
+    add_executable(xmrig ${HEADERS} ${HEADERS_CRYPTO} ${SOURCES} ${SOURCES_CRYPTO} ${HEADERS_UTILS} ${SOURCES_UTILS} ${HEADERS_COMPAT} ${SOURCES_COMPAT} ${SOURCES_OS} ${SOURCES_CPUID} ${SOURCES_AEON} ${XMRIG_ASM_SOURCES})
+    target_link_libraries(xmrig ${XMRIG_ASM_LIBRARY} jansson ${CURL_LIBRARY} ${CPUID_LIB} ${EXTRA_LIBS})
 else()
-    add_executable(xmrig32 ${HEADERS} ${HEADERS_CRYPTO} ${SOURCES} ${SOURCES_CRYPTO} ${HEADERS_UTILS} ${SOURCES_UTILS} ${HEADERS_COMPAT} ${SOURCES_COMPAT} ${SOURCES_OS} ${SOURCES_CPUID} ${SOURCES_AEON})
-    target_link_libraries(xmrig32 jansson curl ${CPUID_LIB} ${EXTRA_LIBS})
+    add_executable(xmrig32 ${HEADERS} ${HEADERS_CRYPTO} ${SOURCES} ${SOURCES_CRYPTO} ${HEADERS_UTILS} ${SOURCES_UTILS} ${HEADERS_COMPAT} ${SOURCES_COMPAT} ${SOURCES_OS} ${SOURCES_CPUID} ${SOURCES_AEON} ${XMRIG_ASM_SOURCES})
+    target_link_libraries(xmrig32 ${XMRIG_ASM_LIBRARY} jansson ${CURL_LIBRARY} ${CPUID_LIB} ${EXTRA_LIBS})
 endif()

--- a/README.md
+++ b/README.md
@ -99,7 +99,7 @@ Configure options for libcurl:
 ```
 CMake options:
 ```
-cmake .. -G "Unix Makefiles" -DCMAKE_BUILD_TYPE=Release -DCURL_INCLUDE_DIR="c:\<path>\curl-7.53.1\include" -DCURL_LIBRARY="c:\<path>\curl-7.53.1\lib\.libs"
+cmake .. -G "Unix Makefiles" -DCMAKE_BUILD_TYPE=Release -DCURL_INCLUDE_DIR="c:\xmrig-deps\gcc\x64\include" -DCURL_LIBRARY="c:\xmrig-deps\gcc\x64\lib\libcurl.a"
 ```

 ### Optional features
--- a/algo/cryptonight-lite/cryptonight_lite_aesni.h
+++ b/algo/cryptonight-lite/cryptonight_lite_aesni.h
@ -22,10 +22,12 @@
 *   along with this program. If not, see <http://www.gnu.org/licenses/>.
 */

-#ifndef __CRYPTONIGHT_LITE_AESNI_H__
-#define __CRYPTONIGHT_LITE_AESNI_H__
+#ifndef XMRIG_CRYPTONIGHT_LITE_AESNI_H
+#define XMRIG_CRYPTONIGHT_LITE_AESNI_H
+

 #include <x86intrin.h>
+#include <stdint.h>


 #define aes_genkey_sub(imm8) \
@ -253,4 +255,20 @@ static inline uint64_t _umul128(uint64_t multiplier, uint64_t multiplicand, uint
 #endif


-#endif /* __CRYPTONIGHT_LITE_AESNI_H__ */
+static inline void cryptonight_monero_tweak(uint64_t* mem_out, __m128i tmp)
+{   /* Stores tmp as two 64-bit words in mem_out[0..1]; the second word gets a 2-bit table value XOR-ed into bits 28-29. */
+    mem_out[0] = EXTRACT64(tmp); /* EXTRACT64 is defined outside this hunk; presumably yields the low 64 bits */
+    /* _mm_movehl_ps copies the upper 64 bits of tmp into the lower half so they can be extracted the same way. */
+    tmp = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
+    uint64_t vh = EXTRACT64(tmp);
+    /* x = bits 24..31 of vh; index = 2 * (3-bit value built from bits 5, 4 and 0 of x), i.e. an even shift in 0..14. */
+    uint8_t x = vh >> 24;
+    static const uint16_t table = 0x7531; /* eight 2-bit entries packed into 16 bits */
+    const uint8_t index = (((x >> 3) & 6) | (x & 1)) << 1;
+    vh ^= ((table >> index) & 0x3) << 28; /* selected entry lands in bits 28-29 of vh */
+
+    mem_out[1] = vh;
+}
+
+
+#endif /* XMRIG_CRYPTONIGHT_LITE_AESNI_H */
--- a/algo/cryptonight-lite/cryptonight_lite_av1.c
+++ b/algo/cryptonight-lite/cryptonight_lite_av1.c
@ -0,0 +1,134 @@
+/* XMRig
+ * Copyright 2010      Jeff Garzik <jgarzik@pobox.com>
+ * Copyright 2012-2014 pooler      <pooler@litecoinpool.org>
+ * Copyright 2014      Lucas Jones <https://github.com/lucasjones>
+ * Copyright 2014-2016 Wolf9466    <https://github.com/OhGodAPet>
+ * Copyright 2016      Jay D Dee   <jayddee246@gmail.com>
+ * Copyright 2017      fireice-uk  <https://github.com/fireice-uk>
+ * Copyright 2017-2018 XMR-Stak    <https://github.com/fireice-uk>, <https://github.com/psychocrypt>
+ * Copyright 2018      Lee Clagett <https://github.com/vtnerd>
+ * Copyright 2016-2018 XMRig       <https://github.com/xmrig>, <support@xmrig.com>
+ *
+ *   This program is free software: you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation, either version 3 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <x86intrin.h>
+#include <string.h>
+
+#include "algo/cryptonight/cryptonight.h"
+#include "algo/cryptonight/cryptonight_monero.h"
+#include "crypto/c_keccak.h"
+#include "cryptonight_lite_aesni.h"
+
+
+void cryptonight_lite_av1_v0(const uint8_t *restrict input, size_t size, uint8_t *restrict output, struct cryptonight_ctx **restrict ctx)
+{   /* Single hash using only ctx[0], hardware AES round, no variant tweak: keccak -> explode -> 0x40000 mix iterations -> implode -> keccakf -> final hash. */
+    keccak(input, size, ctx[0]->state, 200);
+    /* cn_explode_scratchpad / cn_implode_scratchpad are defined outside this file; presumably they fill the scratchpad from the state and fold it back. */
+    cn_explode_scratchpad((__m128i*) ctx[0]->state, (__m128i*) ctx[0]->memory);
+
+    const uint8_t* l0 = ctx[0]->memory;
+    uint64_t* h0 = (uint64_t*) ctx[0]->state;
+    /* Loop registers (al0, ah0) and bx0 are seeded from XORs of state words 0..7. */
+    uint64_t al0 = h0[0] ^ h0[4];
+    uint64_t ah0 = h0[1] ^ h0[5];
+    __m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]);
+
+    uint64_t idx0 = h0[0] ^ h0[4]; /* same initial value as al0 */
+
+    for (size_t i = 0; __builtin_expect(i < 0x40000, 1); i++) {
+        __m128i cx;
+        cx = _mm_load_si128((__m128i *) &l0[idx0 & 0xFFFF0]); /* mask yields a 16-byte-aligned offset <= 0xFFFF0 */
+        cx = _mm_aesenc_si128(cx, _mm_set_epi64x(ah0, al0)); /* one AES round keyed with (ah0:al0) */
+        /* Write bx0 ^ cx back to the slot just read; cx then drives the next address and becomes the next bx0. */
+        _mm_store_si128((__m128i *) &l0[idx0 & 0xFFFF0], _mm_xor_si128(bx0, cx));
+        idx0 = EXTRACT64(cx);
+        bx0 = cx;
+        /* Second half: read the 16 bytes at the new address and multiply idx0 by its first word. */
+        uint64_t hi, lo, cl, ch;
+        cl = ((uint64_t*) &l0[idx0 & 0xFFFF0])[0];
+        ch = ((uint64_t*) &l0[idx0 & 0xFFFF0])[1];
+        lo = _umul128(idx0, cl, &hi); /* presumably low half returned, high half via &hi -- helper defined elsewhere */
+
+        al0 += hi;
+        ah0 += lo;
+        /* Store the updated (al0, ah0) pair, then mix in the values read before the store. */
+        ((uint64_t*)&l0[idx0 & 0xFFFF0])[0] = al0;
+        ((uint64_t*)&l0[idx0 & 0xFFFF0])[1] = ah0;
+
+        ah0 ^= ch;
+        al0 ^= cl;
+        idx0 = al0;
+    }
+
+    cn_implode_scratchpad((__m128i*) ctx[0]->memory, (__m128i*) ctx[0]->state);
+    /* ctx[0]->state[0] & 3 selects one of four entries (index 0..3) in extra_hashes, which writes the result to output. */
+    keccakf(h0, 24);
+    extra_hashes[ctx[0]->state[0] & 3](ctx[0]->state, 200, output);
+}
+
+
+void cryptonight_lite_av1_v1(const uint8_t *restrict input, size_t size, uint8_t *restrict output, struct cryptonight_ctx **restrict ctx)
+{   /* Single hash using only ctx[0], hardware AES round, variant 1: same flow as the v0 routine plus the tweak on both scratchpad writes. */
+    if (size < 43) { /* inputs shorter than 43 bytes are rejected: output is zeroed (32 bytes) and nothing is hashed */
+        memset(output, 0, 32);
+        return;
+    }
+
+    keccak(input, size, ctx[0]->state, 200);
+    /* VARIANT1_INIT is a macro defined outside this file; presumably it declares tweak1_2_0, which is used in the loop below. */
+    VARIANT1_INIT(0);
+
+    cn_explode_scratchpad((__m128i*) ctx[0]->state, (__m128i*) ctx[0]->memory);
+
+    const uint8_t* l0 = ctx[0]->memory;
+    uint64_t* h0 = (uint64_t*) ctx[0]->state;
+    /* Loop registers (al0, ah0) and bx0 are seeded from XORs of state words 0..7. */
+    uint64_t al0 = h0[0] ^ h0[4];
+    uint64_t ah0 = h0[1] ^ h0[5];
+    __m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]);
+
+    uint64_t idx0 = h0[0] ^ h0[4]; /* same initial value as al0 */
+
+    for (size_t i = 0; __builtin_expect(i < 0x40000, 1); i++) {
+        __m128i cx;
+        cx = _mm_load_si128((__m128i *) &l0[idx0 & 0xFFFF0]); /* mask yields a 16-byte-aligned offset <= 0xFFFF0 */
+        cx = _mm_aesenc_si128(cx, _mm_set_epi64x(ah0, al0)); /* one AES round keyed with (ah0:al0) */
+        /* Unlike v0, bx0 ^ cx is written through cryptonight_monero_tweak instead of a plain 128-bit store. */
+        cryptonight_monero_tweak((uint64_t*)&l0[idx0 & 0xFFFF0], _mm_xor_si128(bx0, cx));
+
+        idx0 = EXTRACT64(cx);
+        bx0 = cx;
+        /* Second half: read the 16 bytes at the new address and multiply idx0 by its first word. */
+        uint64_t hi, lo, cl, ch;
+        cl = ((uint64_t*) &l0[idx0 & 0xFFFF0])[0];
+        ch = ((uint64_t*) &l0[idx0 & 0xFFFF0])[1];
+        lo = _umul128(idx0, cl, &hi); /* presumably low half returned, high half via &hi -- helper defined elsewhere */
+
+        al0 += hi;
+        ah0 += lo;
+        /* Only the stored copy of the second word is XOR-ed with tweak1_2_0; ah0 itself is left untweaked. */
+        ((uint64_t*)&l0[idx0 & 0xFFFF0])[0] = al0;
+        ((uint64_t*)&l0[idx0 & 0xFFFF0])[1] = ah0 ^ tweak1_2_0;
+
+        ah0 ^= ch;
+        al0 ^= cl;
+        idx0 = al0;
+    }
+
+    cn_implode_scratchpad((__m128i*) ctx[0]->memory, (__m128i*) ctx[0]->state);
+    /* ctx[0]->state[0] & 3 selects one of four entries (index 0..3) in extra_hashes, which writes the result to output. */
+    keccakf(h0, 24);
+    extra_hashes[ctx[0]->state[0] & 3](ctx[0]->state, 200, output);
+}
--- a/algo/cryptonight-lite/cryptonight_lite_av1_aesni.c
+++ b/algo/cryptonight-lite/cryptonight_lite_av1_aesni.c
@ -1,77 +0,0 @@
-/* XMRig
- * Copyright 2010      Jeff Garzik <jgarzik@pobox.com>
- * Copyright 2012-2014 pooler      <pooler@litecoinpool.org>
- * Copyright 2014      Lucas Jones <https://github.com/lucasjones>
- * Copyright 2014-2016 Wolf9466    <https://github.com/OhGodAPet>
- * Copyright 2016      Jay D Dee   <jayddee246@gmail.com>
- * Copyright 2017      fireice-uk  <https://github.com/fireice-uk>
- * Copyright 2016-2017 XMRig       <support@xmrig.com>
- *
- *
- *   This program is free software: you can redistribute it and/or modify
- *   it under the terms of the GNU General Public License as published by
- *   the Free Software Foundation, either version 3 of the License, or
- *   (at your option) any later version.
- *
- *   This program is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY; without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- *   GNU General Public License for more details.
- *
- *   You should have received a copy of the GNU General Public License
- *   along with this program. If not, see <http://www.gnu.org/licenses/>.
- */
-
-#include <x86intrin.h>
-#include <string.h>
-
-#include "algo/cryptonight/cryptonight.h"
-#include "cryptonight_lite_aesni.h"
-#include "crypto/c_keccak.h"
-
-
-void cryptonight_lite_av1_aesni(const void *restrict input, size_t size, void *restrict output, struct cryptonight_ctx *restrict ctx)
-{
-    keccak((const uint8_t *) input, size, ctx->state0, 200);
-
-    cn_explode_scratchpad((__m128i*) ctx->state0, (__m128i*) ctx->memory);
-
-    const uint8_t* l0 = ctx->memory;
-    uint64_t* h0 = (uint64_t*) ctx->state0;
-
-    uint64_t al0 = h0[0] ^ h0[4];
-    uint64_t ah0 = h0[1] ^ h0[5];
-    __m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]);
-
-    uint64_t idx0 = h0[0] ^ h0[4];
-
-    for (size_t i = 0; __builtin_expect(i < 0x40000, 1); i++) {
-        __m128i cx;
-        cx = _mm_load_si128((__m128i *) &l0[idx0 & 0xFFFF0]);
-        cx = _mm_aesenc_si128(cx, _mm_set_epi64x(ah0, al0));
-
-        _mm_store_si128((__m128i *) &l0[idx0 & 0xFFFF0], _mm_xor_si128(bx0, cx));
-        idx0 = EXTRACT64(cx);
-        bx0 = cx;
-
-        uint64_t hi, lo, cl, ch;
-        cl = ((uint64_t*) &l0[idx0 & 0xFFFF0])[0];
-        ch = ((uint64_t*) &l0[idx0 & 0xFFFF0])[1];
-        lo = _umul128(idx0, cl, &hi);
-
-        al0 += hi;
-        ah0 += lo;
-
-        ((uint64_t*)&l0[idx0 & 0xFFFF0])[0] = al0;
-        ((uint64_t*)&l0[idx0 & 0xFFFF0])[1] = ah0;
-
-        ah0 ^= ch;
-        al0 ^= cl;
-        idx0 = al0;
-    }
-
-    cn_implode_scratchpad((__m128i*) ctx->memory, (__m128i*) ctx->state0);
-
-    keccakf(h0, 24);
-    extra_hashes[ctx->state0[0] & 3](ctx->state0, 200, output);
-}
--- a/algo/cryptonight-lite/cryptonight_lite_av2.c
+++ b/algo/cryptonight-lite/cryptonight_lite_av2.c
@ -0,0 +1,202 @@
+/* XMRig
+ * Copyright 2010      Jeff Garzik <jgarzik@pobox.com>
+ * Copyright 2012-2014 pooler      <pooler@litecoinpool.org>
+ * Copyright 2014      Lucas Jones <https://github.com/lucasjones>
+ * Copyright 2014-2016 Wolf9466    <https://github.com/OhGodAPet>
+ * Copyright 2016      Jay D Dee   <jayddee246@gmail.com>
+ * Copyright 2017      fireice-uk  <https://github.com/fireice-uk>
+ * Copyright 2017-2018 XMR-Stak    <https://github.com/fireice-uk>, <https://github.com/psychocrypt>
+ * Copyright 2018      Lee Clagett <https://github.com/vtnerd>
+ * Copyright 2016-2018 XMRig       <https://github.com/xmrig>, <support@xmrig.com>
+ *
+ *   This program is free software: you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation, either version 3 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <x86intrin.h>
+#include <string.h>
+
+#include "algo/cryptonight/cryptonight.h"
+#include "algo/cryptonight/cryptonight_monero.h"
+#include "cryptonight_lite_aesni.h"
+#include "crypto/c_keccak.h"
+
+
+void cryptonight_lite_av2_v0(const uint8_t *restrict input, size_t size, uint8_t *restrict output, struct cryptonight_ctx **restrict ctx)
+{   /* Double hash, hardware AES round, no variant tweak: two inputs of `size` bytes laid back to back, one per context, interleaved in a single loop. */
+    keccak(input,        size, ctx[0]->state, 200);
+    keccak(input + size, size, ctx[1]->state, 200); /* second input starts right after the first */
+
+    const uint8_t* l0 = ctx[0]->memory;
+    const uint8_t* l1 = ctx[1]->memory;
+    uint64_t* h0 = (uint64_t*) ctx[0]->state;
+    uint64_t* h1 = (uint64_t*) ctx[1]->state;
+    /* cn_explode_scratchpad / cn_implode_scratchpad are defined outside this file; presumably they fill each scratchpad from its state and fold it back. */
+    cn_explode_scratchpad((__m128i*) h0, (__m128i*) l0);
+    cn_explode_scratchpad((__m128i*) h1, (__m128i*) l1);
+    /* Per-lane loop registers, seeded from XORs of that lane's state words 0..7. */
+    uint64_t al0 = h0[0] ^ h0[4];
+    uint64_t al1 = h1[0] ^ h1[4];
+    uint64_t ah0 = h0[1] ^ h0[5];
+    uint64_t ah1 = h1[1] ^ h1[5];
+
+    __m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]);
+    __m128i bx1 = _mm_set_epi64x(h1[3] ^ h1[7], h1[2] ^ h1[6]);
+
+    uint64_t idx0 = h0[0] ^ h0[4];
+    uint64_t idx1 = h1[0] ^ h1[4];
+
+    for (size_t i = 0; __builtin_expect(i < 0x40000, 1); i++) {
+        __m128i cx0 = _mm_load_si128((__m128i *) &l0[idx0 & 0xFFFF0]); /* mask yields a 16-byte-aligned offset <= 0xFFFF0 */
+        __m128i cx1 = _mm_load_si128((__m128i *) &l1[idx1 & 0xFFFF0]);
+        /* One AES round per lane, keyed with that lane's (ah:al). */
+        cx0 = _mm_aesenc_si128(cx0, _mm_set_epi64x(ah0, al0));
+        cx1 = _mm_aesenc_si128(cx1, _mm_set_epi64x(ah1, al1));
+        /* Write bx ^ cx back to the slot just read in each lane. */
+        _mm_store_si128((__m128i *) &l0[idx0 & 0xFFFF0], _mm_xor_si128(bx0, cx0));
+        _mm_store_si128((__m128i *) &l1[idx1 & 0xFFFF0], _mm_xor_si128(bx1, cx1));
+
+        idx0 = EXTRACT64(cx0);
+        idx1 = EXTRACT64(cx1);
+
+        bx0 = cx0;
+        bx1 = cx1;
+        /* Multiply/add/store half for lane 0; hi, lo, cl, ch are reused for lane 1 below. */
+        uint64_t hi, lo, cl, ch;
+        cl = ((uint64_t*) &l0[idx0 & 0xFFFF0])[0];
+        ch = ((uint64_t*) &l0[idx0 & 0xFFFF0])[1];
+        lo = _umul128(idx0, cl, &hi); /* presumably low half returned, high half via &hi -- helper defined elsewhere */
+
+        al0 += hi;
+        ah0 += lo;
+
+        ((uint64_t*) &l0[idx0 & 0xFFFF0])[0] = al0;
+        ((uint64_t*) &l0[idx0 & 0xFFFF0])[1] = ah0;
+
+        ah0 ^= ch;
+        al0 ^= cl;
+        idx0 = al0;
+        /* Same multiply/add/store half for lane 1. */
+        cl = ((uint64_t*) &l1[idx1 & 0xFFFF0])[0];
+        ch = ((uint64_t*) &l1[idx1 & 0xFFFF0])[1];
+        lo = _umul128(idx1, cl, &hi);
+
+        al1 += hi;
+        ah1 += lo;
+
+        ((uint64_t*) &l1[idx1 & 0xFFFF0])[0] = al1;
+        ((uint64_t*) &l1[idx1 & 0xFFFF0])[1] = ah1;
+
+        ah1 ^= ch;
+        al1 ^= cl;
+        idx1 = al1;
+    }
+
+    cn_implode_scratchpad((__m128i*) l0, (__m128i*) h0);
+    cn_implode_scratchpad((__m128i*) l1, (__m128i*) h1);
+
+    keccakf(h0, 24);
+    keccakf(h1, 24);
+    /* state[0] & 3 selects one of four extra_hashes entries per lane; lane 1 writes at output + 32. */
+    extra_hashes[ctx[0]->state[0] & 3](ctx[0]->state, 200, output);
+    extra_hashes[ctx[1]->state[0] & 3](ctx[1]->state, 200, (char*) output + 32);
+}
+
+
+void cryptonight_lite_av2_v1(const uint8_t *restrict input, size_t size, uint8_t *restrict output, struct cryptonight_ctx **restrict ctx)
+{   /* Double hash, hardware AES round, variant 1: two back-to-back inputs of `size` bytes, one per context, with the tweak applied to both scratchpad writes. */
+    if (size < 43) { /* inputs shorter than 43 bytes are rejected: both 32-byte outputs are zeroed and nothing is hashed */
+        memset(output, 0, 64);
+        return;
+    }
+
+    keccak(input,        size, ctx[0]->state, 200);
+    keccak(input + size, size, ctx[1]->state, 200); /* second input starts right after the first */
+    /* VARIANT1_INIT is a macro defined outside this file; presumably it declares tweak1_2_0 / tweak1_2_1, which are used in the loop below. */
+    VARIANT1_INIT(0);
+    VARIANT1_INIT(1);
+
+    const uint8_t* l0 = ctx[0]->memory;
+    const uint8_t* l1 = ctx[1]->memory;
+    uint64_t* h0 = (uint64_t*) ctx[0]->state;
+    uint64_t* h1 = (uint64_t*) ctx[1]->state;
+
+    cn_explode_scratchpad((__m128i*) h0, (__m128i*) l0);
+    cn_explode_scratchpad((__m128i*) h1, (__m128i*) l1);
+    /* Per-lane loop registers, seeded from XORs of that lane's state words 0..7. */
+    uint64_t al0 = h0[0] ^ h0[4];
+    uint64_t al1 = h1[0] ^ h1[4];
+    uint64_t ah0 = h0[1] ^ h0[5];
+    uint64_t ah1 = h1[1] ^ h1[5];
+
+    __m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]);
+    __m128i bx1 = _mm_set_epi64x(h1[3] ^ h1[7], h1[2] ^ h1[6]);
+
+    uint64_t idx0 = h0[0] ^ h0[4];
+    uint64_t idx1 = h1[0] ^ h1[4];
+
+    for (size_t i = 0; __builtin_expect(i < 0x40000, 1); i++) {
+        __m128i cx0 = _mm_load_si128((__m128i *) &l0[idx0 & 0xFFFF0]); /* mask yields a 16-byte-aligned offset <= 0xFFFF0 */
+        __m128i cx1 = _mm_load_si128((__m128i *) &l1[idx1 & 0xFFFF0]);
+        /* One AES round per lane, keyed with that lane's (ah:al). */
+        cx0 = _mm_aesenc_si128(cx0, _mm_set_epi64x(ah0, al0));
+        cx1 = _mm_aesenc_si128(cx1, _mm_set_epi64x(ah1, al1));
+        /* Unlike v0, bx ^ cx is written through cryptonight_monero_tweak instead of a plain 128-bit store. */
+        cryptonight_monero_tweak((uint64_t*)&l0[idx0 & 0xFFFF0], _mm_xor_si128(bx0, cx0));
+        cryptonight_monero_tweak((uint64_t*)&l1[idx1 & 0xFFFF0], _mm_xor_si128(bx1, cx1));
+
+        idx0 = EXTRACT64(cx0);
+        idx1 = EXTRACT64(cx1);
+
+        bx0 = cx0;
+        bx1 = cx1;
+        /* Multiply/add/store half for lane 0; hi, lo, cl, ch are reused for lane 1 below. */
+        uint64_t hi, lo, cl, ch;
+        cl = ((uint64_t*) &l0[idx0 & 0xFFFF0])[0];
+        ch = ((uint64_t*) &l0[idx0 & 0xFFFF0])[1];
+        lo = _umul128(idx0, cl, &hi); /* presumably low half returned, high half via &hi -- helper defined elsewhere */
+
+        al0 += hi;
+        ah0 += lo;
+        /* Only the stored copy of the second word is XOR-ed with tweak1_2_0; ah0 itself is left untweaked. */
+        ((uint64_t*) &l0[idx0 & 0xFFFF0])[0] = al0;
+        ((uint64_t*) &l0[idx0 & 0xFFFF0])[1] = ah0 ^ tweak1_2_0;
+
+        ah0 ^= ch;
+        al0 ^= cl;
+        idx0 = al0;
+        /* Same multiply/add/store half for lane 1, using tweak1_2_1. */
+        cl = ((uint64_t*) &l1[idx1 & 0xFFFF0])[0];
+        ch = ((uint64_t*) &l1[idx1 & 0xFFFF0])[1];
+        lo = _umul128(idx1, cl, &hi);
+
+        al1 += hi;
+        ah1 += lo;
+
+        ((uint64_t*) &l1[idx1 & 0xFFFF0])[0] = al1;
+        ((uint64_t*) &l1[idx1 & 0xFFFF0])[1] = ah1 ^ tweak1_2_1;
+
+        ah1 ^= ch;
+        al1 ^= cl;
+        idx1 = al1;
+    }
+
+    cn_implode_scratchpad((__m128i*) l0, (__m128i*) h0);
+    cn_implode_scratchpad((__m128i*) l1, (__m128i*) h1);
+
+    keccakf(h0, 24);
+    keccakf(h1, 24);
+    /* state[0] & 3 selects one of four extra_hashes entries per lane; lane 1 writes at output + 32. */
+    extra_hashes[ctx[0]->state[0] & 3](ctx[0]->state, 200, output);
+    extra_hashes[ctx[1]->state[0] & 3](ctx[1]->state, 200, (char*) output + 32);
+}
--- a/algo/cryptonight-lite/cryptonight_lite_av2_aesni_double.c
+++ b/algo/cryptonight-lite/cryptonight_lite_av2_aesni_double.c
@ -1,111 +0,0 @@
-/* XMRig
- * Copyright 2010      Jeff Garzik <jgarzik@pobox.com>
- * Copyright 2012-2014 pooler      <pooler@litecoinpool.org>
- * Copyright 2014      Lucas Jones <https://github.com/lucasjones>
- * Copyright 2014-2016 Wolf9466    <https://github.com/OhGodAPet>
- * Copyright 2016      Jay D Dee   <jayddee246@gmail.com>
- * Copyright 2017      fireice-uk  <https://github.com/fireice-uk>
- * Copyright 2016-2017 XMRig       <support@xmrig.com>
- *
- *
- *   This program is free software: you can redistribute it and/or modify
- *   it under the terms of the GNU General Public License as published by
- *   the Free Software Foundation, either version 3 of the License, or
- *   (at your option) any later version.
- *
- *   This program is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY; without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- *   GNU General Public License for more details.
- *
- *   You should have received a copy of the GNU General Public License
- *   along with this program. If not, see <http://www.gnu.org/licenses/>.
- */
-
-#include <x86intrin.h>
-#include <string.h>
-
-#include "algo/cryptonight/cryptonight.h"
-#include "cryptonight_lite_aesni.h"
-#include "crypto/c_keccak.h"
-
-
-void cryptonight_lite_av2_aesni_double(const void *restrict input, size_t size, void *restrict output, struct cryptonight_ctx *restrict ctx)
-{
-    keccak((const uint8_t *) input,        size, ctx->state0, 200);
-    keccak((const uint8_t *) input + size, size, ctx->state1, 200);
-
-    const uint8_t* l0 = ctx->memory;
-    const uint8_t* l1 = ctx->memory + MEMORY_LITE;
-    uint64_t* h0 = (uint64_t*) ctx->state0;
-    uint64_t* h1 = (uint64_t*) ctx->state1;
-
-    cn_explode_scratchpad((__m128i*) h0, (__m128i*) l0);
-    cn_explode_scratchpad((__m128i*) h1, (__m128i*) l1);
-
-    uint64_t al0 = h0[0] ^ h0[4];
-    uint64_t al1 = h1[0] ^ h1[4];
-    uint64_t ah0 = h0[1] ^ h0[5];
-    uint64_t ah1 = h1[1] ^ h1[5];
-
-    __m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]);
-    __m128i bx1 = _mm_set_epi64x(h1[3] ^ h1[7], h1[2] ^ h1[6]);
-
-    uint64_t idx0 = h0[0] ^ h0[4];
-    uint64_t idx1 = h1[0] ^ h1[4];
-
-    for (size_t i = 0; __builtin_expect(i < 0x40000, 1); i++) {
-        __m128i cx0 = _mm_load_si128((__m128i *) &l0[idx0 & 0xFFFF0]);
-        __m128i cx1 = _mm_load_si128((__m128i *) &l1[idx1 & 0xFFFF0]);
-
-        cx0 = _mm_aesenc_si128(cx0, _mm_set_epi64x(ah0, al0));
-        cx1 = _mm_aesenc_si128(cx1, _mm_set_epi64x(ah1, al1));
-
-        _mm_store_si128((__m128i *) &l0[idx0 & 0xFFFF0], _mm_xor_si128(bx0, cx0));
-        _mm_store_si128((__m128i *) &l1[idx1 & 0xFFFF0], _mm_xor_si128(bx1, cx1));
-
-        idx0 = EXTRACT64(cx0);
-        idx1 = EXTRACT64(cx1);
-
-        bx0 = cx0;
-        bx1 = cx1;
-
-        uint64_t hi, lo, cl, ch;
-        cl = ((uint64_t*) &l0[idx0 & 0xFFFF0])[0];
-        ch = ((uint64_t*) &l0[idx0 & 0xFFFF0])[1];
-        lo = _umul128(idx0, cl, &hi);
-
-        al0 += hi;
-        ah0 += lo;
-
-        ((uint64_t*) &l0[idx0 & 0xFFFF0])[0] = al0;
-        ((uint64_t*) &l0[idx0 & 0xFFFF0])[1] = ah0;
-
-        ah0 ^= ch;
-        al0 ^= cl;
-        idx0 = al0;
-
-        cl = ((uint64_t*) &l1[idx1 & 0xFFFF0])[0];
-        ch = ((uint64_t*) &l1[idx1 & 0xFFFF0])[1];
-        lo = _umul128(idx1, cl, &hi);
-
-        al1 += hi;
-        ah1 += lo;
-
-        ((uint64_t*) &l1[idx1 & 0xFFFF0])[0] = al1;
-        ((uint64_t*) &l1[idx1 & 0xFFFF0])[1] = ah1;
-
-        ah1 ^= ch;
-        al1 ^= cl;
-        idx1 = al1;
-    }
-
-    cn_implode_scratchpad((__m128i*) l0, (__m128i*) h0);
-    cn_implode_scratchpad((__m128i*) l1, (__m128i*) h1);
-
-    keccakf(h0, 24);
-    keccakf(h1, 24);
-
-    extra_hashes[ctx->state0[0] & 3](ctx->state0, 200, output);
-    extra_hashes[ctx->state1[0] & 3](ctx->state1, 200, (char*) output + 32);
-}
--- a/algo/cryptonight-lite/cryptonight_lite_av3.c
+++ b/algo/cryptonight-lite/cryptonight_lite_av3.c
@ -0,0 +1,134 @@
+/* XMRig
+ * Copyright 2010      Jeff Garzik <jgarzik@pobox.com>
+ * Copyright 2012-2014 pooler      <pooler@litecoinpool.org>
+ * Copyright 2014      Lucas Jones <https://github.com/lucasjones>
+ * Copyright 2014-2016 Wolf9466    <https://github.com/OhGodAPet>
+ * Copyright 2016      Jay D Dee   <jayddee246@gmail.com>
+ * Copyright 2017      fireice-uk  <https://github.com/fireice-uk>
+ * Copyright 2017-2018 XMR-Stak    <https://github.com/fireice-uk>, <https://github.com/psychocrypt>
+ * Copyright 2018      Lee Clagett <https://github.com/vtnerd>
+ * Copyright 2016-2018 XMRig       <https://github.com/xmrig>, <support@xmrig.com>
+ *
+ *   This program is free software: you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation, either version 3 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <x86intrin.h>
+#include <string.h>
+
+#include "algo/cryptonight/cryptonight.h"
+#include "algo/cryptonight/cryptonight_monero.h"
+#include "cryptonight_lite_softaes.h"
+#include "crypto/c_keccak.h"
+
+
+void cryptonight_lite_av3_v0(const uint8_t *restrict input, size_t size, uint8_t *restrict output, struct cryptonight_ctx **restrict ctx)
+{   /* Single hash using only ctx[0], no variant tweak; the round step calls soft_aesenc rather than the _mm_aesenc_si128 intrinsic. */
+    keccak(input, size, ctx[0]->state, 200);
+    /* cn_explode_scratchpad / cn_implode_scratchpad are defined outside this file; presumably they fill the scratchpad from the state and fold it back. */
+    cn_explode_scratchpad((__m128i*) ctx[0]->state, (__m128i*) ctx[0]->memory);
+
+    const uint8_t* l0 = ctx[0]->memory;
+    uint64_t* h0 = (uint64_t*) ctx[0]->state;
+    /* Loop registers (al0, ah0) and bx0 are seeded from XORs of state words 0..7. */
+    uint64_t al0 = h0[0] ^ h0[4];
+    uint64_t ah0 = h0[1] ^ h0[5];
+    __m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]);
+
+    uint64_t idx0 = h0[0] ^ h0[4]; /* same initial value as al0 */
+
+    for (size_t i = 0; __builtin_expect(i < 0x40000, 1); i++) {
+        __m128i cx;
+        cx = _mm_load_si128((__m128i *) &l0[idx0 & 0xFFFF0]); /* mask yields a 16-byte-aligned offset <= 0xFFFF0 */
+        cx = soft_aesenc(cx, _mm_set_epi64x(ah0, al0)); /* soft_aesenc is defined elsewhere; presumably a software AES round */
+        /* Write bx0 ^ cx back to the slot just read; cx then drives the next address and becomes the next bx0. */
+        _mm_store_si128((__m128i *) &l0[idx0 & 0xFFFF0], _mm_xor_si128(bx0, cx));
+        idx0 = EXTRACT64(cx);
+        bx0 = cx;
+        /* Second half: read the 16 bytes at the new address and multiply idx0 by its first word. */
+        uint64_t hi, lo, cl, ch;
+        cl = ((uint64_t*) &l0[idx0 & 0xFFFF0])[0];
+        ch = ((uint64_t*) &l0[idx0 & 0xFFFF0])[1];
+        lo = _umul128(idx0, cl, &hi); /* presumably low half returned, high half via &hi -- helper defined elsewhere */
+
+        al0 += hi;
+        ah0 += lo;
+        /* Store the updated (al0, ah0) pair, then mix in the values read before the store. */
+        ((uint64_t*)&l0[idx0 & 0xFFFF0])[0] = al0;
+        ((uint64_t*)&l0[idx0 & 0xFFFF0])[1] = ah0;
+
+        ah0 ^= ch;
+        al0 ^= cl;
+        idx0 = al0;
+    }
+
+    cn_implode_scratchpad((__m128i*) ctx[0]->memory, (__m128i*) ctx[0]->state);
+    /* ctx[0]->state[0] & 3 selects one of four entries (index 0..3) in extra_hashes, which writes the result to output. */
+    keccakf(h0, 24);
+    extra_hashes[ctx[0]->state[0] & 3](ctx[0]->state, 200, output);
+}
+
+
+void cryptonight_lite_av3_v1(const uint8_t *restrict input, size_t size, uint8_t *restrict output, struct cryptonight_ctx **restrict ctx)
+{   /* Single hash using only ctx[0], variant 1; the round step calls soft_aesenc and both scratchpad writes carry the tweak. */
+    if (size < 43) { /* inputs shorter than 43 bytes are rejected: output is zeroed (32 bytes) and nothing is hashed */
+        memset(output, 0, 32);
+        return;
+    }
+
+    keccak(input, size, ctx[0]->state, 200);
+    /* VARIANT1_INIT is a macro defined outside this file; presumably it declares tweak1_2_0, which is used in the loop below. */
+    VARIANT1_INIT(0);
+
+    cn_explode_scratchpad((__m128i*) ctx[0]->state, (__m128i*) ctx[0]->memory);
+
+    const uint8_t* l0 = ctx[0]->memory;
+    uint64_t* h0 = (uint64_t*) ctx[0]->state;
+    /* Loop registers (al0, ah0) and bx0 are seeded from XORs of state words 0..7. */
+    uint64_t al0 = h0[0] ^ h0[4];
+    uint64_t ah0 = h0[1] ^ h0[5];
+    __m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]);
+
+    uint64_t idx0 = h0[0] ^ h0[4]; /* same initial value as al0 */
+
+    for (size_t i = 0; __builtin_expect(i < 0x40000, 1); i++) {
+        __m128i cx;
+        cx = _mm_load_si128((__m128i *) &l0[idx0 & 0xFFFF0]); /* mask yields a 16-byte-aligned offset <= 0xFFFF0 */
+        cx = soft_aesenc(cx, _mm_set_epi64x(ah0, al0)); /* soft_aesenc is defined elsewhere; presumably a software AES round */
+        /* Unlike v0, bx0 ^ cx is written through cryptonight_monero_tweak (declared outside this file) instead of a plain 128-bit store. */
+        cryptonight_monero_tweak((uint64_t*)&l0[idx0 & 0xFFFF0], _mm_xor_si128(bx0, cx));
+
+        idx0 = EXTRACT64(cx);
+        bx0 = cx;
+        /* Second half: read the 16 bytes at the new address and multiply idx0 by its first word. */
+        uint64_t hi, lo, cl, ch;
+        cl = ((uint64_t*) &l0[idx0 & 0xFFFF0])[0];
+        ch = ((uint64_t*) &l0[idx0 & 0xFFFF0])[1];
+        lo = _umul128(idx0, cl, &hi); /* presumably low half returned, high half via &hi -- helper defined elsewhere */
+
+        al0 += hi;
+        ah0 += lo;
+        /* Only the stored copy of the second word is XOR-ed with tweak1_2_0; ah0 itself is left untweaked. */
+        ((uint64_t*)&l0[idx0 & 0xFFFF0])[0] = al0;
+        ((uint64_t*)&l0[idx0 & 0xFFFF0])[1] = ah0 ^ tweak1_2_0;
+
+        ah0 ^= ch;
+        al0 ^= cl;
+        idx0 = al0;
+    }
+
+    cn_implode_scratchpad((__m128i*) ctx[0]->memory, (__m128i*) ctx[0]->state);
+    /* ctx[0]->state[0] & 3 selects one of four entries (index 0..3) in extra_hashes, which writes the result to output. */
+    keccakf(h0, 24);
+    extra_hashes[ctx[0]->state[0] & 3](ctx[0]->state, 200, output);
+}
--- a/algo/cryptonight-lite/cryptonight_lite_av3_softaes.c
+++ b/algo/cryptonight-lite/cryptonight_lite_av3_softaes.c
@ -1,77 +0,0 @@
-/* XMRig
- * Copyright 2010      Jeff Garzik <jgarzik@pobox.com>
- * Copyright 2012-2014 pooler      <pooler@litecoinpool.org>
- * Copyright 2014      Lucas Jones <https://github.com/lucasjones>
- * Copyright 2014-2016 Wolf9466    <https://github.com/OhGodAPet>
- * Copyright 2016      Jay D Dee   <jayddee246@gmail.com>
- * Copyright 2017      fireice-uk  <https://github.com/fireice-uk>
- * Copyright 2016-2017 XMRig       <support@xmrig.com>
- *
- *
- *   This program is free software: you can redistribute it and/or modify
- *   it under the terms of the GNU General Public License as published by
- *   the Free Software Foundation, either version 3 of the License, or
- *   (at your option) any later version.
- *
- *   This program is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY; without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- *   GNU General Public License for more details.
- *
- *   You should have received a copy of the GNU General Public License
- *   along with this program. If not, see <http://www.gnu.org/licenses/>.
- */
-
-#include <x86intrin.h>
-#include <string.h>
-
-#include "algo/cryptonight/cryptonight.h"
-#include "cryptonight_lite_softaes.h"
-#include "crypto/c_keccak.h"
-
-
-void cryptonight_lite_av3_softaes(const void *restrict input, size_t size, void *restrict output, struct cryptonight_ctx *restrict ctx)
-{
-    keccak((const uint8_t *) input, size, ctx->state0, 200);
-
-    cn_explode_scratchpad((__m128i*) ctx->state0, (__m128i*) ctx->memory);
-
-    const uint8_t* l0 = ctx->memory;
-    uint64_t* h0 = (uint64_t*) ctx->state0;
-
-    uint64_t al0 = h0[0] ^ h0[4];
-    uint64_t ah0 = h0[1] ^ h0[5];
-    __m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]);
-
-    uint64_t idx0 = h0[0] ^ h0[4];
-
-    for (size_t i = 0; __builtin_expect(i < 0x40000, 1); i++) {
-        __m128i cx;
-        cx = _mm_load_si128((__m128i *)&l0[idx0 & 0xFFFF0]);
-        cx = soft_aesenc(cx, _mm_set_epi64x(ah0, al0));
-
-        _mm_store_si128((__m128i *)&l0[idx0 & 0xFFFF0], _mm_xor_si128(bx0, cx));
-        idx0 = EXTRACT64(cx);
-        bx0 = cx;
-
-        uint64_t hi, lo, cl, ch;
-        cl = ((uint64_t*)&l0[idx0 & 0xFFFF0])[0];
-        ch = ((uint64_t*)&l0[idx0 & 0xFFFF0])[1];
-        lo = _umul128(idx0, cl, &hi);
-
-        al0 += hi;
-        ah0 += lo;
-
-        ((uint64_t*)&l0[idx0 & 0xFFFF0])[0] = al0;
-        ((uint64_t*)&l0[idx0 & 0xFFFF0])[1] = ah0;
-
-        ah0 ^= ch;
-        al0 ^= cl;
-        idx0 = al0;
-    }
-
-    cn_implode_scratchpad((__m128i*) ctx->memory, (__m128i*) ctx->state0);
-
-    keccakf(h0, 24);
-    extra_hashes[ctx->state0[0] & 3](ctx->state0, 200, output);
-}
--- a/algo/cryptonight-lite/cryptonight_lite_av4.c
+++ b/algo/cryptonight-lite/cryptonight_lite_av4.c
@ -0,0 +1,202 @@
+/* XMRig
+ * Copyright 2010      Jeff Garzik <jgarzik@pobox.com>
+ * Copyright 2012-2014 pooler      <pooler@litecoinpool.org>
+ * Copyright 2014      Lucas Jones <https://github.com/lucasjones>
+ * Copyright 2014-2016 Wolf9466    <https://github.com/OhGodAPet>
+ * Copyright 2016      Jay D Dee   <jayddee246@gmail.com>
+ * Copyright 2017      fireice-uk  <https://github.com/fireice-uk>
+ * Copyright 2017-2018 XMR-Stak    <https://github.com/fireice-uk>, <https://github.com/psychocrypt>
+ * Copyright 2018      Lee Clagett <https://github.com/vtnerd>
+ * Copyright 2016-2018 XMRig       <https://github.com/xmrig>, <support@xmrig.com>
+ *
+ *   This program is free software: you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation, either version 3 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <x86intrin.h>
+#include <string.h>
+
+#include "algo/cryptonight/cryptonight.h"
+#include "algo/cryptonight/cryptonight_monero.h"
+#include "cryptonight_lite_softaes.h"
+#include "crypto/c_keccak.h"
+
+
+/*
+ * CryptoNight-Lite variant 0, double hash, software AES (soft_aesenc).
+ *
+ * Hashes two `size`-byte blobs stored back to back in `input` (blob 0 at
+ * input, blob 1 at input + size) using ctx[0] and ctx[1]. The first result is
+ * written at `output`, the second one 32 bytes further.
+ *
+ * The pointer offsets are computed on byte pointers: arithmetic directly on
+ * `void *` is a GNU extension and is rejected or warned about elsewhere.
+ *
+ * NOTE(review): cryptonight.c declares this function with uint8_t pointer
+ * parameters while the definition uses void pointers -- confirm which one is
+ * intended and align them.
+ */
+void cryptonight_lite_av4_v0(const void *restrict input, size_t size, void *restrict output, struct cryptonight_ctx **restrict ctx)
+{
+    keccak((const uint8_t *) input,        size, ctx[0]->state, 200);
+    keccak((const uint8_t *) input + size, size, ctx[1]->state, 200);
+
+    /* Scratchpads are written through casts below despite the const qualifier. */
+    const uint8_t* l0 = ctx[0]->memory;
+    const uint8_t* l1 = ctx[1]->memory;
+    uint64_t* h0 = (uint64_t*) ctx[0]->state;
+    uint64_t* h1 = (uint64_t*) ctx[1]->state;
+
+    cn_explode_scratchpad((__m128i*) h0, (__m128i*) l0);
+    cn_explode_scratchpad((__m128i*) h1, (__m128i*) l1);
+
+    uint64_t al0 = h0[0] ^ h0[4];
+    uint64_t al1 = h1[0] ^ h1[4];
+    uint64_t ah0 = h0[1] ^ h0[5];
+    uint64_t ah1 = h1[1] ^ h1[5];
+
+    __m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]);
+    __m128i bx1 = _mm_set_epi64x(h1[3] ^ h1[7], h1[2] ^ h1[6]);
+
+    uint64_t idx0 = h0[0] ^ h0[4];
+    uint64_t idx1 = h1[0] ^ h1[4];
+
+    /* 0x40000 iterations; "& 0xFFFF0" yields offsets that are multiples of 16 below 0x100000. */
+    for (size_t i = 0; __builtin_expect(i < 0x40000, 1); i++) {
+        __m128i cx0 = _mm_load_si128((__m128i *) &l0[idx0 & 0xFFFF0]);
+        __m128i cx1 = _mm_load_si128((__m128i *) &l1[idx1 & 0xFFFF0]);
+
+        cx0 = soft_aesenc(cx0, _mm_set_epi64x(ah0, al0));
+        cx1 = soft_aesenc(cx1, _mm_set_epi64x(ah1, al1));
+
+        _mm_store_si128((__m128i *) &l0[idx0 & 0xFFFF0], _mm_xor_si128(bx0, cx0));
+        _mm_store_si128((__m128i *) &l1[idx1 & 0xFFFF0], _mm_xor_si128(bx1, cx1));
+
+        idx0 = EXTRACT64(cx0);
+        idx1 = EXTRACT64(cx1);
+
+        bx0 = cx0;
+        bx1 = cx1;
+
+        /* Second half of the round for lane 0. */
+        uint64_t hi, lo, cl, ch;
+        cl = ((uint64_t*) &l0[idx0 & 0xFFFF0])[0];
+        ch = ((uint64_t*) &l0[idx0 & 0xFFFF0])[1];
+        lo = _umul128(idx0, cl, &hi);
+
+        al0 += hi;
+        ah0 += lo;
+
+        ((uint64_t*) &l0[idx0 & 0xFFFF0])[0] = al0;
+        ((uint64_t*) &l0[idx0 & 0xFFFF0])[1] = ah0;
+
+        ah0 ^= ch;
+        al0 ^= cl;
+        idx0 = al0;
+
+        /* Same steps for lane 1. */
+        cl = ((uint64_t*) &l1[idx1 & 0xFFFF0])[0];
+        ch = ((uint64_t*) &l1[idx1 & 0xFFFF0])[1];
+        lo = _umul128(idx1, cl, &hi);
+
+        al1 += hi;
+        ah1 += lo;
+
+        ((uint64_t*) &l1[idx1 & 0xFFFF0])[0] = al1;
+        ((uint64_t*) &l1[idx1 & 0xFFFF0])[1] = ah1;
+
+        ah1 ^= ch;
+        al1 ^= cl;
+        idx1 = al1;
+    }
+
+    cn_implode_scratchpad((__m128i*) l0, (__m128i*) h0);
+    cn_implode_scratchpad((__m128i*) l1, (__m128i*) h1);
+
+    keccakf(h0, 24);
+    keccakf(h1, 24);
+
+    /* The low two bits of each state select one of the four final hash functions. */
+    extra_hashes[ctx[0]->state[0] & 3](ctx[0]->state, 200, output);
+    extra_hashes[ctx[1]->state[0] & 3](ctx[1]->state, 200, (char*) output + 32);
+}
+
+
+/*
+ * CryptoNight-Lite variant 1, double hash, software AES (soft_aesenc).
+ *
+ * Hashes two `size`-byte blobs stored back to back in `input` (blob 0 at
+ * input, blob 1 at input + size) using ctx[0] and ctx[1]. The first result is
+ * written at `output`, the second one 32 bytes further. Inputs shorter than
+ * 43 bytes are not hashed; all 64 output bytes are zeroed instead.
+ *
+ * The input offset is computed on a byte pointer: arithmetic directly on
+ * `void *` is a GNU extension and is rejected or warned about elsewhere.
+ *
+ * NOTE(review): cryptonight.c declares this function with uint8_t pointer
+ * parameters while the definition uses void pointers -- confirm which one is
+ * intended and align them.
+ */
+void cryptonight_lite_av4_v1(const void *restrict input, size_t size, void *restrict output, struct cryptonight_ctx **restrict ctx)
+{
+    if (size < 43) {
+        memset(output, 0, 64);
+        return;
+    }
+
+    keccak((const uint8_t *) input,        size, ctx[0]->state, 200);
+    keccak((const uint8_t *) input + size, size, ctx[1]->state, 200);
+
+    /* Macros defined outside this block; presumably they declare tweak1_2_0 and
+     * tweak1_2_1, which the loop below uses -- TODO confirm. */
+    VARIANT1_INIT(0);
+    VARIANT1_INIT(1);
+
+    /* Scratchpads are written through casts below despite the const qualifier. */
+    const uint8_t* l0 = ctx[0]->memory;
+    const uint8_t* l1 = ctx[1]->memory;
+    uint64_t* h0 = (uint64_t*) ctx[0]->state;
+    uint64_t* h1 = (uint64_t*) ctx[1]->state;
+
+    cn_explode_scratchpad((__m128i*) h0, (__m128i*) l0);
+    cn_explode_scratchpad((__m128i*) h1, (__m128i*) l1);
+
+    uint64_t al0 = h0[0] ^ h0[4];
+    uint64_t al1 = h1[0] ^ h1[4];
+    uint64_t ah0 = h0[1] ^ h0[5];
+    uint64_t ah1 = h1[1] ^ h1[5];
+
+    __m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]);
+    __m128i bx1 = _mm_set_epi64x(h1[3] ^ h1[7], h1[2] ^ h1[6]);
+
+    uint64_t idx0 = h0[0] ^ h0[4];
+    uint64_t idx1 = h1[0] ^ h1[4];
+
+    /* 0x40000 iterations; "& 0xFFFF0" yields offsets that are multiples of 16 below 0x100000. */
+    for (size_t i = 0; __builtin_expect(i < 0x40000, 1); i++) {
+        __m128i cx0 = _mm_load_si128((__m128i *) &l0[idx0 & 0xFFFF0]);
+        __m128i cx1 = _mm_load_si128((__m128i *) &l1[idx1 & 0xFFFF0]);
+
+        cx0 = soft_aesenc(cx0, _mm_set_epi64x(ah0, al0));
+        cx1 = soft_aesenc(cx1, _mm_set_epi64x(ah1, al1));
+
+        /* Store bx ^ cx with the bit tweak applied to its upper 64 bits. */
+        cryptonight_monero_tweak((uint64_t*)&l0[idx0 & 0xFFFF0], _mm_xor_si128(bx0, cx0));
+        cryptonight_monero_tweak((uint64_t*)&l1[idx1 & 0xFFFF0], _mm_xor_si128(bx1, cx1));
+
+        idx0 = EXTRACT64(cx0);
+        idx1 = EXTRACT64(cx1);
+
+        bx0 = cx0;
+        bx1 = cx1;
+
+        /* Second half of the round for lane 0. */
+        uint64_t hi, lo, cl, ch;
+        cl = ((uint64_t*) &l0[idx0 & 0xFFFF0])[0];
+        ch = ((uint64_t*) &l0[idx0 & 0xFFFF0])[1];
+        lo = _umul128(idx0, cl, &hi);
+
+        al0 += hi;
+        ah0 += lo;
+
+        /* Only the stored copy is XOR-ed with the tweak; ah0 keeps its value. */
+        ((uint64_t*) &l0[idx0 & 0xFFFF0])[0] = al0;
+        ((uint64_t*) &l0[idx0 & 0xFFFF0])[1] = ah0 ^ tweak1_2_0;
+
+        ah0 ^= ch;
+        al0 ^= cl;
+        idx0 = al0;
+
+        /* Same steps for lane 1. */
+        cl = ((uint64_t*) &l1[idx1 & 0xFFFF0])[0];
+        ch = ((uint64_t*) &l1[idx1 & 0xFFFF0])[1];
+        lo = _umul128(idx1, cl, &hi);
+
+        al1 += hi;
+        ah1 += lo;
+
+        ((uint64_t*) &l1[idx1 & 0xFFFF0])[0] = al1;
+        ((uint64_t*) &l1[idx1 & 0xFFFF0])[1] = ah1 ^ tweak1_2_1;
+
+        ah1 ^= ch;
+        al1 ^= cl;
+        idx1 = al1;
+    }
+
+    cn_implode_scratchpad((__m128i*) l0, (__m128i*) h0);
+    cn_implode_scratchpad((__m128i*) l1, (__m128i*) h1);
+
+    keccakf(h0, 24);
+    keccakf(h1, 24);
+
+    /* The low two bits of each state select one of the four final hash functions. */
+    extra_hashes[ctx[0]->state[0] & 3](ctx[0]->state, 200, output);
+    extra_hashes[ctx[1]->state[0] & 3](ctx[1]->state, 200, (char*) output + 32);
+}
--- a/algo/cryptonight-lite/cryptonight_lite_av4_softaes_double.c
+++ b/algo/cryptonight-lite/cryptonight_lite_av4_softaes_double.c
@ -1,111 +0,0 @@
-/* XMRig
- * Copyright 2010      Jeff Garzik <jgarzik@pobox.com>
- * Copyright 2012-2014 pooler      <pooler@litecoinpool.org>
- * Copyright 2014      Lucas Jones <https://github.com/lucasjones>
- * Copyright 2014-2016 Wolf9466    <https://github.com/OhGodAPet>
- * Copyright 2016      Jay D Dee   <jayddee246@gmail.com>
- * Copyright 2017      fireice-uk  <https://github.com/fireice-uk>
- * Copyright 2016-2017 XMRig       <support@xmrig.com>
- *
- *
- *   This program is free software: you can redistribute it and/or modify
- *   it under the terms of the GNU General Public License as published by
- *   the Free Software Foundation, either version 3 of the License, or
- *   (at your option) any later version.
- *
- *   This program is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY; without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- *   GNU General Public License for more details.
- *
- *   You should have received a copy of the GNU General Public License
- *   along with this program. If not, see <http://www.gnu.org/licenses/>.
- */
-
-#include <x86intrin.h>
-#include <string.h>
-
-#include "algo/cryptonight/cryptonight.h"
-#include "cryptonight_lite_softaes.h"
-#include "crypto/c_keccak.h"
-
-
-void cryptonight_lite_av4_softaes_double(const void *restrict input, size_t size, void *restrict output, struct cryptonight_ctx *restrict ctx)
-{
-    keccak((const uint8_t *) input,        size, ctx->state0, 200);
-    keccak((const uint8_t *) input + size, size, ctx->state1, 200);
-
-    const uint8_t* l0 = ctx->memory;
-    const uint8_t* l1 = ctx->memory + MEMORY_LITE;
-    uint64_t* h0 = (uint64_t*) ctx->state0;
-    uint64_t* h1 = (uint64_t*) ctx->state1;
-
-    cn_explode_scratchpad((__m128i*) h0, (__m128i*) l0);
-    cn_explode_scratchpad((__m128i*) h1, (__m128i*) l1);
-
-    uint64_t al0 = h0[0] ^ h0[4];
-    uint64_t al1 = h1[0] ^ h1[4];
-    uint64_t ah0 = h0[1] ^ h0[5];
-    uint64_t ah1 = h1[1] ^ h1[5];
-
-    __m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]);
-    __m128i bx1 = _mm_set_epi64x(h1[3] ^ h1[7], h1[2] ^ h1[6]);
-
-    uint64_t idx0 = h0[0] ^ h0[4];
-    uint64_t idx1 = h1[0] ^ h1[4];
-
-    for (size_t i = 0; __builtin_expect(i < 0x40000, 1); i++) {
-        __m128i cx0 = _mm_load_si128((__m128i *) &l0[idx0 & 0xFFFF0]);
-        __m128i cx1 = _mm_load_si128((__m128i *) &l1[idx1 & 0xFFFF0]);
-
-        cx0 = soft_aesenc(cx0, _mm_set_epi64x(ah0, al0));
-        cx1 = soft_aesenc(cx1, _mm_set_epi64x(ah1, al1));
-
-        _mm_store_si128((__m128i *) &l0[idx0 & 0xFFFF0], _mm_xor_si128(bx0, cx0));
-        _mm_store_si128((__m128i *) &l1[idx1 & 0xFFFF0], _mm_xor_si128(bx1, cx1));
-
-        idx0 = EXTRACT64(cx0);
-        idx1 = EXTRACT64(cx1);
-
-        bx0 = cx0;
-        bx1 = cx1;
-
-        uint64_t hi, lo, cl, ch;
-        cl = ((uint64_t*) &l0[idx0 & 0xFFFF0])[0];
-        ch = ((uint64_t*) &l0[idx0 & 0xFFFF0])[1];
-        lo = _umul128(idx0, cl, &hi);
-
-        al0 += hi;
-        ah0 += lo;
-
-        ((uint64_t*) &l0[idx0 & 0xFFFF0])[0] = al0;
-        ((uint64_t*) &l0[idx0 & 0xFFFF0])[1] = ah0;
-
-        ah0 ^= ch;
-        al0 ^= cl;
-        idx0 = al0;
-
-        cl = ((uint64_t*) &l1[idx1 & 0xFFFF0])[0];
-        ch = ((uint64_t*) &l1[idx1 & 0xFFFF0])[1];
-        lo = _umul128(idx1, cl, &hi);
-
-        al1 += hi;
-        ah1 += lo;
-
-        ((uint64_t*) &l1[idx1 & 0xFFFF0])[0] = al1;
-        ((uint64_t*) &l1[idx1 & 0xFFFF0])[1] = ah1;
-
-        ah1 ^= ch;
-        al1 ^= cl;
-        idx1 = al1;
-    }
-
-    cn_implode_scratchpad((__m128i*) l0, (__m128i*) h0);
-    cn_implode_scratchpad((__m128i*) l1, (__m128i*) h1);
-
-    keccakf(h0, 24);
-    keccakf(h1, 24);
-
-    extra_hashes[ctx->state0[0] & 3](ctx->state0, 200, output);
-    extra_hashes[ctx->state1[0] & 3](ctx->state1, 200, (char*) output + 32);
-}
--- a/algo/cryptonight-lite/cryptonight_lite_softaes.h
+++ b/algo/cryptonight-lite/cryptonight_lite_softaes.h
@ -4,9 +4,9 @@
 * Copyright 2014      Lucas Jones <https://github.com/lucasjones>
 * Copyright 2014-2016 Wolf9466    <https://github.com/OhGodAPet>
 * Copyright 2016      Jay D Dee   <jayddee246@gmail.com>
- * Copyright 2017      fireice-uk  <https://github.com/fireice-uk>
- * Copyright 2016-2017 XMRig       <support@xmrig.com>
- *
+ * Copyright 2017-2018 XMR-Stak    <https://github.com/fireice-uk>, <https://github.com/psychocrypt>
+ * Copyright 2018-2019 SChernykh   <https://github.com/SChernykh>
+ * Copyright 2016-2019 XMRig       <https://github.com/xmrig>, <support@xmrig.com>
 *
 *   This program is free software: you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
@ -22,13 +22,15 @@
 *   along with this program. If not, see <http://www.gnu.org/licenses/>.
 */

-#ifndef __CRYPTONIGHT_LITE_SOFTAES_H__
-#define __CRYPTONIGHT_LITE_SOFTAES_H__
+#ifndef XMRIG_CRYPTONIGHT_LITE_SOFTAES_H
+#define XMRIG_CRYPTONIGHT_LITE_SOFTAES_H
+

 #include <x86intrin.h>
+#include <stdint.h>
+

-extern __m128i soft_aesenc(__m128i in, __m128i key);
-extern __m128i soft_aeskeygenassist(__m128i key, uint8_t rcon);
+#include "crypto/soft_aes.h"


 // This will shift and xor tmp1 into itself as 4 32-bit vals such as
@ -234,4 +236,20 @@ static inline uint64_t _umul128(uint64_t multiplier, uint64_t multiplicand, uint
 #endif


-#endif /* __CRYPTONIGHT_LITE_SOFTAES_H__ */
+static inline void cryptonight_monero_tweak(uint64_t* mem_out, __m128i tmp)
+{   /* Writes tmp to mem_out[0..1], XOR-ing a 2-bit table value into bits 28-29 of the upper qword. */
+    static const uint16_t lut = 0x7531; /* eight packed 2-bit entries, selected by `shift` */
+    const __m128i upper = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
+    uint64_t hi64 = EXTRACT64(upper);
+    /* Byte 3 of the upper qword drives the tweak: its bits 4-5 and bit 0 form the table index. */
+    const uint8_t sel = hi64 >> 24;
+    const uint8_t shift = (((sel >> 3) & 6) | (sel & 1)) << 1;
+    /* The two stores below are independent of each other and of the tweak computation. */
+    hi64 ^= ((lut >> shift) & 0x3) << 28;
+
+    mem_out[0] = EXTRACT64(tmp);
+    mem_out[1] = hi64;
+}
+
+
+#endif /* XMRIG_CRYPTONIGHT_LITE_SOFTAES_H */
--- a/algo/cryptonight/cryptonight.c
+++ b/algo/cryptonight/cryptonight.c
@ -4,8 +4,10 @@
 * Copyright 2014      Lucas Jones <https://github.com/lucasjones>
 * Copyright 2014-2016 Wolf9466    <https://github.com/OhGodAPet>
 * Copyright 2016      Jay D Dee   <jayddee246@gmail.com>
- * Copyright 2016-2017 XMRig       <support@xmrig.com>
- *
+ * Copyright 2017-2018 XMR-Stak    <https://github.com/fireice-uk>, <https://github.com/psychocrypt>
+ * Copyright 2018      Lee Clagett <https://github.com/vtnerd>
+ * Copyright 2018-2019 SChernykh   <https://github.com/SChernykh>
+ * Copyright 2016-2019 XMRig       <https://github.com/xmrig>, <support@xmrig.com>
 *
 *   This program is free software: you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
@ -22,151 +24,287 @@
 */


+#include <assert.h>
 #include <stdlib.h>
 #include <string.h>
 #include <mm_malloc.h>

+
 #ifndef BUILD_TEST
 #   include "xmrig.h"
 #endif

-#include "crypto/c_groestl.h"
+#include "cpu.h"
 #include "crypto/c_blake256.h"
+#include "crypto/c_groestl.h"
 #include "crypto/c_jh.h"
 #include "crypto/c_skein.h"
+#include "cryptonight_test.h"
 #include "cryptonight.h"
 #include "options.h"
+#include "persistent_memory.h"


-const static char test_input[152] = {
-    0x01, 0x00, 0xFB, 0x8E, 0x8A, 0xC8, 0x05, 0x89, 0x93, 0x23, 0x37, 0x1B, 0xB7, 0x90, 0xDB, 0x19,
-    0x21, 0x8A, 0xFD, 0x8D, 0xB8, 0xE3, 0x75, 0x5D, 0x8B, 0x90, 0xF3, 0x9B, 0x3D, 0x55, 0x06, 0xA9,
-    0xAB, 0xCE, 0x4F, 0xA9, 0x12, 0x24, 0x45, 0x00, 0x00, 0x00, 0x00, 0xEE, 0x81, 0x46, 0xD4, 0x9F,
-    0xA9, 0x3E, 0xE7, 0x24, 0xDE, 0xB5, 0x7D, 0x12, 0xCB, 0xC6, 0xC6, 0xF3, 0xB9, 0x24, 0xD9, 0x46,
-    0x12, 0x7C, 0x7A, 0x97, 0x41, 0x8F, 0x93, 0x48, 0x82, 0x8F, 0x0F, 0x02,
-    0x03, 0x05, 0xA0, 0xDB, 0xD6, 0xBF, 0x05, 0xCF, 0x16, 0xE5, 0x03, 0xF3, 0xA6, 0x6F, 0x78, 0x00,
-    0x7C, 0xBF, 0x34, 0x14, 0x43, 0x32, 0xEC, 0xBF, 0xC2, 0x2E, 0xD9, 0x5C, 0x87, 0x00, 0x38, 0x3B,
-    0x30, 0x9A, 0xCE, 0x19, 0x23, 0xA0, 0x96, 0x4B, 0x00, 0x00, 0x00, 0x08, 0xBA, 0x93, 0x9A, 0x62,
-    0x72, 0x4C, 0x0D, 0x75, 0x81, 0xFC, 0xE5, 0x76, 0x1E, 0x9D, 0x8A, 0x0E, 0x6A, 0x1C, 0x3F, 0x92,
-    0x4F, 0xDD, 0x84, 0x93, 0xD1, 0x11, 0x56, 0x49, 0xC0, 0x5E, 0xB6, 0x01
-};
+static cn_hash_fun asm_func_map[AV_MAX][VARIANT_MAX][ASM_MAX] = {};


-const static char test_output0[64] = {
-    0x1B, 0x60, 0x6A, 0x3F, 0x4A, 0x07, 0xD6, 0x48, 0x9A, 0x1B, 0xCD, 0x07, 0x69, 0x7B, 0xD1, 0x66,
-    0x96, 0xB6, 0x1C, 0x8A, 0xE9, 0x82, 0xF6, 0x1A, 0x90, 0x16, 0x0F, 0x4E, 0x52, 0x82, 0x8A, 0x7F,
-    0x1A, 0x3F, 0xFB, 0xEE, 0x90, 0x9B, 0x42, 0x0D, 0x91, 0xF7, 0xBE, 0x6E, 0x5F, 0xB5, 0x6D, 0xB7,
-    0x1B, 0x31, 0x10, 0xD8, 0x86, 0x01, 0x1E, 0x87, 0x7E, 0xE5, 0x78, 0x6A, 0xFD, 0x08, 0x01, 0x00
-};
+void cryptonight_av1_v0(const uint8_t *input, size_t size, uint8_t *output, struct cryptonight_ctx **ctx);
+void cryptonight_av1_v1(const uint8_t *input, size_t size, uint8_t *output, struct cryptonight_ctx **ctx);
+void cryptonight_av1_v2(const uint8_t *input, size_t size, uint8_t *output, struct cryptonight_ctx **ctx);
+void cryptonight_av2_v0(const uint8_t *input, size_t size, uint8_t *output, struct cryptonight_ctx **ctx);
+void cryptonight_av2_v1(const uint8_t *input, size_t size, uint8_t *output, struct cryptonight_ctx **ctx);
+void cryptonight_av2_v2(const uint8_t *input, size_t size, uint8_t *output, struct cryptonight_ctx **ctx);
+void cryptonight_av3_v0(const uint8_t *input, size_t size, uint8_t *output, struct cryptonight_ctx **ctx);
+void cryptonight_av3_v1(const uint8_t *input, size_t size, uint8_t *output, struct cryptonight_ctx **ctx);
+void cryptonight_av3_v2(const uint8_t *input, size_t size, uint8_t *output, struct cryptonight_ctx **ctx);
+void cryptonight_av4_v0(const uint8_t *input, size_t size, uint8_t *output, struct cryptonight_ctx **ctx);
+void cryptonight_av4_v1(const uint8_t *input, size_t size, uint8_t *output, struct cryptonight_ctx **ctx);
+void cryptonight_av4_v2(const uint8_t *input, size_t size, uint8_t *output, struct cryptonight_ctx **ctx);

+void cryptonight_r_av1(const uint8_t *input, size_t size, uint8_t *output, struct cryptonight_ctx **ctx);
+void cryptonight_r_av2(const uint8_t *input, size_t size, uint8_t *output, struct cryptonight_ctx **ctx);
+void cryptonight_r_av3(const uint8_t *input, size_t size, uint8_t *output, struct cryptonight_ctx **ctx);
+void cryptonight_r_av4(const uint8_t *input, size_t size, uint8_t *output, struct cryptonight_ctx **ctx);

-void cryptonight_av1_aesni(const void* input, size_t size, void* output, struct cryptonight_ctx* ctx);
-void cryptonight_av2_aesni_double(const void* input, size_t size, void* output, struct cryptonight_ctx* ctx);
-void cryptonight_av3_softaes(const void* input, size_t size, void* output, struct cryptonight_ctx* ctx);
-void cryptonight_av4_softaes_double(const void* input, size_t size, void* output, struct cryptonight_ctx* ctx);

 #ifndef XMRIG_NO_AEON
-const static char test_output1[64] = {
-    0x28, 0xA2, 0x2B, 0xAD, 0x3F, 0x93, 0xD1, 0x40, 0x8F, 0xCA, 0x47, 0x2E, 0xB5, 0xAD, 0x1C, 0xBE,
-    0x75, 0xF2, 0x1D, 0x05, 0x3C, 0x8C, 0xE5, 0xB3, 0xAF, 0x10, 0x5A, 0x57, 0x71, 0x3E, 0x21, 0xDD,
-    0x36, 0x95, 0xB4, 0xB5, 0x3B, 0xB0, 0x03, 0x58, 0xB0, 0xAD, 0x38, 0xDC, 0x16, 0x0F, 0xEB, 0x9E,
-    0x00, 0x4E, 0xEC, 0xE0, 0x9B, 0x83, 0xA7, 0x2E, 0xF6, 0xBA, 0x98, 0x64, 0xD3, 0x51, 0x0C, 0x88,
-};
-
-void cryptonight_lite_av1_aesni(const void* input, size_t size, void* output, struct cryptonight_ctx* ctx);
-void cryptonight_lite_av2_aesni_double(const void* input, size_t size, void* output, struct cryptonight_ctx* ctx);
-void cryptonight_lite_av3_softaes(const void* input, size_t size, void* output, struct cryptonight_ctx* ctx);
-void cryptonight_lite_av4_softaes_double(const void* input, size_t size, void* output, struct cryptonight_ctx* ctx);
+void cryptonight_lite_av1_v0(const uint8_t *input, size_t size, uint8_t *output, struct cryptonight_ctx **ctx);
+void cryptonight_lite_av1_v1(const uint8_t *input, size_t size, uint8_t *output, struct cryptonight_ctx **ctx);
+void cryptonight_lite_av2_v0(const uint8_t *input, size_t size, uint8_t *output, struct cryptonight_ctx **ctx);
+void cryptonight_lite_av2_v1(const uint8_t *input, size_t size, uint8_t *output, struct cryptonight_ctx **ctx);
+void cryptonight_lite_av3_v0(const uint8_t *input, size_t size, uint8_t *output, struct cryptonight_ctx **ctx);
+void cryptonight_lite_av3_v1(const uint8_t *input, size_t size, uint8_t *output, struct cryptonight_ctx **ctx);
+void cryptonight_lite_av4_v0(const uint8_t *input, size_t size, uint8_t *output, struct cryptonight_ctx **ctx);
+void cryptonight_lite_av4_v1(const uint8_t *input, size_t size, uint8_t *output, struct cryptonight_ctx **ctx);
 #endif

-void (*cryptonight_hash_ctx)(const void* input, size_t size, void* output, struct cryptonight_ctx* ctx) = NULL;

+#ifndef XMRIG_NO_ASM
+void cryptonight_single_hash_asm_intel(const uint8_t *input, size_t size, uint8_t *output, struct cryptonight_ctx **ctx);
+void cryptonight_single_hash_asm_ryzen(const uint8_t *input, size_t size, uint8_t *output, struct cryptonight_ctx **ctx);
+void cryptonight_single_hash_asm_bulldozer(const uint8_t *input, size_t size, uint8_t *output, struct cryptonight_ctx **ctx);
+void cryptonight_double_hash_asm(const uint8_t *input, size_t size, uint8_t *output, struct cryptonight_ctx **ctx);

-static bool self_test() {
-    if (cryptonight_hash_ctx == NULL) {
+void cryptonight_r_av1_asm_intel(const uint8_t *input, size_t size, uint8_t *output, struct cryptonight_ctx **ctx);
+void cryptonight_r_av1_asm_bulldozer(const uint8_t *input, size_t size, uint8_t *output, struct cryptonight_ctx **ctx);
+void cryptonight_r_av2_asm_intel(const uint8_t *input, size_t size, uint8_t *output, struct cryptonight_ctx **ctx);
+void cryptonight_r_av2_asm_bulldozer(const uint8_t *input, size_t size, uint8_t *output, struct cryptonight_ctx **ctx);
+#endif
+
+
+static inline bool verify(enum Variant variant, uint8_t *output, struct cryptonight_ctx **ctx, const uint8_t *referenceValue)
+{   /* Self-test helper: hashes the fixed test_input blob (76 bytes) and compares the result with referenceValue. */
+    const cn_hash_fun hash = cryptonight_hash_fn(opt_algo, opt_av, variant);
+    const size_t digest_bytes = opt_double_hash ? 64 : 32; /* 64 bytes are compared in double-hash mode */
+    if (!hash) {
+        return false;
+    }
+
+    hash(test_input, 76, output, ctx);
+    return !memcmp(output, referenceValue, digest_bytes);
+}
+
+
+static inline bool verify2(enum Variant variant, uint8_t *output, struct cryptonight_ctx **ctx, const uint8_t *referenceValue)
+{ /* Self-test helper: runs every cn_r_test_input vector through the selected hash routine and compares each 32-byte result with referenceValue + i * 32. */
+    cn_hash_fun func = cryptonight_hash_fn(opt_algo, opt_av, variant);
+    if (func == NULL) {
        return false;
    }

-    char output[64];
+    if (opt_double_hash) { /* double mode: the same vector is fed twice, back to back */
+        uint8_t input[128]; /* NOTE(review): both copies fit only if cn_r_test_input[i].size <= 64 -- confirm against cryptonight_test.h */

-    struct cryptonight_ctx *ctx = (struct cryptonight_ctx*) _mm_malloc(sizeof(struct cryptonight_ctx), 16);
-    ctx->memory = (uint8_t *) _mm_malloc(MEMORY * 2, 16);
+        for (size_t i = 0; i < (sizeof(cn_r_test_input) / sizeof(cn_r_test_input[0])); ++i) {
+            const size_t size = cn_r_test_input[i].size;
+            memcpy(input,        cn_r_test_input[i].data, size);
+            memcpy(input + size, cn_r_test_input[i].data, size);

-    cryptonight_hash_ctx(test_input, 76, output, ctx);
+            ctx[0]->height = ctx[1]->height = cn_r_test_input[i].height; /* each vector carries its own height, set on both contexts */

-    _mm_free(ctx->memory);
-    _mm_free(ctx);
+            func(input, size, output, ctx);

-#   ifndef XMRIG_NO_AEON
-    if (opt_algo == ALGO_CRYPTONIGHT_LITE) {
-        return memcmp(output, test_output1, (opt_double_hash ? 64 : 32)) == 0;
+            if (memcmp(output, referenceValue + i * 32, 32) != 0 || memcmp(output + 32, referenceValue + i * 32, 32) != 0) { /* both halves must equal the same reference hash */
+                return false;
+            }
+        }
+    }
+    else { /* single mode: hash each vector directly from the table */
+        for (size_t i = 0; i < (sizeof(cn_r_test_input) / sizeof(cn_r_test_input[0])); ++i) {
+            ctx[0]->height = cn_r_test_input[i].height;
+
+            func(cn_r_test_input[i].data, cn_r_test_input[i].size, output, ctx);
+
+            if (memcmp(output, referenceValue + i * 32, 32) != 0) {
+                return false;
+            }
+        }
    }
-#   endif

-    return memcmp(output, test_output0, (opt_double_hash ? 64 : 32)) == 0;
+    return true; /* reached only when every vector matched */
 }


-#ifndef XMRIG_NO_AEON
-bool cryptonight_lite_init(int variant) {
-    switch (variant) {
-        case AEON_AV1_AESNI:
-            cryptonight_hash_ctx = cryptonight_lite_av1_aesni;
-            break;
+/*
+ * Runs the built-in known-answer checks for the configured algorithm.
+ *
+ * Allocates one context (two when opt_double_hash is set) with a scratchpad of
+ * MEMORY or MEMORY_LITE bytes, runs the verify()/verify2() checks that belong
+ * to opt_algo, releases everything again and returns true only if every check
+ * passed. A failed allocation is reported as a failed self-test instead of
+ * dereferencing a NULL pointer.
+ */
+static bool self_test(void) {
+    struct cryptonight_ctx *ctx[2] = { NULL, NULL };
+    uint8_t output[64];

-        case AEON_AV2_AESNI_DOUBLE:
-            opt_double_hash = true;
-            cryptonight_hash_ctx = cryptonight_lite_av2_aesni_double;
-            break;
+    const size_t count = opt_double_hash ? 2 : 1;
+    const size_t size  = opt_algo == ALGO_CRYPTONIGHT ? MEMORY : MEMORY_LITE;
+    bool result = false;
+    bool ready  = true; /* cleared as soon as one allocation fails */

-        case AEON_AV3_SOFT_AES:
-            cryptonight_hash_ctx = cryptonight_lite_av3_softaes;
-            break;
+    for (size_t i = 0; i < count; ++i) {
+        ctx[i] = _mm_malloc(sizeof(struct cryptonight_ctx), 16);
+        if (ctx[i] == NULL) {
+            ready = false;
+            break;
+        }
+
+        ctx[i]->memory = _mm_malloc(size, 16);
+        if (ctx[i]->memory == NULL) {
+            ready = false;
+            break;
+        }

-        case AEON_AV4_SOFT_AES_DOUBLE:
-            opt_double_hash = true;
-            cryptonight_hash_ctx = cryptonight_lite_av4_softaes_double;
-            break;
+        init_cn_r(ctx[i]);
+    }

-        default:
-            break;
+    if (opt_algo == ALGO_CRYPTONIGHT) {
+        result = ready &&
+                 verify(VARIANT_0,  output, ctx, test_output_v0) &&
+                 verify(VARIANT_1,  output, ctx, test_output_v1) &&
+                 verify(VARIANT_2,  output, ctx, test_output_v2) &&
+                 verify2(VARIANT_4, output, ctx, test_output_r);
+    }
+#   ifndef XMRIG_NO_AEON
+    else {
+        result = ready &&
+                 verify(VARIANT_0, output, ctx, test_output_v0_lite) &&
+                 verify(VARIANT_1, output, ctx, test_output_v1_lite);
    }
+#   endif

-    return self_test();
+
+    /* Release whatever was allocated, including after a partial failure. */
+    for (size_t i = 0; i < count; ++i) {
+        if (ctx[i] == NULL) {
+            continue;
+        }
+
+        if (ctx[i]->memory != NULL) {
+            _mm_free(ctx[i]->memory);
+        }
+
+        _mm_free(ctx[i]);
+    }
+
+    return result;
+}
+
+
+#ifndef XMRIG_NO_ASM
+cn_hash_fun cryptonight_hash_asm_fn(enum AlgoVariant av, enum Variant variant, enum Assembly assembly)
+{
+    /* ASM_AUTO defers to the value stored in cpu_info; ASM_NONE means no assembly routine is used. */
+    const enum Assembly chosen = (assembly == ASM_AUTO) ? (enum Assembly) cpu_info.assembly : assembly;
+
+    return (chosen == ASM_NONE) ? NULL : asm_func_map[av][variant][chosen];
 }
 #endif


-bool cryptonight_init(int variant)
+/*
+ * Returns the hash routine for the given algorithm, algorithm variation (av)
+ * and PoW variant.
+ *
+ * For ALGO_CRYPTONIGHT an assembly routine from cryptonight_hash_asm_fn() is
+ * preferred when one is registered (unless XMRIG_NO_ASM is defined). Otherwise
+ * the routine is taken from func_table, which holds VARIANT_MAX groups of four
+ * entries (one per av) for each of the two algorithms:
+ *
+ *     index = VARIANT_MAX * 4 * algorithm + 4 * variant + av - 1
+ *
+ * The index is range-checked by assert() before the table is read; with NDEBUG
+ * defined the asserts compile out and the entry is returned unchecked.
+ */
+cn_hash_fun cryptonight_hash_fn(enum Algo algorithm, enum AlgoVariant av, enum Variant variant)
 {
-#   ifndef XMRIG_NO_AEON
-    if (opt_algo == ALGO_CRYPTONIGHT_LITE) {
-        return cryptonight_lite_init(variant);
+    assert(av > AV_AUTO && av < AV_MAX);
+    assert(variant > VARIANT_AUTO && variant < VARIANT_MAX);
+
+#   ifndef XMRIG_NO_ASM
+    if (algorithm == ALGO_CRYPTONIGHT) {
+        cn_hash_fun fun = cryptonight_hash_asm_fn(av, variant, opt_assembly);
+        if (fun) {
+            return fun;
+        }
    }
 #   endif

-    switch (variant) {
-        case XMR_AV1_AESNI:
-            cryptonight_hash_ctx = cryptonight_av1_aesni;
-            break;
+    static const cn_hash_fun func_table[VARIANT_MAX * 4 * 2] = {
+        cryptonight_av1_v0,
+        cryptonight_av2_v0,
+        cryptonight_av3_v0,
+        cryptonight_av4_v0,
+        cryptonight_av1_v1,
+        cryptonight_av2_v1,
+        cryptonight_av3_v1,
+        cryptonight_av4_v1,
+        cryptonight_av1_v2,
+        cryptonight_av2_v2,
+        cryptonight_av3_v2,
+        cryptonight_av4_v2,
+
+        cryptonight_r_av1,
+        cryptonight_r_av2,
+        cryptonight_r_av3,
+        cryptonight_r_av4,
+
+#       ifndef XMRIG_NO_AEON
+        cryptonight_lite_av1_v0,
+        cryptonight_lite_av2_v0,
+        cryptonight_lite_av3_v0,
+        cryptonight_lite_av4_v0,
+        cryptonight_lite_av1_v1,
+        cryptonight_lite_av2_v1,
+        cryptonight_lite_av3_v1,
+        cryptonight_lite_av4_v1,
+        NULL,
+        NULL,
+        NULL,
+        NULL,
+        NULL,
+        NULL,
+        NULL,
+        NULL,
+#       else
+        NULL,
+        NULL,
+        NULL,
+        NULL,
+        NULL,
+        NULL,
+        NULL,
+        NULL,
+        NULL,
+        NULL,
+        NULL,
+        NULL,
+        NULL,
+        NULL,
+        NULL,
+        NULL,
+#       endif
+    };
+
+    const size_t index = VARIANT_MAX * 4 * algorithm + 4 * variant + av - 1;
+
+    /* Check the bound first: the table must not be read with a bad index. */
+    assert(index < sizeof(func_table) / sizeof(func_table[0]));
+
+    const cn_hash_fun func = func_table[index];
+    assert(func != NULL);
+
+    return func;
+}

-        case XMR_AV2_AESNI_DOUBLE:
-            opt_double_hash = true;
-            cryptonight_hash_ctx = cryptonight_av2_aesni_double;
-            break;

-        case XMR_AV3_SOFT_AES:
-            cryptonight_hash_ctx = cryptonight_av3_softaes;
-            break;
+bool cryptonight_init(int av)
+{ /* Records whether `av` is a double-hash variation, fills asm_func_map (unless XMRIG_NO_ASM is defined) and returns the result of self_test(). */
+    opt_double_hash = av == AV_DOUBLE || av == AV_DOUBLE_SOFT; /* true only for the two double-hash variations */

-        case XMR_AV4_SOFT_AES_DOUBLE:
-            opt_double_hash = true;
-            cryptonight_hash_ctx = cryptonight_av4_softaes_double;
-            break;
+#   ifndef XMRIG_NO_ASM
+    asm_func_map[AV_SINGLE][VARIANT_2][ASM_INTEL]     = cryptonight_single_hash_asm_intel;
+    asm_func_map[AV_SINGLE][VARIANT_2][ASM_RYZEN]     = cryptonight_single_hash_asm_intel; /* NOTE(review): maps to the Intel routine although cryptonight_single_hash_asm_ryzen is declared in this file -- confirm this is intended */
+    asm_func_map[AV_SINGLE][VARIANT_2][ASM_BULLDOZER] = cryptonight_single_hash_asm_bulldozer;

-        default:
-            break;
-    }
+    asm_func_map[AV_DOUBLE][VARIANT_2][ASM_INTEL]     = cryptonight_double_hash_asm; /* the double-hash variant 2 routine is shared by all three assembly kinds */
+    asm_func_map[AV_DOUBLE][VARIANT_2][ASM_RYZEN]     = cryptonight_double_hash_asm;
+    asm_func_map[AV_DOUBLE][VARIANT_2][ASM_BULLDOZER] = cryptonight_double_hash_asm;
+
+    asm_func_map[AV_SINGLE][VARIANT_4][ASM_INTEL]     = cryptonight_r_av1_asm_intel;
+    asm_func_map[AV_SINGLE][VARIANT_4][ASM_RYZEN]     = cryptonight_r_av1_asm_intel; /* only _intel and _bulldozer prototypes exist for the r routines, so RYZEN reuses _intel */
+    asm_func_map[AV_SINGLE][VARIANT_4][ASM_BULLDOZER] = cryptonight_r_av1_asm_bulldozer;
+
+    asm_func_map[AV_DOUBLE][VARIANT_4][ASM_INTEL]     = cryptonight_r_av2_asm_intel;
+    asm_func_map[AV_DOUBLE][VARIANT_4][ASM_RYZEN]     = cryptonight_r_av2_asm_intel;
+    asm_func_map[AV_DOUBLE][VARIANT_4][ASM_BULLDOZER] = cryptonight_r_av2_asm_bulldozer;
+#   endif /* combinations not assigned above stay NULL, so cryptonight_hash_fn falls back to func_table for them */

    return self_test();
 }
@ -195,12 +333,36 @@ static inline void do_skein_hash(const void* input, size_t len, char* output) {
 void (* const extra_hashes[4])(const void *, size_t, char *) = {do_blake_hash, do_groestl_hash, do_jh_hash, do_skein_hash};


+/*
+ * Picks the PoW variant to use for a blob.
+ *
+ * version - value the caller reads from the first byte of the blob
+ *
+ * An explicitly configured opt_variant (anything other than VARIANT_AUTO)
+ * always wins. Otherwise CryptoNight-Lite always uses VARIANT_1, and for the
+ * remaining case the variant is derived from `version`:
+ * >= 10 -> VARIANT_4, 8..9 -> VARIANT_2, 7 -> VARIANT_1, else VARIANT_0.
+ */
+static inline enum Variant cryptonight_variant(uint8_t version)
+{
+    enum Variant selected;
+
+    if (opt_variant != VARIANT_AUTO) {
+        selected = opt_variant;
+    }
+    else if (opt_algo == ALGO_CRYPTONIGHT_LITE) {
+        selected = VARIANT_1;
+    }
+    else if (version >= 10) {
+        selected = VARIANT_4;
+    }
+    else if (version >= 8) {
+        selected = VARIANT_2;
+    }
+    else if (version == 7) {
+        selected = VARIANT_1;
+    }
+    else {
+        selected = VARIANT_0;
+    }
+
+    return selected;
+}
+
+
 #ifndef BUILD_TEST
-int scanhash_cryptonight(int thr_id, uint32_t *hash, uint32_t *restrict blob, size_t blob_size, uint32_t target, uint32_t max_nonce, unsigned long *restrict hashes_done, struct cryptonight_ctx *restrict ctx) {
-    uint32_t *nonceptr = (uint32_t*) (((char*) blob) + 39);
+int scanhash_cryptonight(int thr_id, uint32_t *hash, uint8_t *restrict blob, size_t blob_size, uint32_t target, uint32_t max_nonce, unsigned long *restrict hashes_done, struct cryptonight_ctx **restrict ctx) {
+    uint32_t *nonceptr   = (uint32_t*) (((char*) blob) + 39);
+    enum Variant variant = cryptonight_variant(blob[0]);

    do {
-        cryptonight_hash_ctx(blob, blob_size, hash, ctx);
+        cryptonight_hash_fn(opt_algo, opt_av, variant)(blob, blob_size, (uint8_t *) hash, ctx);
+
        (*hashes_done)++;

        if (unlikely(hash[7] < target)) {
@ -214,13 +376,14 @@ int scanhash_cryptonight(int thr_id, uint32_t *hash, uint32_t *restrict blob, si
 }


-int scanhash_cryptonight_double(int thr_id, uint32_t *hash, uint8_t *restrict blob, size_t blob_size, uint32_t target, uint32_t max_nonce, unsigned long *restrict hashes_done, struct cryptonight_ctx *restrict ctx) {
-    int rc = 0;
-    uint32_t *nonceptr0 = (uint32_t*) (((char*) blob) + 39);
-    uint32_t *nonceptr1 = (uint32_t*) (((char*) blob) + 39 + blob_size);
+int scanhash_cryptonight_double(int thr_id, uint32_t *hash, uint8_t *restrict blob, size_t blob_size, uint32_t target, uint32_t max_nonce, unsigned long *restrict hashes_done, struct cryptonight_ctx **restrict ctx) {
+    int rc               = 0;
+    uint32_t *nonceptr0  = (uint32_t*) (((char*) blob) + 39);
+    uint32_t *nonceptr1  = (uint32_t*) (((char*) blob) + 39 + blob_size);
+    enum Variant variant = cryptonight_variant(blob[0]);

    do {
-        cryptonight_hash_ctx(blob, blob_size, hash, ctx);
+        cryptonight_hash_fn(opt_algo, opt_av, variant)(blob, blob_size, (uint8_t *) hash, ctx);
        (*hashes_done) += 2;

        if (unlikely(hash[7] < target)) {
--- a/algo/cryptonight/cryptonight.h
+++ b/algo/cryptonight/cryptonight.h
@ -4,8 +4,10 @@
 * Copyright 2014      Lucas Jones <https://github.com/lucasjones>
 * Copyright 2014-2016 Wolf9466    <https://github.com/OhGodAPet>
 * Copyright 2016      Jay D Dee   <jayddee246@gmail.com>
- * Copyright 2016-2017 XMRig       <support@xmrig.com>
- *
+ * Copyright 2017-2018 XMR-Stak    <https://github.com/fireice-uk>, <https://github.com/psychocrypt>
+ * Copyright 2018      Lee Clagett <https://github.com/vtnerd>
+ * Copyright 2018-2019 SChernykh   <https://github.com/SChernykh>
+ * Copyright 2016-2019 XMRig       <https://github.com/xmrig>, <support@xmrig.com>
 *
 *   This program is free software: you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
@ -21,27 +23,59 @@
 *   along with this program. If not, see <http://www.gnu.org/licenses/>.
 */

-#ifndef __CRYPTONIGHT_H__
-#define __CRYPTONIGHT_H__
+#ifndef XMRIG_CRYPTONIGHT_H
+#define XMRIG_CRYPTONIGHT_H
+

 #include <stddef.h>
 #include <stdint.h>
 #include <stdbool.h>

+
+#include "options.h"
+
+
 #define MEMORY      2097152 /* 2 MiB */
 #define MEMORY_LITE 1048576 /* 1 MiB */

+
+/*
+ * ABI_ATTRIBUTE expands to __attribute__((ms_abi)) except when building with
+ * MSVC or for ARM (XMRIG_ARM), where it expands to nothing.
+ */
+#if defined _MSC_VER || defined XMRIG_ARM
+#define ABI_ATTRIBUTE
+#else
+#define ABI_ATTRIBUTE __attribute__((ms_abi))
+#endif
+
+
+/*
+ * Pointer types for main-loop routines that operate on one or two contexts.
+ * Both carry ABI_ATTRIBUTE, i.e. they are called with the ms_abi convention
+ * wherever that attribute is enabled.
+ */
+struct cryptonight_ctx;
+typedef void(*cn_mainloop_fun_ms_abi)(struct cryptonight_ctx*) ABI_ATTRIBUTE;
+typedef void(*cn_mainloop_double_fun_ms_abi)(struct cryptonight_ctx*, struct cryptonight_ctx*) ABI_ATTRIBUTE;
+
+
 struct cryptonight_ctx {
-    uint8_t state0[200] __attribute__((aligned(16)));
-    uint8_t state1[200] __attribute__((aligned(16)));
-    uint8_t* memory     __attribute__((aligned(16)));
+    /* Per-hash working context; the double-hash routines use two of these (ctx[0], ctx[1]). */
+
+    /* Hash state buffer; the hash routines fill and finalize its first 200 bytes. */
+    uint8_t state[224] __attribute__((aligned(16)));
+    /* Scratchpad that the hash routines explode the state into and implode from. */
+    uint8_t *memory    __attribute__((aligned(16)));
+
+    /* NOTE(review): presumably padding that fixes the offsets of the fields below — confirm. */
+    uint8_t unused[40];
+    /* NOTE(review): presumably a lookup table for the software AES path — confirm. */
+    const uint32_t *saes_table;
+
+    /*
+     * Main-loop function pointers (single and double context) with the heights
+     * recorded for them.
+     * NOTE(review): presumably runtime-generated code keyed by block height — confirm.
+     */
+    cn_mainloop_fun_ms_abi generated_code;
+    cn_mainloop_double_fun_ms_abi generated_code_double;
+    uint64_t generated_code_height;
+    uint64_t generated_code_double_height;
+    uint64_t height;
 };


+/*
+ * Common signature of every hash implementation: hashes `size` bytes at
+ * `input` into `output`, using the context array `ctx` (ctx[0] for single
+ * hash, ctx[0] and ctx[1] for double hash).
+ */
+typedef void (*cn_hash_fun)(const uint8_t *input, size_t size, uint8_t *output, struct cryptonight_ctx **ctx);
+
+
 extern void (* const extra_hashes[4])(const void *, size_t, char *);

-bool cryptonight_init(int variant);
-int scanhash_cryptonight(int thr_id, uint32_t *hash, uint32_t *restrict blob, size_t blob_size, uint32_t target, uint32_t max_nonce, unsigned long *restrict hashes_done, struct cryptonight_ctx *restrict ctx);
-int scanhash_cryptonight_double(int thr_id, uint32_t *hash, uint8_t *restrict blob, size_t blob_size, uint32_t target, uint32_t max_nonce, unsigned long *restrict hashes_done, struct cryptonight_ctx *restrict ctx);
+/* Returns the hash implementation registered for (algorithm, av, variant). */
+cn_hash_fun cryptonight_hash_fn(enum Algo algorithm, enum AlgoVariant av, enum Variant variant);
+
+/* Configures single/double hashing for `av` and returns the self-test result. */
+bool cryptonight_init(int av);
+/*
+ * Nonce scanners for one blob and for two consecutive blobs of blob_size
+ * bytes each; both compare hash[7] against `target` and update *hashes_done.
+ */
+int scanhash_cryptonight(int thr_id, uint32_t *hash, uint8_t *blob, size_t blob_size, uint32_t target, uint32_t max_nonce, unsigned long *hashes_done, struct cryptonight_ctx **ctx);
+int scanhash_cryptonight_double(int thr_id, uint32_t *hash, uint8_t *blob, size_t blob_size, uint32_t target, uint32_t max_nonce, unsigned long *hashes_done, struct cryptonight_ctx **ctx);
+

-#endif /* __CRYPTONIGHT_H__ */
+#endif /* XMRIG_CRYPTONIGHT_H */
--- a/algo/cryptonight/cryptonight_aesni.h
+++ b/algo/cryptonight/cryptonight_aesni.h
@ -22,10 +22,12 @@
 *   along with this program. If not, see <http://www.gnu.org/licenses/>.
 */

-#ifndef __CRYPTONIGHT_AESNI_H__
-#define __CRYPTONIGHT_AESNI_H__
+#ifndef XMRIG_CRYPTONIGHT_AESNI_H
+#define XMRIG_CRYPTONIGHT_AESNI_H
+

 #include <x86intrin.h>
+#include <stdint.h>


 #define aes_genkey_sub(imm8) \
@ -253,4 +255,20 @@ static inline uint64_t _umul128(uint64_t multiplier, uint64_t multiplicand, uint
 #endif


-#endif /* __CRYPTONIGHT_AESNI_H__ */
+/*
+ * Writes the 128-bit value `tmp` to mem_out[0] and mem_out[1], tweaking the
+ * upper 64-bit half on the way.
+ *
+ * The low half is stored unchanged. For the high half, a two-bit value is
+ * looked up in the packed table 0x7531 (two bits per entry) using bits 24,
+ * 28 and 29 of that half, and XORed into its bits 28..29.
+ *
+ * mem_out - destination, must have room for two uint64_t values
+ * tmp     - 128-bit value to store
+ */
+static inline void cryptonight_monero_tweak(uint64_t* mem_out, __m128i tmp)
+{
+    /* Move the high 64 bits of tmp into the low lane so EXTRACT64 can read them. */
+    const __m128i high_lane = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
+    uint64_t upper = EXTRACT64(high_lane);
+
+    static const uint16_t table = 0x7531;
+    const uint8_t selector = upper >> 24;
+    const uint8_t shift    = (((selector >> 3) & 6) | (selector & 1)) << 1;
+
+    upper ^= ((table >> shift) & 0x3) << 28;
+
+    mem_out[0] = EXTRACT64(tmp);
+    mem_out[1] = upper;
+}
+
+
+#endif /* XMRIG_CRYPTONIGHT_AESNI_H */
--- a/algo/cryptonight/cryptonight_av1.c
+++ b/algo/cryptonight/cryptonight_av1.c
@ -0,0 +1,261 @@
+/* XMRig
+ * Copyright 2010      Jeff Garzik <jgarzik@pobox.com>
+ * Copyright 2012-2014 pooler      <pooler@litecoinpool.org>
+ * Copyright 2014      Lucas Jones <https://github.com/lucasjones>
+ * Copyright 2014-2016 Wolf9466    <https://github.com/OhGodAPet>
+ * Copyright 2016      Jay D Dee   <jayddee246@gmail.com>
+ * Copyright 2017      fireice-uk  <https://github.com/fireice-uk>
+ * Copyright 2017-2018 XMR-Stak    <https://github.com/fireice-uk>, <https://github.com/psychocrypt>
+ * Copyright 2018      Lee Clagett <https://github.com/vtnerd>
+ * Copyright 2016-2018 XMRig       <https://github.com/xmrig>, <support@xmrig.com>
+ *
+ *   This program is free software: you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation, either version 3 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <x86intrin.h>
+#include <string.h>
+
+#include "crypto/c_keccak.h"
+#include "cryptonight.h"
+#include "cryptonight_aesni.h"
+#include "cryptonight_monero.h"
+
+
+/*
+ * Hashes one blob with the untweaked main loop (no VARIANT1/VARIANT2 macros),
+ * using the AES-NI intrinsic _mm_aesenc_si128.
+ *
+ * input  - blob to hash
+ * size   - blob length in bytes
+ * output - receives the digest written by the selected extra_hashes[] entry
+ * ctx    - context array; only ctx[0] (state and scratchpad memory) is used
+ */
+void cryptonight_av1_v0(const uint8_t *restrict input, size_t size, uint8_t *restrict output, struct cryptonight_ctx **restrict ctx)
+{
+    keccak(input, size, ctx[0]->state, 200);
+
+    cn_explode_scratchpad((__m128i*) ctx[0]->state, (__m128i*) ctx[0]->memory);
+
+    const uint8_t* l0 = ctx[0]->memory;
+    uint64_t* h0 = (uint64_t*) ctx[0]->state;
+
+    /* Loop registers are seeded from XORs of 64-bit words of the Keccak state. */
+    uint64_t al0 = h0[0] ^ h0[4];
+    uint64_t ah0 = h0[1] ^ h0[5];
+    __m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]);
+
+    uint64_t idx0 = h0[0] ^ h0[4];
+
+    /* 0x80000 iterations; "& 0x1FFFF0" yields a 16-byte aligned offset below 2 MiB. */
+    for (size_t i = 0; __builtin_expect(i < 0x80000, 1); i++) {
+        /* First half: AES round on the addressed block, store it XORed with bx0. */
+        __m128i cx;
+        cx = _mm_load_si128((__m128i *) &l0[idx0 & 0x1FFFF0]);
+        cx = _mm_aesenc_si128(cx, _mm_set_epi64x(ah0, al0));
+
+        _mm_store_si128((__m128i *) &l0[idx0 & 0x1FFFF0], _mm_xor_si128(bx0, cx));
+        idx0 = EXTRACT64(cx);
+        bx0 = cx;
+
+        /* Second half: 64x64 multiply with the newly addressed block, add, store, XOR. */
+        uint64_t hi, lo, cl, ch;
+        cl = ((uint64_t*) &l0[idx0 & 0x1FFFF0])[0];
+        ch = ((uint64_t*) &l0[idx0 & 0x1FFFF0])[1];
+        lo = _umul128(idx0, cl, &hi);
+
+        al0 += hi;
+        ah0 += lo;
+
+        ((uint64_t*)&l0[idx0 & 0x1FFFF0])[0] = al0;
+        ((uint64_t*)&l0[idx0 & 0x1FFFF0])[1] = ah0;
+
+        ah0 ^= ch;
+        al0 ^= cl;
+        idx0 = al0;
+    }
+
+    cn_implode_scratchpad((__m128i*) ctx[0]->memory, (__m128i*) ctx[0]->state);
+
+    keccakf(h0, 24);
+    /* The low two bits of the first state byte choose the final hash function. */
+    extra_hashes[ctx[0]->state[0] & 3](ctx[0]->state, 200, output);
+}
+
+
+/*
+ * Hashes one blob with the variant 1 main loop: the first-half store goes
+ * through cryptonight_monero_tweak and the second-half store XORs the high
+ * word with tweak1_2_0.
+ *
+ * input  - blob to hash; must be at least 43 bytes
+ * size   - blob length in bytes
+ * output - receives the digest; zero-filled (32 bytes) when size < 43
+ * ctx    - context array; only ctx[0] is used
+ */
+void cryptonight_av1_v1(const uint8_t *restrict input, size_t size, uint8_t *restrict output, struct cryptonight_ctx **restrict ctx)
+{
+    /* Inputs shorter than 43 bytes are rejected with an all-zero result. */
+    if (size < 43) {
+        memset(output, 0, 32);
+        return;
+    }
+
+    keccak(input, size, ctx[0]->state, 200);
+
+    /* NOTE(review): presumably declares tweak1_2_0 (used below); see cryptonight_monero.h. */
+    VARIANT1_INIT(0);
+
+    cn_explode_scratchpad((__m128i*) ctx[0]->state, (__m128i*) ctx[0]->memory);
+
+    const uint8_t* l0 = ctx[0]->memory;
+    uint64_t* h0 = (uint64_t*) ctx[0]->state;
+
+    uint64_t al0 = h0[0] ^ h0[4];
+    uint64_t ah0 = h0[1] ^ h0[5];
+    __m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]);
+
+    uint64_t idx0 = h0[0] ^ h0[4];
+
+    /* 0x80000 iterations; "& 0x1FFFF0" yields a 16-byte aligned offset below 2 MiB. */
+    for (size_t i = 0; __builtin_expect(i < 0x80000, 1); i++) {
+        __m128i cx;
+        cx = _mm_load_si128((__m128i *) &l0[idx0 & 0x1FFFF0]);
+        cx = _mm_aesenc_si128(cx, _mm_set_epi64x(ah0, al0));
+
+        /* Tweaked store of (bx0 ^ cx) instead of a plain 128-bit store. */
+        cryptonight_monero_tweak((uint64_t*)&l0[idx0 & 0x1FFFF0], _mm_xor_si128(bx0, cx));
+
+        idx0 = EXTRACT64(cx);
+        bx0 = cx;
+
+        uint64_t hi, lo, cl, ch;
+        cl = ((uint64_t*) &l0[idx0 & 0x1FFFF0])[0];
+        ch = ((uint64_t*) &l0[idx0 & 0x1FFFF0])[1];
+        lo = _umul128(idx0, cl, &hi);
+
+        al0 += hi;
+        ah0 += lo;
+
+        /* Only the stored copy of the high word is XORed with the tweak; ah0 itself is not. */
+        ((uint64_t*)&l0[idx0 & 0x1FFFF0])[0] = al0;
+        ((uint64_t*)&l0[idx0 & 0x1FFFF0])[1] = ah0 ^ tweak1_2_0;
+
+        ah0 ^= ch;
+        al0 ^= cl;
+        idx0 = al0;
+    }
+
+    cn_implode_scratchpad((__m128i*) ctx[0]->memory, (__m128i*) ctx[0]->state);
+
+    keccakf(h0, 24);
+    /* The low two bits of the first state byte choose the final hash function. */
+    extra_hashes[ctx[0]->state[0] & 3](ctx[0]->state, 200, output);
+}
+
+
+/*
+ * Hashes one blob with the variant 2 main loop, which adds the VARIANT2_*
+ * macro steps and a second history register (bx1) to the basic loop.
+ *
+ * input  - blob to hash
+ * size   - blob length in bytes
+ * output - receives the digest written by the selected extra_hashes[] entry
+ * ctx    - context array; only ctx[0] is used
+ */
+void cryptonight_av1_v2(const uint8_t *restrict input, size_t size, uint8_t *restrict output, struct cryptonight_ctx **restrict ctx)
+{
+    keccak(input, size, ctx[0]->state, 200);
+
+    cn_explode_scratchpad((__m128i*) ctx[0]->state, (__m128i*) ctx[0]->memory);
+
+    const uint8_t* l0 = ctx[0]->memory;
+    uint64_t* h0 = (uint64_t*) ctx[0]->state;
+
+    /* NOTE(review): macros from cryptonight_monero.h; presumably set up the per-hash variant 2 state. */
+    VARIANT2_INIT(0);
+    VARIANT2_SET_ROUNDING_MODE();
+
+    uint64_t al0 = h0[0] ^ h0[4];
+    uint64_t ah0 = h0[1] ^ h0[5];
+    __m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]);
+    __m128i bx1 = _mm_set_epi64x(h0[9] ^ h0[11], h0[8] ^ h0[10]);
+
+    uint64_t idx0 = al0;
+
+    /* 0x80000 iterations; "& 0x1FFFF0" yields a 16-byte aligned offset below 2 MiB. */
+    for (size_t i = 0; __builtin_expect(i < 0x80000, 1); i++) {
+        __m128i cx        = _mm_load_si128((__m128i *) &l0[idx0 & 0x1FFFF0]);
+        const __m128i ax0 = _mm_set_epi64x(ah0, al0);
+
+        cx = _mm_aesenc_si128(cx, ax0);
+
+        /* The shuffle step runs before the first-half store. */
+        VARIANT2_SHUFFLE(l0, idx0 & 0x1FFFF0, ax0, bx0, bx1);
+        _mm_store_si128((__m128i *) &l0[idx0 & 0x1FFFF0], _mm_xor_si128(bx0, cx));
+
+        idx0 = _mm_cvtsi128_si64(cx);
+
+        uint64_t hi, lo, cl, ch;
+        cl = ((uint64_t*) &l0[idx0 & 0x1FFFF0])[0];
+        ch = ((uint64_t*) &l0[idx0 & 0x1FFFF0])[1];
+
+        /* Integer-math step before the multiply, second shuffle step after it. */
+        VARIANT2_INTEGER_MATH(0, cl, cx);
+        lo = _umul128(idx0, cl, &hi);
+        VARIANT2_SHUFFLE2(l0, idx0 & 0x1FFFF0, ax0, bx0, bx1, hi, lo);
+
+        al0 += hi;
+        ah0 += lo;
+
+        ((uint64_t*)&l0[idx0 & 0x1FFFF0])[0] = al0;
+        ((uint64_t*)&l0[idx0 & 0x1FFFF0])[1] = ah0;
+
+        al0 ^= cl;
+        ah0 ^= ch;
+        idx0 = al0;
+
+        /* Age the history: bx1 keeps the previous bx0, bx0 takes this round's cx. */
+        bx1 = bx0;
+        bx0 = cx;
+    }
+
+    cn_implode_scratchpad((__m128i*) ctx[0]->memory, (__m128i*) ctx[0]->state);
+
+    keccakf(h0, 24);
+    /* The low two bits of the first state byte choose the final hash function. */
+    extra_hashes[ctx[0]->state[0] & 3](ctx[0]->state, 200, output);
+}
+
+
+#ifndef XMRIG_NO_ASM
+extern void cnv2_mainloop_ivybridge_asm(struct cryptonight_ctx *ctx);
+extern void cnv2_mainloop_ryzen_asm(struct cryptonight_ctx *ctx);
+extern void cnv2_mainloop_bulldozer_asm(struct cryptonight_ctx *ctx);
+extern void cnv2_double_mainloop_sandybridge_asm(struct cryptonight_ctx* ctx0, struct cryptonight_ctx* ctx1);
+
+
+/*
+ * Shared body of the single-hash assembly wrappers below.
+ *
+ * Runs Keccak over the input, explodes the state into the scratchpad, runs
+ * the supplied assembly main loop on the context, implodes the scratchpad,
+ * applies keccakf and finishes with the extra hash selected by the low two
+ * bits of the first state byte.
+ *
+ * mainloop - assembly main loop to run on `ctx`
+ */
+static inline void single_hash_asm_impl(const uint8_t *input, size_t size, uint8_t *output, struct cryptonight_ctx *ctx, void (*mainloop)(struct cryptonight_ctx *))
+{
+    keccak(input, size, ctx->state, 200);
+    cn_explode_scratchpad((__m128i*) ctx->state, (__m128i*) ctx->memory);
+
+    mainloop(ctx);
+
+    cn_implode_scratchpad((__m128i*) ctx->memory, (__m128i*) ctx->state);
+    keccakf((uint64_t*) ctx->state, 24);
+    extra_hashes[ctx->state[0] & 3](ctx->state, 200, output);
+}
+
+
+/* Single hash of `input` using ctx[0] and the cnv2_mainloop_ivybridge_asm main loop. */
+void cryptonight_single_hash_asm_intel(const uint8_t *restrict input, size_t size, uint8_t *restrict output, struct cryptonight_ctx **restrict ctx)
+{
+    single_hash_asm_impl(input, size, output, ctx[0], cnv2_mainloop_ivybridge_asm);
+}
+
+
+/* Single hash of `input` using ctx[0] and the cnv2_mainloop_ryzen_asm main loop. */
+void cryptonight_single_hash_asm_ryzen(const uint8_t *restrict input, size_t size, uint8_t *restrict output, struct cryptonight_ctx **restrict ctx)
+{
+    single_hash_asm_impl(input, size, output, ctx[0], cnv2_mainloop_ryzen_asm);
+}
+
+
+/* Single hash of `input` using ctx[0] and the cnv2_mainloop_bulldozer_asm main loop. */
+void cryptonight_single_hash_asm_bulldozer(const uint8_t *restrict input, size_t size, uint8_t *restrict output, struct cryptonight_ctx **restrict ctx)
+{
+    single_hash_asm_impl(input, size, output, ctx[0], cnv2_mainloop_bulldozer_asm);
+}
+
+
+/*
+ * Hashes two blobs in one call with the double-context assembly main loop.
+ *
+ * input  - two blobs of `size` bytes each, stored back to back
+ * size   - length of one blob in bytes
+ * output - receives the first digest at output and the second at output + 32
+ * ctx    - context array; ctx[0] serves the first blob, ctx[1] the second
+ */
+void cryptonight_double_hash_asm(const uint8_t *restrict input, size_t size, uint8_t *restrict output, struct cryptonight_ctx **restrict ctx)
+{
+    /* Each stage is applied to context 0 and then context 1 before moving on. */
+    for (size_t n = 0; n < 2; ++n) {
+        keccak(input + n * size, size, ctx[n]->state, 200);
+    }
+
+    for (size_t n = 0; n < 2; ++n) {
+        cn_explode_scratchpad((__m128i*) ctx[n]->state, (__m128i*) ctx[n]->memory);
+    }
+
+    cnv2_double_mainloop_sandybridge_asm(ctx[0], ctx[1]);
+
+    for (size_t n = 0; n < 2; ++n) {
+        cn_implode_scratchpad((__m128i*) ctx[n]->memory, (__m128i*) ctx[n]->state);
+    }
+
+    for (size_t n = 0; n < 2; ++n) {
+        keccakf((uint64_t*) ctx[n]->state, 24);
+    }
+
+    /* The low two bits of each state's first byte choose its final hash function. */
+    for (size_t n = 0; n < 2; ++n) {
+        extra_hashes[ctx[n]->state[0] & 3](ctx[n]->state, 200, output + 32 * n);
+    }
+}
+#endif
--- a/algo/cryptonight/cryptonight_av1_aesni.c
+++ b/algo/cryptonight/cryptonight_av1_aesni.c
@ -1,77 +0,0 @@
-/* XMRig
- * Copyright 2010      Jeff Garzik <jgarzik@pobox.com>
- * Copyright 2012-2014 pooler      <pooler@litecoinpool.org>
- * Copyright 2014      Lucas Jones <https://github.com/lucasjones>
- * Copyright 2014-2016 Wolf9466    <https://github.com/OhGodAPet>
- * Copyright 2016      Jay D Dee   <jayddee246@gmail.com>
- * Copyright 2017      fireice-uk  <https://github.com/fireice-uk>
- * Copyright 2016-2017 XMRig       <support@xmrig.com>
- *
- *
- *   This program is free software: you can redistribute it and/or modify
- *   it under the terms of the GNU General Public License as published by
- *   the Free Software Foundation, either version 3 of the License, or
- *   (at your option) any later version.
- *
- *   This program is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY; without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- *   GNU General Public License for more details.
- *
- *   You should have received a copy of the GNU General Public License
- *   along with this program. If not, see <http://www.gnu.org/licenses/>.
- */
-
-#include <x86intrin.h>
-#include <string.h>
-
-#include "cryptonight.h"
-#include "cryptonight_aesni.h"
-#include "crypto/c_keccak.h"
-
-
-void cryptonight_av1_aesni(const void *restrict input, size_t size, void *restrict output, struct cryptonight_ctx *restrict ctx)
-{
-    keccak((const uint8_t *) input, size, ctx->state0, 200);
-
-    cn_explode_scratchpad((__m128i*) ctx->state0, (__m128i*) ctx->memory);
-
-    const uint8_t* l0 = ctx->memory;
-    uint64_t* h0 = (uint64_t*) ctx->state0;
-
-    uint64_t al0 = h0[0] ^ h0[4];
-    uint64_t ah0 = h0[1] ^ h0[5];
-    __m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]);
-
-    uint64_t idx0 = h0[0] ^ h0[4];
-
-    for (size_t i = 0; __builtin_expect(i < 0x80000, 1); i++) {
-        __m128i cx;
-        cx = _mm_load_si128((__m128i *) &l0[idx0 & 0x1FFFF0]);
-        cx = _mm_aesenc_si128(cx, _mm_set_epi64x(ah0, al0));
-
-        _mm_store_si128((__m128i *) &l0[idx0 & 0x1FFFF0], _mm_xor_si128(bx0, cx));
-        idx0 = EXTRACT64(cx);
-        bx0 = cx;
-
-        uint64_t hi, lo, cl, ch;
-        cl = ((uint64_t*) &l0[idx0 & 0x1FFFF0])[0];
-        ch = ((uint64_t*) &l0[idx0 & 0x1FFFF0])[1];
-        lo = _umul128(idx0, cl, &hi);
-
-        al0 += hi;
-        ah0 += lo;
-
-        ((uint64_t*)&l0[idx0 & 0x1FFFF0])[0] = al0;
-        ((uint64_t*)&l0[idx0 & 0x1FFFF0])[1] = ah0;
-
-        ah0 ^= ch;
-        al0 ^= cl;
-        idx0 = al0;
-    }
-
-    cn_implode_scratchpad((__m128i*) ctx->memory, (__m128i*) ctx->state0);
-
-    keccakf(h0, 24);
-    extra_hashes[ctx->state0[0] & 3](ctx->state0, 200, output);
-}
--- a/algo/cryptonight/cryptonight_av2.c
+++ b/algo/cryptonight/cryptonight_av2.c
@ -0,0 +1,304 @@
+/* XMRig
+ * Copyright 2010      Jeff Garzik <jgarzik@pobox.com>
+ * Copyright 2012-2014 pooler      <pooler@litecoinpool.org>
+ * Copyright 2014      Lucas Jones <https://github.com/lucasjones>
+ * Copyright 2014-2016 Wolf9466    <https://github.com/OhGodAPet>
+ * Copyright 2016      Jay D Dee   <jayddee246@gmail.com>
+ * Copyright 2017      fireice-uk  <https://github.com/fireice-uk>
+ * Copyright 2017-2018 XMR-Stak    <https://github.com/fireice-uk>, <https://github.com/psychocrypt>
+ * Copyright 2018      Lee Clagett <https://github.com/vtnerd>
+ * Copyright 2016-2018 XMRig       <https://github.com/xmrig>, <support@xmrig.com>
+ *
+ *   This program is free software: you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation, either version 3 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <x86intrin.h>
+#include <string.h>
+
+#include "crypto/c_keccak.h"
+#include "cryptonight.h"
+#include "cryptonight_aesni.h"
+#include "cryptonight_monero.h"
+
+
+/*
+ * Hashes two blobs at once with the untweaked main loop, interleaving the
+ * work of two independent contexts.
+ *
+ * input  - two blobs of `size` bytes each, stored back to back
+ * size   - length of one blob in bytes
+ * output - receives the first digest at output and the second at output + 32
+ * ctx    - context array; ctx[0] serves the first blob, ctx[1] the second
+ */
+void cryptonight_av2_v0(const uint8_t *restrict input, size_t size, uint8_t *restrict output, struct cryptonight_ctx **restrict ctx)
+{
+    keccak(input,        size, ctx[0]->state, 200);
+    keccak(input + size, size, ctx[1]->state, 200);
+
+    const uint8_t* l0 = ctx[0]->memory;
+    const uint8_t* l1 = ctx[1]->memory;
+    uint64_t* h0 = (uint64_t*) ctx[0]->state;
+    uint64_t* h1 = (uint64_t*) ctx[1]->state;
+
+    cn_explode_scratchpad((__m128i*) h0, (__m128i*) l0);
+    cn_explode_scratchpad((__m128i*) h1, (__m128i*) l1);
+
+    /* Suffix 0/1 on every local refers to the first/second hash respectively. */
+    uint64_t al0 = h0[0] ^ h0[4];
+    uint64_t al1 = h1[0] ^ h1[4];
+    uint64_t ah0 = h0[1] ^ h0[5];
+    uint64_t ah1 = h1[1] ^ h1[5];
+
+    __m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]);
+    __m128i bx1 = _mm_set_epi64x(h1[3] ^ h1[7], h1[2] ^ h1[6]);
+
+    uint64_t idx0 = h0[0] ^ h0[4];
+    uint64_t idx1 = h1[0] ^ h1[4];
+
+    /* 0x80000 iterations; "& 0x1FFFF0" yields a 16-byte aligned offset below 2 MiB. */
+    for (size_t i = 0; __builtin_expect(i < 0x80000, 1); i++) {
+        /* First half for both hashes: AES round, then store XORed with bx. */
+        __m128i cx0 = _mm_load_si128((__m128i *) &l0[idx0 & 0x1FFFF0]);
+        __m128i cx1 = _mm_load_si128((__m128i *) &l1[idx1 & 0x1FFFF0]);
+
+        cx0 = _mm_aesenc_si128(cx0, _mm_set_epi64x(ah0, al0));
+        cx1 = _mm_aesenc_si128(cx1, _mm_set_epi64x(ah1, al1));
+
+        _mm_store_si128((__m128i *) &l0[idx0 & 0x1FFFF0], _mm_xor_si128(bx0, cx0));
+        _mm_store_si128((__m128i *) &l1[idx1 & 0x1FFFF0], _mm_xor_si128(bx1, cx1));
+
+        idx0 = EXTRACT64(cx0);
+        idx1 = EXTRACT64(cx1);
+
+        bx0 = cx0;
+        bx1 = cx1;
+
+        /* Second half, first hash: multiply, add, store, XOR. The temporaries are reused below. */
+        uint64_t hi, lo, cl, ch;
+        cl = ((uint64_t*) &l0[idx0 & 0x1FFFF0])[0];
+        ch = ((uint64_t*) &l0[idx0 & 0x1FFFF0])[1];
+        lo = _umul128(idx0, cl, &hi);
+
+        al0 += hi;
+        ah0 += lo;
+
+        ((uint64_t*) &l0[idx0 & 0x1FFFF0])[0] = al0;
+        ((uint64_t*) &l0[idx0 & 0x1FFFF0])[1] = ah0;
+
+        ah0 ^= ch;
+        al0 ^= cl;
+        idx0 = al0;
+
+        /* Second half, second hash. */
+        cl = ((uint64_t*) &l1[idx1 & 0x1FFFF0])[0];
+        ch = ((uint64_t*) &l1[idx1 & 0x1FFFF0])[1];
+        lo = _umul128(idx1, cl, &hi);
+
+        al1 += hi;
+        ah1 += lo;
+
+        ((uint64_t*) &l1[idx1 & 0x1FFFF0])[0] = al1;
+        ((uint64_t*) &l1[idx1 & 0x1FFFF0])[1] = ah1;
+
+        ah1 ^= ch;
+        al1 ^= cl;
+        idx1 = al1;
+    }
+
+    cn_implode_scratchpad((__m128i*) l0, (__m128i*) h0);
+    cn_implode_scratchpad((__m128i*) l1, (__m128i*) h1);
+
+    keccakf(h0, 24);
+    keccakf(h1, 24);
+
+    /* The low two bits of each state's first byte choose its final hash function. */
+    extra_hashes[ctx[0]->state[0] & 3](ctx[0]->state, 200, output);
+    extra_hashes[ctx[1]->state[0] & 3](ctx[1]->state, 200, output + 32);
+}
+
+
+/*
+ * Hashes two blobs at once with the variant 1 main loop: first-half stores go
+ * through cryptonight_monero_tweak and second-half stores XOR the high word
+ * with tweak1_2_0 / tweak1_2_1.
+ *
+ * input  - two blobs of `size` bytes each, stored back to back; size >= 43
+ * size   - length of one blob in bytes
+ * output - receives the first digest at output and the second at output + 32;
+ *          zero-filled (64 bytes) when size < 43
+ * ctx    - context array; ctx[0] serves the first blob, ctx[1] the second
+ */
+void cryptonight_av2_v1(const uint8_t *restrict input, size_t size, uint8_t *restrict output, struct cryptonight_ctx **restrict ctx)
+{
+    /* Inputs shorter than 43 bytes are rejected with all-zero results for both hashes. */
+    if (size < 43) {
+        memset(output, 0, 64);
+        return;
+    }
+
+    keccak(input,        size, ctx[0]->state, 200);
+    keccak(input + size, size, ctx[1]->state, 200);
+
+    /* NOTE(review): presumably declare tweak1_2_0 and tweak1_2_1 (used below); see cryptonight_monero.h. */
+    VARIANT1_INIT(0);
+    VARIANT1_INIT(1);
+
+    const uint8_t* l0 = ctx[0]->memory;
+    const uint8_t* l1 = ctx[1]->memory;
+    uint64_t* h0 = (uint64_t*) ctx[0]->state;
+    uint64_t* h1 = (uint64_t*) ctx[1]->state;
+
+    cn_explode_scratchpad((__m128i*) h0, (__m128i*) l0);
+    cn_explode_scratchpad((__m128i*) h1, (__m128i*) l1);
+
+    /* Suffix 0/1 on every local refers to the first/second hash respectively. */
+    uint64_t al0 = h0[0] ^ h0[4];
+    uint64_t al1 = h1[0] ^ h1[4];
+    uint64_t ah0 = h0[1] ^ h0[5];
+    uint64_t ah1 = h1[1] ^ h1[5];
+
+    __m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]);
+    __m128i bx1 = _mm_set_epi64x(h1[3] ^ h1[7], h1[2] ^ h1[6]);
+
+    uint64_t idx0 = h0[0] ^ h0[4];
+    uint64_t idx1 = h1[0] ^ h1[4];
+
+    /* 0x80000 iterations; "& 0x1FFFF0" yields a 16-byte aligned offset below 2 MiB. */
+    for (size_t i = 0; __builtin_expect(i < 0x80000, 1); i++) {
+        __m128i cx0 = _mm_load_si128((__m128i *) &l0[idx0 & 0x1FFFF0]);
+        __m128i cx1 = _mm_load_si128((__m128i *) &l1[idx1 & 0x1FFFF0]);
+
+        cx0 = _mm_aesenc_si128(cx0, _mm_set_epi64x(ah0, al0));
+        cx1 = _mm_aesenc_si128(cx1, _mm_set_epi64x(ah1, al1));
+
+        /* Tweaked stores of (bx ^ cx) instead of plain 128-bit stores. */
+        cryptonight_monero_tweak((uint64_t*)&l0[idx0 & 0x1FFFF0], _mm_xor_si128(bx0, cx0));
+        cryptonight_monero_tweak((uint64_t*)&l1[idx1 & 0x1FFFF0], _mm_xor_si128(bx1, cx1));
+
+        idx0 = EXTRACT64(cx0);
+        idx1 = EXTRACT64(cx1);
+
+        bx0 = cx0;
+        bx1 = cx1;
+
+        /* Second half, first hash. The temporaries are reused for the second hash below. */
+        uint64_t hi, lo, cl, ch;
+        cl = ((uint64_t*) &l0[idx0 & 0x1FFFF0])[0];
+        ch = ((uint64_t*) &l0[idx0 & 0x1FFFF0])[1];
+        lo = _umul128(idx0, cl, &hi);
+
+        al0 += hi;
+        ah0 += lo;
+
+        /* Only the stored copy of the high word is XORed with the tweak. */
+        ((uint64_t*) &l0[idx0 & 0x1FFFF0])[0] = al0;
+        ((uint64_t*) &l0[idx0 & 0x1FFFF0])[1] = ah0 ^ tweak1_2_0;
+
+        ah0 ^= ch;
+        al0 ^= cl;
+        idx0 = al0;
+
+        /* Second half, second hash. */
+        cl = ((uint64_t*) &l1[idx1 & 0x1FFFF0])[0];
+        ch = ((uint64_t*) &l1[idx1 & 0x1FFFF0])[1];
+        lo = _umul128(idx1, cl, &hi);
+
+        al1 += hi;
+        ah1 += lo;
+
+        ((uint64_t*) &l1[idx1 & 0x1FFFF0])[0] = al1;
+        ((uint64_t*) &l1[idx1 & 0x1FFFF0])[1] = ah1 ^ tweak1_2_1;
+
+        ah1 ^= ch;
+        al1 ^= cl;
+        idx1 = al1;
+    }
+
+    cn_implode_scratchpad((__m128i*) l0, (__m128i*) h0);
+    cn_implode_scratchpad((__m128i*) l1, (__m128i*) h1);
+
+    keccakf(h0, 24);
+    keccakf(h1, 24);
+
+    /* The low two bits of each state's first byte choose its final hash function. */
+    extra_hashes[ctx[0]->state[0] & 3](ctx[0]->state, 200, output);
+    extra_hashes[ctx[1]->state[0] & 3](ctx[1]->state, 200, output + 32);
+}
+
+
+/*
+ * Hashes two blobs at once with the variant 2 main loop, which adds the
+ * VARIANT2_* macro steps and a second history register per hash.
+ *
+ * input  - two blobs of `size` bytes each, stored back to back
+ * size   - length of one blob in bytes
+ * output - receives the first digest at output and the second at output + 32
+ * ctx    - context array; ctx[0] serves the first blob, ctx[1] the second
+ */
+void cryptonight_av2_v2(const uint8_t *restrict input, size_t size, uint8_t *restrict output, struct cryptonight_ctx **restrict ctx)
+{
+    keccak(input,        size, ctx[0]->state, 200);
+    keccak(input + size, size, ctx[1]->state, 200);
+
+    const uint8_t* l0 = ctx[0]->memory;
+    const uint8_t* l1 = ctx[1]->memory;
+    uint64_t* h0 = (uint64_t*) ctx[0]->state;
+    uint64_t* h1 = (uint64_t*) ctx[1]->state;
+
+    /* NOTE(review): macros from cryptonight_monero.h; presumably set up the per-hash variant 2 state. */
+    VARIANT2_INIT(0);
+    VARIANT2_INIT(1);
+    VARIANT2_SET_ROUNDING_MODE();
+
+    cn_explode_scratchpad((__m128i*) h0, (__m128i*) l0);
+    cn_explode_scratchpad((__m128i*) h1, (__m128i*) l1);
+
+    uint64_t al0 = h0[0] ^ h0[4];
+    uint64_t al1 = h1[0] ^ h1[4];
+    uint64_t ah0 = h0[1] ^ h0[5];
+    uint64_t ah1 = h1[1] ^ h1[5];
+
+    /* bxN0 / bxN1 are the newer / older history register of hash N. */
+    __m128i bx00 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]);
+    __m128i bx01 = _mm_set_epi64x(h0[9] ^ h0[11], h0[8] ^ h0[10]);
+    __m128i bx10 = _mm_set_epi64x(h1[3] ^ h1[7], h1[2] ^ h1[6]);
+    __m128i bx11 = _mm_set_epi64x(h1[9] ^ h1[11], h1[8] ^ h1[10]);
+
+    uint64_t idx0 = al0;
+    uint64_t idx1 = al1;
+
+    /* 0x80000 iterations; "& 0x1FFFF0" yields a 16-byte aligned offset below 2 MiB. */
+    for (size_t i = 0; __builtin_expect(i < 0x80000, 1); i++) {
+        __m128i cx0       = _mm_load_si128((__m128i *) &l0[idx0 & 0x1FFFF0]);
+        __m128i cx1       = _mm_load_si128((__m128i *) &l1[idx1 & 0x1FFFF0]);
+
+        const __m128i ax0 = _mm_set_epi64x(ah0, al0);
+        const __m128i ax1 = _mm_set_epi64x(ah1, al1);
+
+        cx0 = _mm_aesenc_si128(cx0, ax0);
+        cx1 = _mm_aesenc_si128(cx1, ax1);
+
+        /* For each hash the shuffle step runs before the first-half store. */
+        VARIANT2_SHUFFLE(l0, idx0 & 0x1FFFF0, ax0, bx00, bx01);
+        _mm_store_si128((__m128i *) &l0[idx0 & 0x1FFFF0], _mm_xor_si128(bx00, cx0));
+
+        VARIANT2_SHUFFLE(l1, idx1 & 0x1FFFF0, ax1, bx10, bx11);
+        _mm_store_si128((__m128i *) &l1[idx1 & 0x1FFFF0], _mm_xor_si128(bx10, cx1));
+
+        idx0 = _mm_cvtsi128_si64(cx0);
+        idx1 = _mm_cvtsi128_si64(cx1);
+
+        /* Second half, first hash. The temporaries are reused for the second hash below. */
+        uint64_t hi, lo, cl, ch;
+        cl = ((uint64_t*) &l0[idx0 & 0x1FFFF0])[0];
+        ch = ((uint64_t*) &l0[idx0 & 0x1FFFF0])[1];
+
+        VARIANT2_INTEGER_MATH(0, cl, cx0);
+        lo = _umul128(idx0, cl, &hi);
+        VARIANT2_SHUFFLE2(l0, idx0 & 0x1FFFF0, ax0, bx00, bx01, hi, lo);
+
+        al0 += hi;
+        ah0 += lo;
+
+        ((uint64_t*)&l0[idx0 & 0x1FFFF0])[0] = al0;
+        ((uint64_t*)&l0[idx0 & 0x1FFFF0])[1] = ah0;
+
+        al0 ^= cl;
+        ah0 ^= ch;
+        idx0 = al0;
+
+        /* Second half, second hash. */
+        cl = ((uint64_t*) &l1[idx1 & 0x1FFFF0])[0];
+        ch = ((uint64_t*) &l1[idx1 & 0x1FFFF0])[1];
+
+        VARIANT2_INTEGER_MATH(1, cl, cx1);
+        lo = _umul128(idx1, cl, &hi);
+        VARIANT2_SHUFFLE2(l1, idx1 & 0x1FFFF0, ax1, bx10, bx11, hi, lo);
+
+        al1 += hi;
+        ah1 += lo;
+
+        ((uint64_t*)&l1[idx1 & 0x1FFFF0])[0] = al1;
+        ((uint64_t*)&l1[idx1 & 0x1FFFF0])[1] = ah1;
+
+        al1 ^= cl;
+        ah1 ^= ch;
+        idx1 = al1;
+
+        /* Age the history of both hashes: older <- newer, newer <- this round's cx. */
+        bx01 = bx00;
+        bx11 = bx10;
+
+        bx00 = cx0;
+        bx10 = cx1;
+    }
+
+    cn_implode_scratchpad((__m128i*) l0, (__m128i*) h0);
+    cn_implode_scratchpad((__m128i*) l1, (__m128i*) h1);
+
+    keccakf(h0, 24);
+    keccakf(h1, 24);
+
+    /* The low two bits of each state's first byte choose its final hash function. */
+    extra_hashes[ctx[0]->state[0] & 3](ctx[0]->state, 200, output);
+    extra_hashes[ctx[1]->state[0] & 3](ctx[1]->state, 200, output + 32);
+}
--- a/algo/cryptonight/cryptonight_av2_aesni_double.c
+++ b/algo/cryptonight/cryptonight_av2_aesni_double.c
@ -1,111 +0,0 @@
-/* XMRig
- * Copyright 2010      Jeff Garzik <jgarzik@pobox.com>
- * Copyright 2012-2014 pooler      <pooler@litecoinpool.org>
- * Copyright 2014      Lucas Jones <https://github.com/lucasjones>
- * Copyright 2014-2016 Wolf9466    <https://github.com/OhGodAPet>
- * Copyright 2016      Jay D Dee   <jayddee246@gmail.com>
- * Copyright 2017      fireice-uk  <https://github.com/fireice-uk>
- * Copyright 2016-2017 XMRig       <support@xmrig.com>
- *
- *
- *   This program is free software: you can redistribute it and/or modify
- *   it under the terms of the GNU General Public License as published by
- *   the Free Software Foundation, either version 3 of the License, or
- *   (at your option) any later version.
- *
- *   This program is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY; without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- *   GNU General Public License for more details.
- *
- *   You should have received a copy of the GNU General Public License
- *   along with this program. If not, see <http://www.gnu.org/licenses/>.
- */
-
-#include <x86intrin.h>
-#include <string.h>
-
-#include "cryptonight.h"
-#include "cryptonight_aesni.h"
-#include "crypto/c_keccak.h"
-
-
-void cryptonight_av2_aesni_double(const void *restrict input, size_t size, void *restrict output, struct cryptonight_ctx *restrict ctx)
-{
-    keccak((const uint8_t *) input,        size, ctx->state0, 200);
-    keccak((const uint8_t *) input + size, size, ctx->state1, 200);
-
-    const uint8_t* l0 = ctx->memory;
-    const uint8_t* l1 = ctx->memory + MEMORY;
-    uint64_t* h0 = (uint64_t*) ctx->state0;
-    uint64_t* h1 = (uint64_t*) ctx->state1;
-
-    cn_explode_scratchpad((__m128i*) h0, (__m128i*) l0);
-    cn_explode_scratchpad((__m128i*) h1, (__m128i*) l1);
-
-    uint64_t al0 = h0[0] ^ h0[4];
-    uint64_t al1 = h1[0] ^ h1[4];
-    uint64_t ah0 = h0[1] ^ h0[5];
-    uint64_t ah1 = h1[1] ^ h1[5];
-
-    __m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]);
-    __m128i bx1 = _mm_set_epi64x(h1[3] ^ h1[7], h1[2] ^ h1[6]);
-
-    uint64_t idx0 = h0[0] ^ h0[4];
-    uint64_t idx1 = h1[0] ^ h1[4];
-
-    for (size_t i = 0; __builtin_expect(i < 0x80000, 1); i++) {
-        __m128i cx0 = _mm_load_si128((__m128i *) &l0[idx0 & 0x1FFFF0]);
-        __m128i cx1 = _mm_load_si128((__m128i *) &l1[idx1 & 0x1FFFF0]);
-
-        cx0 = _mm_aesenc_si128(cx0, _mm_set_epi64x(ah0, al0));
-        cx1 = _mm_aesenc_si128(cx1, _mm_set_epi64x(ah1, al1));
-
-        _mm_store_si128((__m128i *) &l0[idx0 & 0x1FFFF0], _mm_xor_si128(bx0, cx0));
-        _mm_store_si128((__m128i *) &l1[idx1 & 0x1FFFF0], _mm_xor_si128(bx1, cx1));
-
-        idx0 = EXTRACT64(cx0);
-        idx1 = EXTRACT64(cx1);
-
-        bx0 = cx0;
-        bx1 = cx1;
-
-        uint64_t hi, lo, cl, ch;
-        cl = ((uint64_t*) &l0[idx0 & 0x1FFFF0])[0];
-        ch = ((uint64_t*) &l0[idx0 & 0x1FFFF0])[1];
-        lo = _umul128(idx0, cl, &hi);
-
-        al0 += hi;
-        ah0 += lo;
-
-        ((uint64_t*) &l0[idx0 & 0x1FFFF0])[0] = al0;
-        ((uint64_t*) &l0[idx0 & 0x1FFFF0])[1] = ah0;
-
-        ah0 ^= ch;
-        al0 ^= cl;
-        idx0 = al0;
-
-        cl = ((uint64_t*) &l1[idx1 & 0x1FFFF0])[0];
-        ch = ((uint64_t*) &l1[idx1 & 0x1FFFF0])[1];
-        lo = _umul128(idx1, cl, &hi);
-
-        al1 += hi;
-        ah1 += lo;
-
-        ((uint64_t*) &l1[idx1 & 0x1FFFF0])[0] = al1;
-        ((uint64_t*) &l1[idx1 & 0x1FFFF0])[1] = ah1;
-
-        ah1 ^= ch;
-        al1 ^= cl;
-        idx1 = al1;
-    }
-
-    cn_implode_scratchpad((__m128i*) l0, (__m128i*) h0);
-    cn_implode_scratchpad((__m128i*) l1, (__m128i*) h1);
-
-    keccakf(h0, 24);
-    keccakf(h1, 24);
-
-    extra_hashes[ctx->state0[0] & 3](ctx->state0, 200, output);
-    extra_hashes[ctx->state1[0] & 3](ctx->state1, 200, (char*) output + 32);
-}
--- a/algo/cryptonight/cryptonight_av3.c
+++ b/algo/cryptonight/cryptonight_av3.c
@ -0,0 +1,193 @@
+/* XMRig
+ * Copyright 2010      Jeff Garzik <jgarzik@pobox.com>
+ * Copyright 2012-2014 pooler      <pooler@litecoinpool.org>
+ * Copyright 2014      Lucas Jones <https://github.com/lucasjones>
+ * Copyright 2014-2016 Wolf9466    <https://github.com/OhGodAPet>
+ * Copyright 2016      Jay D Dee   <jayddee246@gmail.com>
+ * Copyright 2017      fireice-uk  <https://github.com/fireice-uk>
+ * Copyright 2017-2018 XMR-Stak    <https://github.com/fireice-uk>, <https://github.com/psychocrypt>
+ * Copyright 2018      Lee Clagett <https://github.com/vtnerd>
+ * Copyright 2016-2018 XMRig       <https://github.com/xmrig>, <support@xmrig.com>
+ *
+ *   This program is free software: you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation, either version 3 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <x86intrin.h>
+#include <string.h>
+
+#include "crypto/c_keccak.h"
+#include "cryptonight.h"
+#include "cryptonight_monero.h"
+#include "cryptonight_softaes.h"
+
+
+/*
+ * Single-hash CryptoNight kernel using the software AES round (soft_aesenc),
+ * without any of the variant 1 / variant 2 tweaks.
+ *
+ * input, size : blob handed to keccak().
+ * output      : receives the digest produced by the selected extra_hashes[] entry
+ *               (presumably 32 bytes - the double-hash kernels place the second
+ *               digest at output + 32; TODO confirm).
+ * ctx         : only ctx[0] is used; its state and memory are overwritten.
+ *
+ * NOTE(review): l0 is declared const but the loop writes through it via casts.
+ */
+void cryptonight_av3_v0(const uint8_t *restrict input, size_t size, uint8_t *restrict output, struct cryptonight_ctx **restrict ctx)
+{
+    keccak(input, size, ctx[0]->state, 200);
+
+    cn_explode_scratchpad((__m128i*) ctx[0]->state, (__m128i*) ctx[0]->memory);
+
+    const uint8_t* l0 = ctx[0]->memory;
+    uint64_t* h0 = (uint64_t*) ctx[0]->state;
+
+    /* Seed the loop registers from 64-bit state words: a = words 0..1 ^ 4..5, b = words 2..3 ^ 6..7. */
+    uint64_t al0 = h0[0] ^ h0[4];
+    uint64_t ah0 = h0[1] ^ h0[5];
+    __m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]);
+
+    /* First scratchpad index is the same value as al0. */
+    uint64_t idx0 = h0[0] ^ h0[4];
+
+    /* 0x80000 iterations; every access masks the index with 0x1FFFF0, i.e. a
+     * 16-byte-aligned offset below 0x200000. */
+    for (size_t i = 0; __builtin_expect(i < 0x80000, 1); i++) {
+        __m128i cx;
+        cx = _mm_load_si128((__m128i *) &l0[idx0 & 0x1FFFF0]);
+        cx = soft_aesenc(cx, _mm_set_epi64x(ah0, al0));
+
+        /* Write (b ^ AES result) back to the slot just read; the AES result
+         * becomes the next b and supplies the second index of this iteration
+         * (EXTRACT64 is defined elsewhere - presumably the low 64 bits). */
+        _mm_store_si128((__m128i *) &l0[idx0 & 0x1FFFF0], _mm_xor_si128(bx0, cx));
+        idx0 = EXTRACT64(cx);
+        bx0 = cx;
+
+        /* Read the second slot and multiply its first word by the index
+         * (presumably a 64x64 -> 128-bit product: low half returned, high half in hi). */
+        uint64_t hi, lo, cl, ch;
+        cl = ((uint64_t*) &l0[idx0 & 0x1FFFF0])[0];
+        ch = ((uint64_t*) &l0[idx0 & 0x1FFFF0])[1];
+        lo = _umul128(idx0, cl, &hi);
+
+        /* Note the cross-over: the high half goes to al0, the low half to ah0. */
+        al0 += hi;
+        ah0 += lo;
+
+        /* Store the sums into the second slot ... */
+        ((uint64_t*)&l0[idx0 & 0x1FFFF0])[0] = al0;
+        ((uint64_t*)&l0[idx0 & 0x1FFFF0])[1] = ah0;
+
+        /* ... then fold the slot's previous contents into a; al0 is the next index. */
+        ah0 ^= ch;
+        al0 ^= cl;
+        idx0 = al0;
+    }
+
+    cn_implode_scratchpad((__m128i*) ctx[0]->memory, (__m128i*) ctx[0]->state);
+
+    keccakf(h0, 24);
+    /* The low two bits of the first state element pick the finalizer from extra_hashes[]. */
+    extra_hashes[ctx[0]->state[0] & 3](ctx[0]->state, 200, output);
+}
+
+
+/*
+ * Single-hash CryptoNight kernel, software AES round, variant 1 tweaks.
+ *
+ * Differences from the untweaked loop visible in this function: the first
+ * store goes through cryptonight_monero_tweak() (defined elsewhere), and the
+ * second word written to the second slot is XORed with tweak1_2_0.
+ *
+ * input, size : blob handed to keccak(); must be at least 43 bytes.
+ * output      : zero-filled (32 bytes) when the input is too short, otherwise
+ *               written by the selected extra_hashes[] entry.
+ * ctx         : only ctx[0] is used; its state and memory are overwritten.
+ *
+ * NOTE(review): l0 is declared const but the loop writes through it via casts.
+ */
+void cryptonight_av3_v1(const uint8_t *restrict input, size_t size, uint8_t *restrict output, struct cryptonight_ctx **restrict ctx)
+{
+    /* VARIANT1_INIT reads 8 bytes starting at input + 35, so 43 bytes is the minimum. */
+    if (size < 43) {
+        memset(output, 0, 32);
+        return;
+    }
+
+    keccak(input, size, ctx[0]->state, 200);
+
+    /* Declares tweak1_2_0 = (8 input bytes at offset 35) ^ (64-bit state word 24). */
+    VARIANT1_INIT(0);
+
+    cn_explode_scratchpad((__m128i*) ctx[0]->state, (__m128i*) ctx[0]->memory);
+
+    const uint8_t* l0 = ctx[0]->memory;
+    uint64_t* h0 = (uint64_t*) ctx[0]->state;
+
+    /* Seed the loop registers from 64-bit state words: a = words 0..1 ^ 4..5, b = words 2..3 ^ 6..7. */
+    uint64_t al0 = h0[0] ^ h0[4];
+    uint64_t ah0 = h0[1] ^ h0[5];
+    __m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]);
+
+    /* First scratchpad index is the same value as al0. */
+    uint64_t idx0 = h0[0] ^ h0[4];
+
+    /* 0x80000 iterations; every access masks the index with 0x1FFFF0, i.e. a
+     * 16-byte-aligned offset below 0x200000. */
+    for (size_t i = 0; __builtin_expect(i < 0x80000, 1); i++) {
+        __m128i cx;
+        cx = _mm_load_si128((__m128i *) &l0[idx0 & 0x1FFFF0]);
+        cx = soft_aesenc(cx, _mm_set_epi64x(ah0, al0));
+
+        /* Hands (b ^ AES result) and the slot address to the tweak helper
+         * (presumably it stores a modified value there - see its definition). */
+        cryptonight_monero_tweak((uint64_t*)&l0[idx0 & 0x1FFFF0], _mm_xor_si128(bx0, cx));
+
+        /* The AES result becomes the next b and supplies the second index. */
+        idx0 = EXTRACT64(cx);
+        bx0 = cx;
+
+        uint64_t hi, lo, cl, ch;
+        cl = ((uint64_t*) &l0[idx0 & 0x1FFFF0])[0];
+        ch = ((uint64_t*) &l0[idx0 & 0x1FFFF0])[1];
+        lo = _umul128(idx0, cl, &hi);
+
+        /* Note the cross-over: the high half goes to al0, the low half to ah0. */
+        al0 += hi;
+        ah0 += lo;
+
+        /* Only the stored copy of the second word carries the tweak; ah0 itself is not modified here. */
+        ((uint64_t*)&l0[idx0 & 0x1FFFF0])[0] = al0;
+        ((uint64_t*)&l0[idx0 & 0x1FFFF0])[1] = ah0 ^ tweak1_2_0;
+
+        /* Fold the slot's previous contents into a; al0 is the next index. */
+        ah0 ^= ch;
+        al0 ^= cl;
+        idx0 = al0;
+    }
+
+    cn_implode_scratchpad((__m128i*) ctx[0]->memory, (__m128i*) ctx[0]->state);
+
+    keccakf(h0, 24);
+    /* The low two bits of the first state element pick the finalizer from extra_hashes[]. */
+    extra_hashes[ctx[0]->state[0] & 3](ctx[0]->state, 200, output);
+}
+
+
+/*
+ * Single-hash CryptoNight kernel, software AES round, variant 2 tweaks
+ * (chunk shuffle, integer division / square-root step, two "b" registers).
+ *
+ * input, size : blob handed to keccak().
+ * output      : written by the selected extra_hashes[] entry.
+ * ctx         : only ctx[0] is used; its state and memory are overwritten.
+ *
+ * Side effect: VARIANT2_SET_ROUNDING_MODE() changes the floating-point rounding
+ * mode and this function does not restore it before returning.
+ *
+ * NOTE(review): l0 is declared const but the loop writes through it via casts.
+ */
+void cryptonight_av3_v2(const uint8_t *restrict input, size_t size, uint8_t *restrict output, struct cryptonight_ctx **restrict ctx)
+{
+    keccak(input, size, ctx[0]->state, 200);
+
+    cn_explode_scratchpad((__m128i*) ctx[0]->state, (__m128i*) ctx[0]->memory);
+
+    const uint8_t* l0 = ctx[0]->memory;
+    uint64_t* h0 = (uint64_t*) ctx[0]->state;
+
+    /* Needs h0 in scope: declares division_result_xmm_0 / sqrt_result_xmm_0 from h0[12] / h0[13]. */
+    VARIANT2_INIT(0);
+    VARIANT2_SET_ROUNDING_MODE();
+
+    /* a = words 0..1 ^ 4..5, b0 = words 2..3 ^ 6..7, b1 = words 8..9 ^ 10..11. */
+    uint64_t al0 = h0[0] ^ h0[4];
+    uint64_t ah0 = h0[1] ^ h0[5];
+    __m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]);
+    __m128i bx1 = _mm_set_epi64x(h0[9] ^ h0[11], h0[8] ^ h0[10]);
+
+    uint64_t idx0 = al0;
+
+    /* 0x80000 iterations; every access masks the index with 0x1FFFF0, i.e. a
+     * 16-byte-aligned offset below 0x200000. */
+    for (size_t i = 0; __builtin_expect(i < 0x80000, 1); i++) {
+        __m128i cx        = _mm_load_si128((__m128i *) &l0[idx0 & 0x1FFFF0]);
+        const __m128i ax0 = _mm_set_epi64x(ah0, al0);
+
+        cx = soft_aesenc(cx, ax0);
+
+        /* Rotate/add the three neighbouring 16-byte chunks (offset ^ 0x10/0x20/0x30)
+         * before writing (b0 ^ AES result) back to the slot that was read. */
+        VARIANT2_SHUFFLE(l0, idx0 & 0x1FFFF0, ax0, bx0, bx1);
+        _mm_store_si128((__m128i *) &l0[idx0 & 0x1FFFF0], _mm_xor_si128(bx0, cx));
+
+        /* Low 64 bits of the AES result select the second slot. */
+        idx0 = _mm_cvtsi128_si64(cx);
+
+        uint64_t hi, lo, cl, ch;
+        cl = ((uint64_t*) &l0[idx0 & 0x1FFFF0])[0];
+        ch = ((uint64_t*) &l0[idx0 & 0x1FFFF0])[1];
+
+        /* Modifies cl and refreshes the division / sqrt results; must precede the multiply. */
+        VARIANT2_INTEGER_MATH(0, cl, cx);
+        lo = _umul128(idx0, cl, &hi);
+        /* Second shuffle around the second slot; it also XORs chunk data into hi and lo. */
+        VARIANT2_SHUFFLE2(l0, idx0 & 0x1FFFF0, ax0, bx0, bx1, hi, lo);
+
+        /* Note the cross-over: the high half goes to al0, the low half to ah0. */
+        al0 += hi;
+        ah0 += lo;
+
+        ((uint64_t*)&l0[idx0 & 0x1FFFF0])[0] = al0;
+        ((uint64_t*)&l0[idx0 & 0x1FFFF0])[1] = ah0;
+
+        /* Fold the (already modified) cl and the original ch into a; al0 is the next index. */
+        al0 ^= cl;
+        ah0 ^= ch;
+        idx0 = al0;
+
+        /* Age the b registers: b1 <- b0, b0 <- this iteration's AES result. */
+        bx1 = bx0;
+        bx0 = cx;
+    }
+
+    cn_implode_scratchpad((__m128i*) ctx[0]->memory, (__m128i*) ctx[0]->state);
+
+    keccakf(h0, 24);
+    /* The low two bits of the first state element pick the finalizer from extra_hashes[]. */
+    extra_hashes[ctx[0]->state[0] & 3](ctx[0]->state, 200, output);
+}
--- a/algo/cryptonight/cryptonight_av3_softaes.c
+++ b/algo/cryptonight/cryptonight_av3_softaes.c
@ -1,77 +0,0 @@
-/* XMRig
- * Copyright 2010      Jeff Garzik <jgarzik@pobox.com>
- * Copyright 2012-2014 pooler      <pooler@litecoinpool.org>
- * Copyright 2014      Lucas Jones <https://github.com/lucasjones>
- * Copyright 2014-2016 Wolf9466    <https://github.com/OhGodAPet>
- * Copyright 2016      Jay D Dee   <jayddee246@gmail.com>
- * Copyright 2017      fireice-uk  <https://github.com/fireice-uk>
- * Copyright 2016-2017 XMRig       <support@xmrig.com>
- *
- *
- *   This program is free software: you can redistribute it and/or modify
- *   it under the terms of the GNU General Public License as published by
- *   the Free Software Foundation, either version 3 of the License, or
- *   (at your option) any later version.
- *
- *   This program is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY; without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- *   GNU General Public License for more details.
- *
- *   You should have received a copy of the GNU General Public License
- *   along with this program. If not, see <http://www.gnu.org/licenses/>.
- */
-
-#include <x86intrin.h>
-#include <string.h>
-
-#include "cryptonight.h"
-#include "cryptonight_softaes.h"
-#include "crypto/c_keccak.h"
-
-
-void cryptonight_av3_softaes(const void *restrict input, size_t size, void *restrict output, struct cryptonight_ctx *restrict ctx)
-{
-    keccak((const uint8_t *) input, size, ctx->state0, 200);
-
-    cn_explode_scratchpad((__m128i*) ctx->state0, (__m128i*) ctx->memory);
-
-    const uint8_t* l0 = ctx->memory;
-    uint64_t* h0 = (uint64_t*) ctx->state0;
-
-    uint64_t al0 = h0[0] ^ h0[4];
-    uint64_t ah0 = h0[1] ^ h0[5];
-    __m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]);
-
-    uint64_t idx0 = h0[0] ^ h0[4];
-
-    for (size_t i = 0; __builtin_expect(i < 0x80000, 1); i++) {
-        __m128i cx;
-        cx = _mm_load_si128((__m128i *)&l0[idx0 & 0x1FFFF0]);
-        cx = soft_aesenc(cx, _mm_set_epi64x(ah0, al0));
-
-        _mm_store_si128((__m128i *)&l0[idx0 & 0x1FFFF0], _mm_xor_si128(bx0, cx));
-        idx0 = EXTRACT64(cx);
-        bx0 = cx;
-
-        uint64_t hi, lo, cl, ch;
-        cl = ((uint64_t*)&l0[idx0 & 0x1FFFF0])[0];
-        ch = ((uint64_t*)&l0[idx0 & 0x1FFFF0])[1];
-        lo = _umul128(idx0, cl, &hi);
-
-        al0 += hi;
-        ah0 += lo;
-
-        ((uint64_t*)&l0[idx0 & 0x1FFFF0])[0] = al0;
-        ((uint64_t*)&l0[idx0 & 0x1FFFF0])[1] = ah0;
-
-        ah0 ^= ch;
-        al0 ^= cl;
-        idx0 = al0;
-    }
-
-    cn_implode_scratchpad((__m128i*) ctx->memory, (__m128i*) ctx->state0);
-
-    keccakf(h0, 24);
-    extra_hashes[ctx->state0[0] & 3](ctx->state0, 200, output);
-}
--- a/algo/cryptonight/cryptonight_av4.c
+++ b/algo/cryptonight/cryptonight_av4.c
@ -0,0 +1,304 @@
+/* XMRig
+ * Copyright 2010      Jeff Garzik <jgarzik@pobox.com>
+ * Copyright 2012-2014 pooler      <pooler@litecoinpool.org>
+ * Copyright 2014      Lucas Jones <https://github.com/lucasjones>
+ * Copyright 2014-2016 Wolf9466    <https://github.com/OhGodAPet>
+ * Copyright 2016      Jay D Dee   <jayddee246@gmail.com>
+ * Copyright 2017      fireice-uk  <https://github.com/fireice-uk>
+ * Copyright 2017-2018 XMR-Stak    <https://github.com/fireice-uk>, <https://github.com/psychocrypt>
+ * Copyright 2018      Lee Clagett <https://github.com/vtnerd>
+ * Copyright 2016-2018 XMRig       <https://github.com/xmrig>, <support@xmrig.com>
+ *
+ *   This program is free software: you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation, either version 3 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <x86intrin.h>
+#include <string.h>
+
+#include "crypto/c_keccak.h"
+#include "cryptonight.h"
+#include "cryptonight_monero.h"
+#include "cryptonight_softaes.h"
+
+
+/*
+ * Double-hash CryptoNight kernel using the software AES round (soft_aesenc),
+ * without variant tweaks. Two independent hashes are computed in one pass,
+ * with their loop bodies interleaved.
+ *
+ * input, size : two blobs of `size` bytes each, laid out back to back
+ *               (the second one is read from input + size).
+ * output      : first digest at output, second at output + 32.
+ * ctx         : ctx[0] and ctx[1] supply state and memory for lane 0 and lane 1.
+ *
+ * NOTE(review): l0 / l1 are declared const but are written through casts.
+ */
+void cryptonight_av4_v0(const uint8_t *restrict input, size_t size, uint8_t *restrict output, struct cryptonight_ctx **restrict ctx)
+{
+    keccak(input,        size, ctx[0]->state, 200);
+    keccak(input + size, size, ctx[1]->state, 200);
+
+    const uint8_t* l0 = ctx[0]->memory;
+    const uint8_t* l1 = ctx[1]->memory;
+    uint64_t* h0 = (uint64_t*) ctx[0]->state;
+    uint64_t* h1 = (uint64_t*) ctx[1]->state;
+
+    cn_explode_scratchpad((__m128i*) h0, (__m128i*) l0);
+    cn_explode_scratchpad((__m128i*) h1, (__m128i*) l1);
+
+    /* Per lane: a = words 0..1 ^ 4..5, b = words 2..3 ^ 6..7 of that lane's state. */
+    uint64_t al0 = h0[0] ^ h0[4];
+    uint64_t al1 = h1[0] ^ h1[4];
+    uint64_t ah0 = h0[1] ^ h0[5];
+    uint64_t ah1 = h1[1] ^ h1[5];
+
+    __m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]);
+    __m128i bx1 = _mm_set_epi64x(h1[3] ^ h1[7], h1[2] ^ h1[6]);
+
+    /* First indices equal al0 / al1. */
+    uint64_t idx0 = h0[0] ^ h0[4];
+    uint64_t idx1 = h1[0] ^ h1[4];
+
+    /* 0x80000 iterations; every access masks the index with 0x1FFFF0, i.e. a
+     * 16-byte-aligned offset below 0x200000. */
+    for (size_t i = 0; __builtin_expect(i < 0x80000, 1); i++) {
+        __m128i cx0 = _mm_load_si128((__m128i *) &l0[idx0 & 0x1FFFF0]);
+        __m128i cx1 = _mm_load_si128((__m128i *) &l1[idx1 & 0x1FFFF0]);
+
+        cx0 = soft_aesenc(cx0, _mm_set_epi64x(ah0, al0));
+        cx1 = soft_aesenc(cx1, _mm_set_epi64x(ah1, al1));
+
+        /* Write (b ^ AES result) back to the slots just read. */
+        _mm_store_si128((__m128i *) &l0[idx0 & 0x1FFFF0], _mm_xor_si128(bx0, cx0));
+        _mm_store_si128((__m128i *) &l1[idx1 & 0x1FFFF0], _mm_xor_si128(bx1, cx1));
+
+        /* The AES results supply the second indices (EXTRACT64 is defined
+         * elsewhere - presumably the low 64 bits) and become the next b. */
+        idx0 = EXTRACT64(cx0);
+        idx1 = EXTRACT64(cx1);
+
+        bx0 = cx0;
+        bx1 = cx1;
+
+        /* Lane 0 second half; hi/lo/cl/ch are reused for lane 1 below. */
+        uint64_t hi, lo, cl, ch;
+        cl = ((uint64_t*) &l0[idx0 & 0x1FFFF0])[0];
+        ch = ((uint64_t*) &l0[idx0 & 0x1FFFF0])[1];
+        lo = _umul128(idx0, cl, &hi);
+
+        /* Note the cross-over: the high half goes to al0, the low half to ah0. */
+        al0 += hi;
+        ah0 += lo;
+
+        ((uint64_t*) &l0[idx0 & 0x1FFFF0])[0] = al0;
+        ((uint64_t*) &l0[idx0 & 0x1FFFF0])[1] = ah0;
+
+        ah0 ^= ch;
+        al0 ^= cl;
+        idx0 = al0;
+
+        /* Lane 1 second half, same steps on l1 / al1 / ah1. */
+        cl = ((uint64_t*) &l1[idx1 & 0x1FFFF0])[0];
+        ch = ((uint64_t*) &l1[idx1 & 0x1FFFF0])[1];
+        lo = _umul128(idx1, cl, &hi);
+
+        al1 += hi;
+        ah1 += lo;
+
+        ((uint64_t*) &l1[idx1 & 0x1FFFF0])[0] = al1;
+        ((uint64_t*) &l1[idx1 & 0x1FFFF0])[1] = ah1;
+
+        ah1 ^= ch;
+        al1 ^= cl;
+        idx1 = al1;
+    }
+
+    cn_implode_scratchpad((__m128i*) l0, (__m128i*) h0);
+    cn_implode_scratchpad((__m128i*) l1, (__m128i*) h1);
+
+    keccakf(h0, 24);
+    keccakf(h1, 24);
+
+    /* Each lane picks its own finalizer from the low two bits of its first state element. */
+    extra_hashes[ctx[0]->state[0] & 3](ctx[0]->state, 200, output);
+    extra_hashes[ctx[1]->state[0] & 3](ctx[1]->state, 200, output + 32);
+}
+
+
+/*
+ * Double-hash CryptoNight kernel, software AES round, variant 1 tweaks.
+ * Two independent hashes are computed in one pass with interleaved loop bodies.
+ *
+ * input, size : two blobs of `size` bytes each, laid out back to back;
+ *               each must be at least 43 bytes.
+ * output      : first digest at output, second at output + 32; all 64 bytes
+ *               are zero-filled when the input is too short.
+ * ctx         : ctx[0] and ctx[1] supply state and memory for lane 0 and lane 1.
+ *
+ * NOTE(review): l0 / l1 are declared const but are written through casts.
+ */
+void cryptonight_av4_v1(const uint8_t *restrict input, size_t size, uint8_t *restrict output, struct cryptonight_ctx **restrict ctx)
+{
+    /* VARIANT1_INIT reads 8 bytes at offset 35 of each blob, so 43 bytes is the minimum. */
+    if (size < 43) {
+        memset(output, 0, 64);
+        return;
+    }
+
+    keccak(input,        size, ctx[0]->state, 200);
+    keccak(input + size, size, ctx[1]->state, 200);
+
+    /* Declare tweak1_2_0 / tweak1_2_1: (8 blob bytes at offset 35) ^ (64-bit state word 24) per lane. */
+    VARIANT1_INIT(0);
+    VARIANT1_INIT(1);
+
+    const uint8_t* l0 = ctx[0]->memory;
+    const uint8_t* l1 = ctx[1]->memory;
+    uint64_t* h0 = (uint64_t*) ctx[0]->state;
+    uint64_t* h1 = (uint64_t*) ctx[1]->state;
+
+    cn_explode_scratchpad((__m128i*) h0, (__m128i*) l0);
+    cn_explode_scratchpad((__m128i*) h1, (__m128i*) l1);
+
+    /* Per lane: a = words 0..1 ^ 4..5, b = words 2..3 ^ 6..7 of that lane's state. */
+    uint64_t al0 = h0[0] ^ h0[4];
+    uint64_t al1 = h1[0] ^ h1[4];
+    uint64_t ah0 = h0[1] ^ h0[5];
+    uint64_t ah1 = h1[1] ^ h1[5];
+
+    __m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]);
+    __m128i bx1 = _mm_set_epi64x(h1[3] ^ h1[7], h1[2] ^ h1[6]);
+
+    /* First indices equal al0 / al1. */
+    uint64_t idx0 = h0[0] ^ h0[4];
+    uint64_t idx1 = h1[0] ^ h1[4];
+
+    /* 0x80000 iterations; every access masks the index with 0x1FFFF0, i.e. a
+     * 16-byte-aligned offset below 0x200000. */
+    for (size_t i = 0; __builtin_expect(i < 0x80000, 1); i++) {
+        __m128i cx0 = _mm_load_si128((__m128i *) &l0[idx0 & 0x1FFFF0]);
+        __m128i cx1 = _mm_load_si128((__m128i *) &l1[idx1 & 0x1FFFF0]);
+
+        cx0 = soft_aesenc(cx0, _mm_set_epi64x(ah0, al0));
+        cx1 = soft_aesenc(cx1, _mm_set_epi64x(ah1, al1));
+
+        /* Hands (b ^ AES result) and the slot address to the tweak helper
+         * (defined elsewhere; presumably it stores a modified value there). */
+        cryptonight_monero_tweak((uint64_t*)&l0[idx0 & 0x1FFFF0], _mm_xor_si128(bx0, cx0));
+        cryptonight_monero_tweak((uint64_t*)&l1[idx1 & 0x1FFFF0], _mm_xor_si128(bx1, cx1));
+
+        /* The AES results supply the second indices and become the next b. */
+        idx0 = EXTRACT64(cx0);
+        idx1 = EXTRACT64(cx1);
+
+        bx0 = cx0;
+        bx1 = cx1;
+
+        /* Lane 0 second half; hi/lo/cl/ch are reused for lane 1 below. */
+        uint64_t hi, lo, cl, ch;
+        cl = ((uint64_t*) &l0[idx0 & 0x1FFFF0])[0];
+        ch = ((uint64_t*) &l0[idx0 & 0x1FFFF0])[1];
+        lo = _umul128(idx0, cl, &hi);
+
+        /* Note the cross-over: the high half goes to al0, the low half to ah0. */
+        al0 += hi;
+        ah0 += lo;
+
+        /* Only the stored copy of the second word carries the tweak; ah0 itself is not modified here. */
+        ((uint64_t*) &l0[idx0 & 0x1FFFF0])[0] = al0;
+        ((uint64_t*) &l0[idx0 & 0x1FFFF0])[1] = ah0 ^ tweak1_2_0;
+
+        ah0 ^= ch;
+        al0 ^= cl;
+        idx0 = al0;
+
+        /* Lane 1 second half, same steps with tweak1_2_1. */
+        cl = ((uint64_t*) &l1[idx1 & 0x1FFFF0])[0];
+        ch = ((uint64_t*) &l1[idx1 & 0x1FFFF0])[1];
+        lo = _umul128(idx1, cl, &hi);
+
+        al1 += hi;
+        ah1 += lo;
+
+        ((uint64_t*) &l1[idx1 & 0x1FFFF0])[0] = al1;
+        ((uint64_t*) &l1[idx1 & 0x1FFFF0])[1] = ah1 ^ tweak1_2_1;
+
+        ah1 ^= ch;
+        al1 ^= cl;
+        idx1 = al1;
+    }
+
+    cn_implode_scratchpad((__m128i*) l0, (__m128i*) h0);
+    cn_implode_scratchpad((__m128i*) l1, (__m128i*) h1);
+
+    keccakf(h0, 24);
+    keccakf(h1, 24);
+
+    /* Each lane picks its own finalizer from the low two bits of its first state element. */
+    extra_hashes[ctx[0]->state[0] & 3](ctx[0]->state, 200, output);
+    extra_hashes[ctx[1]->state[0] & 3](ctx[1]->state, 200, output + 32);
+}
+
+
+/*
+ * Double-hash CryptoNight kernel, software AES round, variant 2 tweaks
+ * (chunk shuffle, integer division / square-root step, two "b" registers per lane).
+ * Two independent hashes are computed in one pass with interleaved loop bodies.
+ *
+ * input, size : two blobs of `size` bytes each, laid out back to back.
+ * output      : first digest at output, second at output + 32.
+ * ctx         : ctx[0] and ctx[1] supply state and memory for lane 0 and lane 1.
+ *
+ * Register naming: bxN0 is the most recent AES result of lane N, bxN1 the one before it.
+ *
+ * Side effect: VARIANT2_SET_ROUNDING_MODE() changes the floating-point rounding
+ * mode and this function does not restore it before returning.
+ *
+ * NOTE(review): l0 / l1 are declared const but are written through casts.
+ */
+void cryptonight_av4_v2(const uint8_t *restrict input, size_t size, uint8_t *restrict output, struct cryptonight_ctx **restrict ctx)
+{
+    keccak(input,        size, ctx[0]->state, 200);
+    keccak(input + size, size, ctx[1]->state, 200);
+
+    const uint8_t* l0 = ctx[0]->memory;
+    const uint8_t* l1 = ctx[1]->memory;
+    uint64_t* h0 = (uint64_t*) ctx[0]->state;
+    uint64_t* h1 = (uint64_t*) ctx[1]->state;
+
+    /* Need h0 / h1 in scope: declare division_result_xmm_N / sqrt_result_xmm_N from hN[12] / hN[13]. */
+    VARIANT2_INIT(0);
+    VARIANT2_INIT(1);
+    VARIANT2_SET_ROUNDING_MODE();
+
+    cn_explode_scratchpad((__m128i*) h0, (__m128i*) l0);
+    cn_explode_scratchpad((__m128i*) h1, (__m128i*) l1);
+
+    /* Per lane: a = words 0..1 ^ 4..5. */
+    uint64_t al0 = h0[0] ^ h0[4];
+    uint64_t al1 = h1[0] ^ h1[4];
+    uint64_t ah0 = h0[1] ^ h0[5];
+    uint64_t ah1 = h1[1] ^ h1[5];
+
+    /* Per lane: bxN0 = words 2..3 ^ 6..7, bxN1 = words 8..9 ^ 10..11. */
+    __m128i bx00 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]);
+    __m128i bx01 = _mm_set_epi64x(h0[9] ^ h0[11], h0[8] ^ h0[10]);
+    __m128i bx10 = _mm_set_epi64x(h1[3] ^ h1[7], h1[2] ^ h1[6]);
+    __m128i bx11 = _mm_set_epi64x(h1[9] ^ h1[11], h1[8] ^ h1[10]);
+
+    uint64_t idx0 = al0;
+    uint64_t idx1 = al1;
+
+    /* 0x80000 iterations; every access masks the index with 0x1FFFF0, i.e. a
+     * 16-byte-aligned offset below 0x200000. */
+    for (size_t i = 0; __builtin_expect(i < 0x80000, 1); i++) {
+        __m128i cx0       = _mm_load_si128((__m128i *) &l0[idx0 & 0x1FFFF0]);
+        __m128i cx1       = _mm_load_si128((__m128i *) &l1[idx1 & 0x1FFFF0]);
+
+        const __m128i ax0 = _mm_set_epi64x(ah0, al0);
+        const __m128i ax1 = _mm_set_epi64x(ah1, al1);
+
+        cx0 = soft_aesenc(cx0, ax0);
+        cx1 = soft_aesenc(cx1, ax1);
+
+        /* Per lane: rotate/add the three neighbouring 16-byte chunks, then write
+         * (bxN0 ^ AES result) back to the slot that was read. */
+        VARIANT2_SHUFFLE(l0, idx0 & 0x1FFFF0, ax0, bx00, bx01);
+        _mm_store_si128((__m128i *) &l0[idx0 & 0x1FFFF0], _mm_xor_si128(bx00, cx0));
+
+        VARIANT2_SHUFFLE(l1, idx1 & 0x1FFFF0, ax1, bx10, bx11);
+        _mm_store_si128((__m128i *) &l1[idx1 & 0x1FFFF0], _mm_xor_si128(bx10, cx1));
+
+        /* Low 64 bits of each AES result select that lane's second slot. */
+        idx0 = _mm_cvtsi128_si64(cx0);
+        idx1 = _mm_cvtsi128_si64(cx1);
+
+        /* Lane 0 second half; hi/lo/cl/ch are reused for lane 1 below. */
+        uint64_t hi, lo, cl, ch;
+        cl = ((uint64_t*) &l0[idx0 & 0x1FFFF0])[0];
+        ch = ((uint64_t*) &l0[idx0 & 0x1FFFF0])[1];
+
+        /* Modifies cl and refreshes lane 0's division / sqrt results; must precede the multiply. */
+        VARIANT2_INTEGER_MATH(0, cl, cx0);
+        lo = _umul128(idx0, cl, &hi);
+        /* Second shuffle around the second slot; it also XORs chunk data into hi and lo. */
+        VARIANT2_SHUFFLE2(l0, idx0 & 0x1FFFF0, ax0, bx00, bx01, hi, lo);
+
+        /* Note the cross-over: the high half goes to al0, the low half to ah0. */
+        al0 += hi;
+        ah0 += lo;
+
+        ((uint64_t*)&l0[idx0 & 0x1FFFF0])[0] = al0;
+        ((uint64_t*)&l0[idx0 & 0x1FFFF0])[1] = ah0;
+
+        al0 ^= cl;
+        ah0 ^= ch;
+        idx0 = al0;
+
+        /* Lane 1 second half, same steps on l1 / al1 / ah1 / bx10 / bx11. */
+        cl = ((uint64_t*) &l1[idx1 & 0x1FFFF0])[0];
+        ch = ((uint64_t*) &l1[idx1 & 0x1FFFF0])[1];
+
+        VARIANT2_INTEGER_MATH(1, cl, cx1);
+        lo = _umul128(idx1, cl, &hi);
+        VARIANT2_SHUFFLE2(l1, idx1 & 0x1FFFF0, ax1, bx10, bx11, hi, lo);
+
+        al1 += hi;
+        ah1 += lo;
+
+        ((uint64_t*)&l1[idx1 & 0x1FFFF0])[0] = al1;
+        ((uint64_t*)&l1[idx1 & 0x1FFFF0])[1] = ah1;
+
+        al1 ^= cl;
+        ah1 ^= ch;
+        idx1 = al1;
+
+        /* Age the b registers of both lanes: bxN1 <- bxN0, bxN0 <- this iteration's AES result. */
+        bx01 = bx00;
+        bx11 = bx10;
+
+        bx00 = cx0;
+        bx10 = cx1;
+    }
+
+    cn_implode_scratchpad((__m128i*) l0, (__m128i*) h0);
+    cn_implode_scratchpad((__m128i*) l1, (__m128i*) h1);
+
+    keccakf(h0, 24);
+    keccakf(h1, 24);
+
+    /* Each lane picks its own finalizer from the low two bits of its first state element. */
+    extra_hashes[ctx[0]->state[0] & 3](ctx[0]->state, 200, output);
+    extra_hashes[ctx[1]->state[0] & 3](ctx[1]->state, 200, output + 32);
+}
--- a/algo/cryptonight/cryptonight_av4_softaes_double.c
+++ b/algo/cryptonight/cryptonight_av4_softaes_double.c
@ -1,111 +0,0 @@
-/* XMRig
- * Copyright 2010      Jeff Garzik <jgarzik@pobox.com>
- * Copyright 2012-2014 pooler      <pooler@litecoinpool.org>
- * Copyright 2014      Lucas Jones <https://github.com/lucasjones>
- * Copyright 2014-2016 Wolf9466    <https://github.com/OhGodAPet>
- * Copyright 2016      Jay D Dee   <jayddee246@gmail.com>
- * Copyright 2017      fireice-uk  <https://github.com/fireice-uk>
- * Copyright 2016-2017 XMRig       <support@xmrig.com>
- *
- *
- *   This program is free software: you can redistribute it and/or modify
- *   it under the terms of the GNU General Public License as published by
- *   the Free Software Foundation, either version 3 of the License, or
- *   (at your option) any later version.
- *
- *   This program is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY; without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- *   GNU General Public License for more details.
- *
- *   You should have received a copy of the GNU General Public License
- *   along with this program. If not, see <http://www.gnu.org/licenses/>.
- */
-
-#include <x86intrin.h>
-#include <string.h>
-
-#include "cryptonight.h"
-#include "cryptonight_softaes.h"
-#include "crypto/c_keccak.h"
-
-
-void cryptonight_av4_softaes_double(const void *restrict input, size_t size, void *restrict output, struct cryptonight_ctx *restrict ctx)
-{
-    keccak((const uint8_t *) input,        size, ctx->state0, 200);
-    keccak((const uint8_t *) input + size, size, ctx->state1, 200);
-
-    const uint8_t* l0 = ctx->memory;
-    const uint8_t* l1 = ctx->memory + MEMORY;
-    uint64_t* h0 = (uint64_t*) ctx->state0;
-    uint64_t* h1 = (uint64_t*) ctx->state1;
-
-    cn_explode_scratchpad((__m128i*) h0, (__m128i*) l0);
-    cn_explode_scratchpad((__m128i*) h1, (__m128i*) l1);
-
-    uint64_t al0 = h0[0] ^ h0[4];
-    uint64_t al1 = h1[0] ^ h1[4];
-    uint64_t ah0 = h0[1] ^ h0[5];
-    uint64_t ah1 = h1[1] ^ h1[5];
-
-    __m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]);
-    __m128i bx1 = _mm_set_epi64x(h1[3] ^ h1[7], h1[2] ^ h1[6]);
-
-    uint64_t idx0 = h0[0] ^ h0[4];
-    uint64_t idx1 = h1[0] ^ h1[4];
-
-    for (size_t i = 0; __builtin_expect(i < 0x80000, 1); i++) {
-        __m128i cx0 = _mm_load_si128((__m128i *) &l0[idx0 & 0x1FFFF0]);
-        __m128i cx1 = _mm_load_si128((__m128i *) &l1[idx1 & 0x1FFFF0]);
-
-        cx0 = soft_aesenc(cx0, _mm_set_epi64x(ah0, al0));
-        cx1 = soft_aesenc(cx1, _mm_set_epi64x(ah1, al1));
-
-        _mm_store_si128((__m128i *) &l0[idx0 & 0x1FFFF0], _mm_xor_si128(bx0, cx0));
-        _mm_store_si128((__m128i *) &l1[idx1 & 0x1FFFF0], _mm_xor_si128(bx1, cx1));
-
-        idx0 = EXTRACT64(cx0);
-        idx1 = EXTRACT64(cx1);
-
-        bx0 = cx0;
-        bx1 = cx1;
-
-        uint64_t hi, lo, cl, ch;
-        cl = ((uint64_t*) &l0[idx0 & 0x1FFFF0])[0];
-        ch = ((uint64_t*) &l0[idx0 & 0x1FFFF0])[1];
-        lo = _umul128(idx0, cl, &hi);
-
-        al0 += hi;
-        ah0 += lo;
-
-        ((uint64_t*) &l0[idx0 & 0x1FFFF0])[0] = al0;
-        ((uint64_t*) &l0[idx0 & 0x1FFFF0])[1] = ah0;
-
-        ah0 ^= ch;
-        al0 ^= cl;
-        idx0 = al0;
-
-        cl = ((uint64_t*) &l1[idx1 & 0x1FFFF0])[0];
-        ch = ((uint64_t*) &l1[idx1 & 0x1FFFF0])[1];
-        lo = _umul128(idx1, cl, &hi);
-
-        al1 += hi;
-        ah1 += lo;
-
-        ((uint64_t*) &l1[idx1 & 0x1FFFF0])[0] = al1;
-        ((uint64_t*) &l1[idx1 & 0x1FFFF0])[1] = ah1;
-
-        ah1 ^= ch;
-        al1 ^= cl;
-        idx1 = al1;
-    }
-
-    cn_implode_scratchpad((__m128i*) l0, (__m128i*) h0);
-    cn_implode_scratchpad((__m128i*) l1, (__m128i*) h1);
-
-    keccakf(h0, 24);
-    keccakf(h1, 24);
-
-    extra_hashes[ctx->state0[0] & 3](ctx->state0, 200, output);
-    extra_hashes[ctx->state1[0] & 3](ctx->state1, 200, (char*) output + 32);
-}
--- a/algo/cryptonight/cryptonight_monero.h
+++ b/algo/cryptonight/cryptonight_monero.h
@ -0,0 +1,150 @@
+/* XMRig
+ * Copyright 2010      Jeff Garzik <jgarzik@pobox.com>
+ * Copyright 2012-2014 pooler      <pooler@litecoinpool.org>
+ * Copyright 2014      Lucas Jones <https://github.com/lucasjones>
+ * Copyright 2014-2016 Wolf9466    <https://github.com/OhGodAPet>
+ * Copyright 2016      Jay D Dee   <jayddee246@gmail.com>
+ * Copyright 2017-2018 XMR-Stak    <https://github.com/fireice-uk>, <https://github.com/psychocrypt>
+ * Copyright 2018      Lee Clagett <https://github.com/vtnerd>
+ * Copyright 2018-2019 SChernykh   <https://github.com/SChernykh>
+ * Copyright 2016-2019 XMRig       <https://github.com/xmrig>, <support@xmrig.com>
+ *
+ *   This program is free software: you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation, either version 3 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef XMRIG_CRYPTONIGHT_MONERO_H
+#define XMRIG_CRYPTONIGHT_MONERO_H
+
+
+#include <fenv.h>
+#include <math.h>
+#include <stdint.h>
+#include <x86intrin.h>
+
+
+/*
+ * Square-root step used by VARIANT2_INTEGER_MATH.
+ *
+ * Builds a double in [1, 2) whose mantissa is the top 52 bits of n0, takes its
+ * hardware square root, and derives a 64-bit integer from the result's bit
+ * pattern, followed by a single +1 correction.
+ *
+ * n0      : 64-bit input.
+ * returns : the result in the low 64 bits of an __m128i (upper 64 bits zero).
+ *
+ * NOTE(review): _mm_sqrt_sd rounds according to the current SSE rounding mode,
+ * which is presumably why callers invoke VARIANT2_SET_ROUNDING_MODE() first - confirm.
+ */
+static inline __m128i int_sqrt_v2(const uint64_t n0)
+{
+    /* Integer-add (n0 >> 12) onto the bit pattern of 1.0 (exponent field 1023): the 52 bits land in the mantissa. */
+    __m128d x = _mm_castsi128_pd(_mm_add_epi64(_mm_cvtsi64_si128(n0 >> 12), _mm_set_epi64x(0, 1023ULL << 52)));
+    x = _mm_sqrt_sd(_mm_setzero_pd(), x);
+    /* Reinterpret the square root's IEEE bit pattern as an integer. */
+    uint64_t r = (uint64_t)(_mm_cvtsi128_si64(_mm_castpd_si128(x)));
+
+    /* s drops 20 low bits, r drops 19, so r == 2 * s + (one extra bit). */
+    const uint64_t s = r >> 20;
+    r >>= 19;
+
+    /* (1022ULL << 32) removes the exponent bias that is still present in s and r. */
+    uint64_t x2 = (s - (1022ULL << 32)) * (r - s - (1022ULL << 32) + 1);
+#   if (defined(_MSC_VER) || __GNUC__ > 7 || (__GNUC__ == 7 && __GNUC_MINOR__ > 1)) && (defined(__x86_64__) || defined(_M_AMD64))
+    /* Branch-free form of the #else path: the borrow of (x2 - n0) is 1 exactly when x2 < n0 and is added to r. */
+    _addcarry_u64(_subborrow_u64(0, x2, n0, (unsigned long long int*)&x2), r, 0, (unsigned long long int*)&r);
+#   else
+    if (x2 < n0) ++r;
+#   endif
+
+    return _mm_cvtsi64_si128(r);
+}
+
+
+/*
+ * VARIANT1_INIT(part): declares `uint64_t tweak1_2_<part>` in the calling scope.
+ *
+ * The value is the 8 bytes at input + 35 + part * size XORed with the 64-bit
+ * word at index 24 of ctx[part]->state. Expects `input`, `size` and `ctx` to be
+ * in scope, and each blob to be at least 43 bytes long (35 + 8).
+ *
+ * The last line deliberately has no trailing backslash, so the definition ends
+ * here and cannot absorb whatever line follows it.
+ */
+#   define VARIANT1_INIT(part) \
+    uint64_t tweak1_2_##part = (*(const uint64_t*)(input + 35 + part * size) ^ \
+                               *((const uint64_t*)(ctx[part]->state) + 24));
+
+/*
+ * VARIANT2_INIT(part): declares two __m128i locals in the calling scope,
+ * division_result_xmm_<part> and sqrt_result_xmm_<part>, holding h<part>[12]
+ * and h<part>[13] in their low 64 bits. Expects `h<part>` to be in scope.
+ * The expansion already ends with ';', so `VARIANT2_INIT(0);` leaves a
+ * harmless empty statement.
+ */
+#   define VARIANT2_INIT(part) \
+    __m128i division_result_xmm_##part = _mm_cvtsi64_si128(h##part[12]); \
+    __m128i sqrt_result_xmm_##part = _mm_cvtsi64_si128(h##part[13]);
+
+/*
+ * VARIANT2_SET_ROUNDING_MODE(): switches floating-point rounding to "round
+ * down" (RC_DOWN via _control87 on MSVC, FE_DOWNWARD via fesetround elsewhere).
+ * The previous mode is not saved or restored by this macro.
+ * NOTE(review): _control87 is normally declared in <float.h>, which this
+ * header does not include - confirm the MSVC branch builds.
+ */
+#ifdef _MSC_VER
+#   define VARIANT2_SET_ROUNDING_MODE() { _control87(RC_DOWN, MCW_RC); }
+#else
+#   define VARIANT2_SET_ROUNDING_MODE() { fesetround(FE_DOWNWARD); }
+#endif
+
+/*
+ * VARIANT2_INTEGER_MATH(part, cl, cx): variant 2 division / square-root step.
+ *
+ * part : lane suffix; division_result_xmm_<part> and sqrt_result_xmm_<part>
+ *        (from VARIANT2_INIT) must be in scope and are updated.
+ * cl   : uint64_t lvalue; XORed with the previous division result and the
+ *        previous sqrt result shifted left by 32.
+ * cx   : __m128i; its low and high 64-bit halves feed the new results.
+ *
+ * Steps: the divisor d is the low 32 bits of (cx low + 2 * sqrt_result) with
+ * bits 31 and 0 forced on (so it is never zero); the new division result packs
+ * the low 32 bits of the quotient (cx high / d) with the remainder in the upper
+ * 32 bits; the new sqrt result is int_sqrt_v2(cx low + division result).
+ */
+#   define VARIANT2_INTEGER_MATH(part, cl, cx) \
+    { \
+        const uint64_t sqrt_result = (uint64_t)(_mm_cvtsi128_si64(sqrt_result_xmm_##part)); \
+        const uint64_t cx_0 = _mm_cvtsi128_si64(cx); \
+        cl ^= (uint64_t)(_mm_cvtsi128_si64(division_result_xmm_##part)) ^ (sqrt_result << 32); \
+        const uint32_t d = (uint32_t)(cx_0 + (sqrt_result << 1)) | 0x80000001UL; \
+        const uint64_t cx_1 = _mm_cvtsi128_si64(_mm_srli_si128(cx, 8)); \
+        const uint64_t division_result = (uint32_t)(cx_1 / d) + ((cx_1 % d) << 32); \
+        division_result_xmm_##part = _mm_cvtsi64_si128((int64_t)(division_result)); \
+        sqrt_result_xmm_##part = int_sqrt_v2(cx_0 + division_result); \
+    }
+
+/*
+ * VARIANT2_SHUFFLE(base_ptr, offset, _a, _b, _b1): rotates the three 16-byte
+ * chunks at offset ^ 0x10, ^ 0x20 and ^ 0x30 while adding a register to each
+ * (two independent 64-bit lane additions per chunk):
+ *
+ *   [offset ^ 0x10] <- old [offset ^ 0x30] + _b1
+ *   [offset ^ 0x20] <- old [offset ^ 0x10] + _b
+ *   [offset ^ 0x30] <- old [offset ^ 0x20] + _a
+ *
+ * All three chunks are loaded before any store. The aligned load/store
+ * intrinsics require base_ptr + offset to be 16-byte aligned; the chunk at
+ * `offset` itself is not touched.
+ */
+#   define VARIANT2_SHUFFLE(base_ptr, offset, _a, _b, _b1) \
+    { \
+        const __m128i chunk1 = _mm_load_si128((__m128i *)((base_ptr) + ((offset) ^ 0x10))); \
+        const __m128i chunk2 = _mm_load_si128((__m128i *)((base_ptr) + ((offset) ^ 0x20))); \
+        const __m128i chunk3 = _mm_load_si128((__m128i *)((base_ptr) + ((offset) ^ 0x30))); \
+        _mm_store_si128((__m128i *)((base_ptr) + ((offset) ^ 0x10)), _mm_add_epi64(chunk3, _b1)); \
+        _mm_store_si128((__m128i *)((base_ptr) + ((offset) ^ 0x20)), _mm_add_epi64(chunk1, _b)); \
+        _mm_store_si128((__m128i *)((base_ptr) + ((offset) ^ 0x30)), _mm_add_epi64(chunk2, _a)); \
+    }
+
+/*
+ * VARIANT4_SHUFFLE(base_ptr, offset, _a, _b, _b1, _c): same chunk rotation as
+ * VARIANT2_SHUFFLE, plus a final step that XORs the three chunks as they were
+ * loaded (before the stores) into _c:
+ *
+ *   [offset ^ 0x10] <- old [offset ^ 0x30] + _b1
+ *   [offset ^ 0x20] <- old [offset ^ 0x10] + _b
+ *   [offset ^ 0x30] <- old [offset ^ 0x20] + _a
+ *   _c              <- _c ^ old chunk1 ^ old chunk2 ^ old chunk3
+ *
+ * _c must be an __m128i lvalue.
+ */
+#   define VARIANT4_SHUFFLE(base_ptr, offset, _a, _b, _b1, _c) \
+    { \
+        const __m128i chunk1 = _mm_load_si128((__m128i *)((base_ptr) + ((offset) ^ 0x10))); \
+        const __m128i chunk2 = _mm_load_si128((__m128i *)((base_ptr) + ((offset) ^ 0x20))); \
+        const __m128i chunk3 = _mm_load_si128((__m128i *)((base_ptr) + ((offset) ^ 0x30))); \
+        _mm_store_si128((__m128i *)((base_ptr) + ((offset) ^ 0x10)), _mm_add_epi64(chunk3, _b1)); \
+        _mm_store_si128((__m128i *)((base_ptr) + ((offset) ^ 0x20)), _mm_add_epi64(chunk1, _b)); \
+        _mm_store_si128((__m128i *)((base_ptr) + ((offset) ^ 0x30)), _mm_add_epi64(chunk2, _a)); \
+        _c = _mm_xor_si128(_mm_xor_si128(_c, chunk3), _mm_xor_si128(chunk1, chunk2)); \
+    }
+
+/*
+ * VARIANT2_SHUFFLE2(base_ptr, offset, _a, _b, _b1, hi, lo): chunk rotation
+ * that also mixes the multiplication result in both directions.
+ *
+ *   chunk1 = old [offset ^ 0x10] ^ (hi in the low 64 bits, lo in the high 64 bits)
+ *   hi    ^= first  64-bit word of old [offset ^ 0x20]
+ *   lo    ^= second 64-bit word of old [offset ^ 0x20]
+ *   [offset ^ 0x10] <- old [offset ^ 0x30] + _b1
+ *   [offset ^ 0x20] <- chunk1 + _b
+ *   [offset ^ 0x30] <- old [offset ^ 0x20] + _a
+ *
+ * hi and lo must be uint64_t lvalues; chunk1 is formed from their values on
+ * entry, before they are modified.
+ */
+#   define VARIANT2_SHUFFLE2(base_ptr, offset, _a, _b, _b1, hi, lo) \
+    { \
+        const __m128i chunk1 = _mm_xor_si128(_mm_load_si128((__m128i *)((base_ptr) + ((offset) ^ 0x10))), _mm_set_epi64x(lo, hi)); \
+        const __m128i chunk2 = _mm_load_si128((__m128i *)((base_ptr) + ((offset) ^ 0x20))); \
+        hi ^= ((uint64_t*)((base_ptr) + ((offset) ^ 0x20)))[0]; \
+        lo ^= ((uint64_t*)((base_ptr) + ((offset) ^ 0x20)))[1]; \
+        const __m128i chunk3 = _mm_load_si128((__m128i *)((base_ptr) + ((offset) ^ 0x30))); \
+        _mm_store_si128((__m128i *)((base_ptr) + ((offset) ^ 0x10)), _mm_add_epi64(chunk3, _b1)); \
+        _mm_store_si128((__m128i *)((base_ptr) + ((offset) ^ 0x20)), _mm_add_epi64(chunk1, _b)); \
+        _mm_store_si128((__m128i *)((base_ptr) + ((offset) ^ 0x30)), _mm_add_epi64(chunk2, _a)); \
+    }
+
+
+/*
+ * NOINLINE: compiler-specific "never inline this function" attribute; expands
+ * to nothing on compilers that are neither GCC-compatible nor MSVC. Left alone
+ * if something else has already defined it.
+ */
+#ifndef NOINLINE
+#ifdef __GNUC__
+#define NOINLINE __attribute__ ((noinline))
+#elif defined(_MSC_VER) /* test with defined(): evaluating a possibly undefined macro warns under -Wundef */
+#define NOINLINE __declspec(noinline)
+#else
+#define NOINLINE
+#endif
+#endif
+
+#include "variant4_random_math.h"
+
+/*
+ * VARIANT4_RANDOM_MATH_INIT(part): declares the per-lane state for the
+ * variant 4 random-math step in the calling scope:
+ *
+ *   uint32_t r<part>[9]                  - r[0..3] are the 32-bit halves of
+ *                                          h<part>[12] and h<part>[13] (low half
+ *                                          first); r[4..8] stay uninitialized
+ *                                          until VARIANT4_RANDOM_MATH fills them.
+ *   struct V4_Instruction code<part>[256] - filled by v4_random_math_init()
+ *                                          from ctx[part]->height.
+ *
+ * Expects `h<part>` and `ctx` to be in scope.
+ */
+#define VARIANT4_RANDOM_MATH_INIT(part) \
+  uint32_t r##part[9]; \
+  struct V4_Instruction code##part[256]; \
+  { \
+    r##part[0] = (uint32_t)(h##part[12]); \
+    r##part[1] = (uint32_t)(h##part[12] >> 32); \
+    r##part[2] = (uint32_t)(h##part[13]); \
+    r##part[3] = (uint32_t)(h##part[13] >> 32); \
+  } \
+  v4_random_math_init(code##part, ctx[part]->height);
+
+/*
+ * VARIANT4_RANDOM_MATH(part, al, ah, cl, bx0, bx1): one variant 4 random-math
+ * step for lane <part>. Requires r<part> and code<part> from
+ * VARIANT4_RANDOM_MATH_INIT.
+ *
+ * cl       : uint64_t lvalue; XORed with (r[0] + r[1]) in the low 32 bits and
+ *            (r[2] + r[3]) in the high 32 bits (both sums wrap at 32 bits).
+ * al, ah   : their low 32 bits are copied to r[4] and r[5].
+ * bx0, bx1 : __m128i; r[6] is the low 32 bits of bx0, r[7] the low 32 bits of
+ *            bx1, r[8] the 32 bits at byte offset 8 of bx1.
+ *
+ * The XOR uses r[0..3] as they were before v4_random_math() runs on this step.
+ */
+#define VARIANT4_RANDOM_MATH(part, al, ah, cl, bx0, bx1) \
+  { \
+    cl ^= (r##part[0] + r##part[1]) | ((uint64_t)(r##part[2] + r##part[3]) << 32); \
+    r##part[4] = (uint32_t)(al); \
+    r##part[5] = (uint32_t)(ah); \
+    r##part[6] = (uint32_t)(_mm_cvtsi128_si32(bx0)); \
+    r##part[7] = (uint32_t)(_mm_cvtsi128_si32(bx1)); \
+    r##part[8] = (uint32_t)(_mm_cvtsi128_si32(_mm_srli_si128(bx1, 8))); \
+    v4_random_math(code##part, r##part); \
+  }
+
+#endif /* XMRIG_CRYPTONIGHT_MONERO_H */
--- a/algo/cryptonight/cryptonight_r_av1.c
+++ b/algo/cryptonight/cryptonight_r_av1.c
@ -0,0 +1,143 @@
+/* XMRig
+ * Copyright 2010      Jeff Garzik <jgarzik@pobox.com>
+ * Copyright 2012-2014 pooler      <pooler@litecoinpool.org>
+ * Copyright 2014      Lucas Jones <https://github.com/lucasjones>
+ * Copyright 2014-2016 Wolf9466    <https://github.com/OhGodAPet>
+ * Copyright 2016      Jay D Dee   <jayddee246@gmail.com>
+ * Copyright 2017      fireice-uk  <https://github.com/fireice-uk>
+ * Copyright 2017-2018 XMR-Stak    <https://github.com/fireice-uk>, <https://github.com/psychocrypt>
+ * Copyright 2018      Lee Clagett <https://github.com/vtnerd>
+ * Copyright 2018-2019 SChernykh   <https://github.com/SChernykh>
+ * Copyright 2016-2019 XMRig       <https://github.com/xmrig>, <support@xmrig.com>
+ *
+ *   This program is free software: you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation, either version 3 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <x86intrin.h>
+#include <string.h>
+
+#include "crypto/c_keccak.h"
+#include "cryptonight.h"
+#include "cryptonight_aesni.h"
+#include "cryptonight_monero.h"
+
+
+/*
+ * CryptoNight-R, single hash, hardware AES (_mm_aesenc_si128) with the
+ * interpreted random-math program.
+ *
+ * input  - message to hash, `size` bytes.
+ * output - receives the digest written by the selected extra_hashes[] entry.
+ * ctx    - ctx[0] provides the Keccak state, the scratchpad (`memory`) and the
+ *          `height` used to build the random-math program.
+ *
+ * VARIANT2_INIT, VARIANT2_SET_ROUNDING_MODE and VARIANT4_SHUFFLE are macros
+ * defined outside this function; they rely on the local names used below.
+ */
+void cryptonight_r_av1(const uint8_t *restrict input, size_t size, uint8_t *restrict output, struct cryptonight_ctx **restrict ctx)
+{
+    /* Hash the input into the 200-byte state. */
+    keccak(input, size, ctx[0]->state, 200);
+
+    /* Expand the state into the scratchpad. */
+    cn_explode_scratchpad((__m128i*) ctx[0]->state, (__m128i*) ctx[0]->memory);
+
+    const uint8_t* l0 = ctx[0]->memory;
+    uint64_t* h0 = (uint64_t*) ctx[0]->state;
+
+    VARIANT2_INIT(0);
+    VARIANT2_SET_ROUNDING_MODE();
+    VARIANT4_RANDOM_MATH_INIT(0);
+
+    /* Initial a = state[0..1] ^ state[4..5]; b0/b1 come from words 2,3,6..11. */
+    uint64_t al0 = h0[0] ^ h0[4];
+    uint64_t ah0 = h0[1] ^ h0[5];
+    __m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]);
+    __m128i bx1 = _mm_set_epi64x(h0[9] ^ h0[11], h0[8] ^ h0[10]);
+
+    uint64_t idx0 = al0;
+
+    /* 0x80000 iterations; the 0x1FFFF0 mask yields 16-byte aligned offsets below 0x200000. */
+    for (size_t i = 0; __builtin_expect(i < 0x80000, 1); i++) {
+        __m128i cx        = _mm_load_si128((__m128i *) &l0[idx0 & 0x1FFFF0]);
+        const __m128i ax0 = _mm_set_epi64x(ah0, al0);
+
+        /* One AES round of the scratchpad line, keyed with a. */
+        cx = _mm_aesenc_si128(cx, ax0);
+
+        VARIANT4_SHUFFLE(l0, idx0 & 0x1FFFF0, ax0, bx0, bx1, cx);
+        _mm_store_si128((__m128i *) &l0[idx0 & 0x1FFFF0], _mm_xor_si128(bx0, cx));
+
+        /* The next address is taken from the low 64 bits of c. */
+        idx0 = _mm_cvtsi128_si64(cx);
+
+        uint64_t hi, lo, cl, ch;
+        cl = ((uint64_t*) &l0[idx0 & 0x1FFFF0])[0];
+        ch = ((uint64_t*) &l0[idx0 & 0x1FFFF0])[1];
+
+        /* Random math modifies cl and r0[]; its outputs r0[0..3] are folded into a. */
+        VARIANT4_RANDOM_MATH(0, al0, ah0, cl, bx0, bx1);
+        al0 ^= r0[2] | ((uint64_t)(r0[3]) << 32);
+        ah0 ^= r0[0] | ((uint64_t)(r0[1]) << 32);
+
+        /* 64x64 -> 128-bit product: low half returned, high half stored in hi. */
+        lo = _umul128(idx0, cl, &hi);
+        VARIANT4_SHUFFLE(l0, idx0 & 0x1FFFF0, ax0, bx0, bx1, cx);
+
+        al0 += hi;
+        ah0 += lo;
+
+        /* Write a back to the scratchpad, then mix in the value read before. */
+        ((uint64_t*)&l0[idx0 & 0x1FFFF0])[0] = al0;
+        ((uint64_t*)&l0[idx0 & 0x1FFFF0])[1] = ah0;
+
+        al0 ^= cl;
+        ah0 ^= ch;
+        idx0 = al0;
+
+        /* Shift the b history: the previous b0 becomes b1, c becomes b0. */
+        bx1 = bx0;
+        bx0 = cx;
+    }
+
+    /* Fold the scratchpad back into the state and finalize. */
+    cn_implode_scratchpad((__m128i*) ctx[0]->memory, (__m128i*) ctx[0]->state);
+
+    keccakf(h0, 24);
+    /* The low two bits of the first state byte select the final hash function. */
+    extra_hashes[ctx[0]->state[0] & 3](ctx[0]->state, 200, output);
+}
+
+
+#ifndef XMRIG_NO_ASM
+void v4_compile_code(const struct V4_Instruction* code, int code_size, void* machine_code, enum Assembly ASM);
+
+
+/*
+ * Shared body of the single-hash variants that run a generated main loop.
+ *
+ * asm_variant selects the code flavour handed to v4_compile_code(). The main
+ * loop is regenerated only when ctx[0]->height differs from the height the
+ * cached code was generated for; otherwise the code already stored behind
+ * ctx[0]->generated_code is reused.
+ */
+static void cryptonight_r_av1_asm(const uint8_t *restrict input, size_t size, uint8_t *restrict output, struct cryptonight_ctx **restrict ctx, enum Assembly asm_variant)
+{
+    if (ctx[0]->generated_code_height != ctx[0]->height) {
+        struct V4_Instruction code[256];
+        const int code_size = v4_random_math_init(code, ctx[0]->height);
+
+        v4_compile_code(code, code_size, (void*)(ctx[0]->generated_code), asm_variant);
+        ctx[0]->generated_code_height = ctx[0]->height;
+    }
+
+    /* Hash the input into the state and expand it into the scratchpad. */
+    keccak(input, size, ctx[0]->state, 200);
+    cn_explode_scratchpad((__m128i*) ctx[0]->state, (__m128i*) ctx[0]->memory);
+
+    /* Run the generated main loop on this context. */
+    ctx[0]->generated_code(ctx[0]);
+
+    /* Fold the scratchpad back, permute and pick the final hash by state[0] & 3. */
+    cn_implode_scratchpad((__m128i*) ctx[0]->memory, (__m128i*) ctx[0]->state);
+    keccakf((uint64_t*) ctx[0]->state, 24);
+    extra_hashes[ctx[0]->state[0] & 3](ctx[0]->state, 200, output);
+}
+
+
+/* CryptoNight-R single hash with a main loop generated for ASM_INTEL. */
+void cryptonight_r_av1_asm_intel(const uint8_t *restrict input, size_t size, uint8_t *restrict output, struct cryptonight_ctx **restrict ctx)
+{
+    cryptonight_r_av1_asm(input, size, output, ctx, ASM_INTEL);
+}
+
+
+/* CryptoNight-R single hash with a main loop generated for ASM_BULLDOZER. */
+void cryptonight_r_av1_asm_bulldozer(const uint8_t *restrict input, size_t size, uint8_t *restrict output, struct cryptonight_ctx **restrict ctx)
+{
+    cryptonight_r_av1_asm(input, size, output, ctx, ASM_BULLDOZER);
+}
+#endif
--- a/algo/cryptonight/cryptonight_r_av2.c
+++ b/algo/cryptonight/cryptonight_r_av2.c
@ -0,0 +1,202 @@
+/* XMRig
+ * Copyright 2010      Jeff Garzik <jgarzik@pobox.com>
+ * Copyright 2012-2014 pooler      <pooler@litecoinpool.org>
+ * Copyright 2014      Lucas Jones <https://github.com/lucasjones>
+ * Copyright 2014-2016 Wolf9466    <https://github.com/OhGodAPet>
+ * Copyright 2016      Jay D Dee   <jayddee246@gmail.com>
+ * Copyright 2017      fireice-uk  <https://github.com/fireice-uk>
+ * Copyright 2017-2018 XMR-Stak    <https://github.com/fireice-uk>, <https://github.com/psychocrypt>
+ * Copyright 2018      Lee Clagett <https://github.com/vtnerd>
+ * Copyright 2018-2019 SChernykh   <https://github.com/SChernykh>
+ * Copyright 2016-2019 XMRig       <https://github.com/xmrig>, <support@xmrig.com>
+ *
+ *   This program is free software: you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation, either version 3 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <x86intrin.h>
+#include <string.h>
+
+#include "crypto/c_keccak.h"
+#include "cryptonight.h"
+#include "cryptonight_aesni.h"
+#include "cryptonight_monero.h"
+
+
+/*
+ * CryptoNight-R, two hashes per call, hardware AES (_mm_aesenc_si128) with
+ * the interpreted random-math program.
+ *
+ * input  - two messages of `size` bytes each, stored back to back
+ *          (the second one starts at input + size).
+ * output - first digest at output, second digest at output + 32.
+ * ctx    - ctx[0] and ctx[1] each provide state, scratchpad and height for
+ *          one lane.
+ *
+ * VARIANT2_INIT, VARIANT2_SET_ROUNDING_MODE and VARIANT4_SHUFFLE are macros
+ * defined outside this function; they rely on the local names used below.
+ */
+void cryptonight_r_av2(const uint8_t *restrict input, size_t size, uint8_t *restrict output, struct cryptonight_ctx **restrict ctx)
+{
+    keccak(input,        size, ctx[0]->state, 200);
+    keccak(input + size, size, ctx[1]->state, 200);
+
+    const uint8_t* l0 = ctx[0]->memory;
+    const uint8_t* l1 = ctx[1]->memory;
+    uint64_t* h0 = (uint64_t*) ctx[0]->state;
+    uint64_t* h1 = (uint64_t*) ctx[1]->state;
+
+    VARIANT2_INIT(0);
+    VARIANT2_INIT(1);
+    VARIANT2_SET_ROUNDING_MODE();
+    VARIANT4_RANDOM_MATH_INIT(0);
+    VARIANT4_RANDOM_MATH_INIT(1);
+
+    /* Expand each state into its scratchpad. */
+    cn_explode_scratchpad((__m128i*) h0, (__m128i*) l0);
+    cn_explode_scratchpad((__m128i*) h1, (__m128i*) l1);
+
+    /* Per-lane a = state[0..1] ^ state[4..5]. */
+    uint64_t al0 = h0[0] ^ h0[4];
+    uint64_t al1 = h1[0] ^ h1[4];
+    uint64_t ah0 = h0[1] ^ h0[5];
+    uint64_t ah1 = h1[1] ^ h1[5];
+
+    /* bxN0 / bxN1: current and previous b value of lane N. */
+    __m128i bx00 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]);
+    __m128i bx01 = _mm_set_epi64x(h0[9] ^ h0[11], h0[8] ^ h0[10]);
+    __m128i bx10 = _mm_set_epi64x(h1[3] ^ h1[7], h1[2] ^ h1[6]);
+    __m128i bx11 = _mm_set_epi64x(h1[9] ^ h1[11], h1[8] ^ h1[10]);
+
+    uint64_t idx0 = al0;
+    uint64_t idx1 = al1;
+
+    /* 0x80000 iterations; the 0x1FFFF0 mask yields 16-byte aligned offsets below 0x200000. */
+    for (size_t i = 0; __builtin_expect(i < 0x80000, 1); i++) {
+        __m128i cx0       = _mm_load_si128((__m128i *) &l0[idx0 & 0x1FFFF0]);
+        __m128i cx1       = _mm_load_si128((__m128i *) &l1[idx1 & 0x1FFFF0]);
+
+        const __m128i ax0 = _mm_set_epi64x(ah0, al0);
+        const __m128i ax1 = _mm_set_epi64x(ah1, al1);
+
+        /* One AES round per lane, keyed with that lane's a. */
+        cx0 = _mm_aesenc_si128(cx0, ax0);
+        cx1 = _mm_aesenc_si128(cx1, ax1);
+
+        VARIANT4_SHUFFLE(l0, idx0 & 0x1FFFF0, ax0, bx00, bx01, cx0);
+        _mm_store_si128((__m128i *) &l0[idx0 & 0x1FFFF0], _mm_xor_si128(bx00, cx0));
+
+        VARIANT4_SHUFFLE(l1, idx1 & 0x1FFFF0, ax1, bx10, bx11, cx1);
+        _mm_store_si128((__m128i *) &l1[idx1 & 0x1FFFF0], _mm_xor_si128(bx10, cx1));
+
+        /* The next addresses come from the low 64 bits of each c. */
+        idx0 = _mm_cvtsi128_si64(cx0);
+        idx1 = _mm_cvtsi128_si64(cx1);
+
+        /* ---- second half, lane 0 (hi/lo/cl/ch are reused for lane 1 below) ---- */
+        uint64_t hi, lo, cl, ch;
+        cl = ((uint64_t*) &l0[idx0 & 0x1FFFF0])[0];
+        ch = ((uint64_t*) &l0[idx0 & 0x1FFFF0])[1];
+
+        VARIANT4_RANDOM_MATH(0, al0, ah0, cl, bx00, bx01);
+        al0 ^= r0[2] | ((uint64_t)(r0[3]) << 32);
+        ah0 ^= r0[0] | ((uint64_t)(r0[1]) << 32);
+
+        /* 64x64 -> 128-bit product: low half returned, high half stored in hi. */
+        lo = _umul128(idx0, cl, &hi);
+        VARIANT4_SHUFFLE(l0, idx0 & 0x1FFFF0, ax0, bx00, bx01, cx0);
+
+        al0 += hi;
+        ah0 += lo;
+
+        ((uint64_t*)&l0[idx0 & 0x1FFFF0])[0] = al0;
+        ((uint64_t*)&l0[idx0 & 0x1FFFF0])[1] = ah0;
+
+        al0 ^= cl;
+        ah0 ^= ch;
+        idx0 = al0;
+
+        /* ---- second half, lane 1 ---- */
+        cl = ((uint64_t*) &l1[idx1 & 0x1FFFF0])[0];
+        ch = ((uint64_t*) &l1[idx1 & 0x1FFFF0])[1];
+
+        VARIANT4_RANDOM_MATH(1, al1, ah1, cl, bx10, bx11);
+        al1 ^= r1[2] | ((uint64_t)(r1[3]) << 32);
+        ah1 ^= r1[0] | ((uint64_t)(r1[1]) << 32);
+
+        lo = _umul128(idx1, cl, &hi);
+        VARIANT4_SHUFFLE(l1, idx1 & 0x1FFFF0, ax1, bx10, bx11, cx1);
+
+        al1 += hi;
+        ah1 += lo;
+
+        ((uint64_t*)&l1[idx1 & 0x1FFFF0])[0] = al1;
+        ((uint64_t*)&l1[idx1 & 0x1FFFF0])[1] = ah1;
+
+        al1 ^= cl;
+        ah1 ^= ch;
+        idx1 = al1;
+
+        /* Shift the b history of both lanes. */
+        bx01 = bx00;
+        bx11 = bx10;
+
+        bx00 = cx0;
+        bx10 = cx1;
+    }
+
+    /* Fold each scratchpad back into its state and finalize. */
+    cn_implode_scratchpad((__m128i*) l0, (__m128i*) h0);
+    cn_implode_scratchpad((__m128i*) l1, (__m128i*) h1);
+
+    keccakf(h0, 24);
+    keccakf(h1, 24);
+
+    /* The low two bits of each first state byte select that lane's final hash. */
+    extra_hashes[ctx[0]->state[0] & 3](ctx[0]->state, 200, output);
+    extra_hashes[ctx[1]->state[0] & 3](ctx[1]->state, 200, output + 32);
+}
+
+
+#ifndef XMRIG_NO_ASM
+void v4_compile_code_double(const struct V4_Instruction* code, int code_size, void* machine_code, enum Assembly ASM);
+
+
+/*
+ * Shared body of the double-hash variants that run a generated main loop.
+ *
+ * asm_variant selects the code flavour handed to v4_compile_code_double().
+ * The generated routine lives behind ctx[0]->generated_code_double and
+ * processes both contexts in one call; only ctx[0]->height is consulted
+ * when deciding whether to regenerate it.
+ *
+ * NOTE(review): the cache key is ctx[0]->generated_code_height, the same field
+ * the single-hash generator updates. If one context were ever used with both
+ * generators at the same height, the double routine would not be rebuilt -
+ * confirm that callers never mix them on the same context.
+ */
+static void cryptonight_r_av2_asm(const uint8_t *restrict input, size_t size, uint8_t *restrict output, struct cryptonight_ctx **restrict ctx, enum Assembly asm_variant)
+{
+    if (ctx[0]->generated_code_height != ctx[0]->height) {
+        struct V4_Instruction code[256];
+        const int code_size = v4_random_math_init(code, ctx[0]->height);
+
+        v4_compile_code_double(code, code_size, (void*)(ctx[0]->generated_code_double), asm_variant);
+        ctx[0]->generated_code_height = ctx[0]->height;
+    }
+
+    /* Two inputs of `size` bytes each are stored back to back. */
+    keccak(input,        size, ctx[0]->state, 200);
+    keccak(input + size, size, ctx[1]->state, 200);
+    cn_explode_scratchpad((__m128i*) ctx[0]->state, (__m128i*) ctx[0]->memory);
+    cn_explode_scratchpad((__m128i*) ctx[1]->state, (__m128i*) ctx[1]->memory);
+
+    /* Run the generated main loop over both contexts. */
+    ctx[0]->generated_code_double(ctx[0], ctx[1]);
+
+    cn_implode_scratchpad((__m128i*) ctx[0]->memory, (__m128i*) ctx[0]->state);
+    cn_implode_scratchpad((__m128i*) ctx[1]->memory, (__m128i*) ctx[1]->state);
+
+    keccakf((uint64_t *) ctx[0]->state, 24);
+    keccakf((uint64_t *) ctx[1]->state, 24);
+
+    /* First digest at output, second at output + 32. */
+    extra_hashes[ctx[0]->state[0] & 3](ctx[0]->state, 200, output);
+    extra_hashes[ctx[1]->state[0] & 3](ctx[1]->state, 200, output + 32);
+}
+
+
+/* CryptoNight-R double hash with a main loop generated for ASM_INTEL. */
+void cryptonight_r_av2_asm_intel(const uint8_t *restrict input, size_t size, uint8_t *restrict output, struct cryptonight_ctx **restrict ctx)
+{
+    cryptonight_r_av2_asm(input, size, output, ctx, ASM_INTEL);
+}
+
+
+/* CryptoNight-R double hash with a main loop generated for ASM_BULLDOZER. */
+void cryptonight_r_av2_asm_bulldozer(const uint8_t *restrict input, size_t size, uint8_t *restrict output, struct cryptonight_ctx **restrict ctx)
+{
+    cryptonight_r_av2_asm(input, size, output, ctx, ASM_BULLDOZER);
+}
+#endif
--- a/algo/cryptonight/cryptonight_r_av3.c
+++ b/algo/cryptonight/cryptonight_r_av3.c
@ -0,0 +1,112 @@
+/* XMRig
+ * Copyright 2010      Jeff Garzik <jgarzik@pobox.com>
+ * Copyright 2012-2014 pooler      <pooler@litecoinpool.org>
+ * Copyright 2014      Lucas Jones <https://github.com/lucasjones>
+ * Copyright 2014-2016 Wolf9466    <https://github.com/OhGodAPet>
+ * Copyright 2016      Jay D Dee   <jayddee246@gmail.com>
+ * Copyright 2017      fireice-uk  <https://github.com/fireice-uk>
+ * Copyright 2017-2018 XMR-Stak    <https://github.com/fireice-uk>, <https://github.com/psychocrypt>
+ * Copyright 2018      Lee Clagett <https://github.com/vtnerd>
+ * Copyright 2018-2019 SChernykh   <https://github.com/SChernykh>
+ * Copyright 2016-2019 XMRig       <https://github.com/xmrig>, <support@xmrig.com>
+ *
+ *   This program is free software: you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation, either version 3 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <x86intrin.h>
+#include <string.h>
+
+#include "crypto/c_keccak.h"
+#include "cryptonight.h"
+#include "cryptonight_monero.h"
+#include "cryptonight_softaes.h"
+
+
+#ifndef XMRIG_NO_ASM
+void v4_soft_aes_compile_code(const struct V4_Instruction* code, int code_size, void* machine_code, enum Assembly ASM);
+#endif
+
+
+/*
+ * CryptoNight-R, single hash, software AES.
+ *
+ * Two build flavours:
+ *  - XMRIG_NO_ASM not defined: the main loop is a routine generated by
+ *    v4_soft_aes_compile_code() (flavour ASM_NONE), cached per block height.
+ *  - XMRIG_NO_ASM defined: the portable C loop below, using soft_aesenc().
+ *
+ * input  - message to hash, `size` bytes.
+ * output - receives the digest written by the selected extra_hashes[] entry.
+ * ctx    - ctx[0] provides state, scratchpad and height.
+ */
+void cryptonight_r_av3(const uint8_t *restrict input, size_t size, uint8_t *restrict output, struct cryptonight_ctx **restrict ctx)
+{
+    keccak(input, size, ctx[0]->state, 200);
+    cn_explode_scratchpad((__m128i*) ctx[0]->state, (__m128i*) ctx[0]->memory);
+
+#   ifndef XMRIG_NO_ASM
+    /* Regenerate the main loop only when the height changed since the last build. */
+    if (ctx[0]->generated_code_height != ctx[0]->height) {
+        struct V4_Instruction code[256];
+        const int code_size = v4_random_math_init(code, ctx[0]->height);
+
+        v4_soft_aes_compile_code(code, code_size, (void*)(ctx[0]->generated_code), ASM_NONE);
+        ctx[0]->generated_code_height = ctx[0]->height;
+    }
+
+    /* Publish the software AES table through the context (presumably read by the generated code - confirm). */
+    ctx[0]->saes_table = (const uint32_t*)saes_table;
+    ctx[0]->generated_code(ctx[0]);
+#   else
+    const uint8_t* l0 = ctx[0]->memory;
+    uint64_t* h0 = (uint64_t*) ctx[0]->state;
+
+    VARIANT2_INIT(0);
+    VARIANT2_SET_ROUNDING_MODE();
+    VARIANT4_RANDOM_MATH_INIT(0);
+
+    /* Initial a = state[0..1] ^ state[4..5]; b0/b1 come from words 2,3,6..11. */
+    uint64_t al0 = h0[0] ^ h0[4];
+    uint64_t ah0 = h0[1] ^ h0[5];
+    __m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]);
+    __m128i bx1 = _mm_set_epi64x(h0[9] ^ h0[11], h0[8] ^ h0[10]);
+
+    uint64_t idx0 = al0;
+
+    /* 0x80000 iterations; the 0x1FFFF0 mask yields 16-byte aligned offsets below 0x200000. */
+    for (size_t i = 0; __builtin_expect(i < 0x80000, 1); i++) {
+        __m128i cx        = _mm_load_si128((__m128i *) &l0[idx0 & 0x1FFFF0]);
+        const __m128i ax0 = _mm_set_epi64x(ah0, al0);
+
+        /* Software AES round, keyed with a. */
+        cx = soft_aesenc(cx, ax0);
+
+        VARIANT4_SHUFFLE(l0, idx0 & 0x1FFFF0, ax0, bx0, bx1, cx);
+        _mm_store_si128((__m128i *) &l0[idx0 & 0x1FFFF0], _mm_xor_si128(bx0, cx));
+
+        /* The next address is taken from the low 64 bits of c. */
+        idx0 = _mm_cvtsi128_si64(cx);
+
+        uint64_t hi, lo, cl, ch;
+        cl = ((uint64_t*) &l0[idx0 & 0x1FFFF0])[0];
+        ch = ((uint64_t*) &l0[idx0 & 0x1FFFF0])[1];
+
+        /* Random math modifies cl and r0[]; its outputs r0[0..3] are folded into a. */
+        VARIANT4_RANDOM_MATH(0, al0, ah0, cl, bx0, bx1);
+        al0 ^= r0[2] | ((uint64_t)(r0[3]) << 32);
+        ah0 ^= r0[0] | ((uint64_t)(r0[1]) << 32);
+
+        /* 64x64 -> 128-bit product: low half returned, high half stored in hi. */
+        lo = _umul128(idx0, cl, &hi);
+        VARIANT4_SHUFFLE(l0, idx0 & 0x1FFFF0, ax0, bx0, bx1, cx);
+
+        al0 += hi;
+        ah0 += lo;
+
+        ((uint64_t*)&l0[idx0 & 0x1FFFF0])[0] = al0;
+        ((uint64_t*)&l0[idx0 & 0x1FFFF0])[1] = ah0;
+
+        al0 ^= cl;
+        ah0 ^= ch;
+        idx0 = al0;
+
+        /* Shift the b history: the previous b0 becomes b1, c becomes b0. */
+        bx1 = bx0;
+        bx0 = cx;
+    }
+#   endif
+
+    /* Fold the scratchpad back, permute and pick the final hash by state[0] & 3. */
+    cn_implode_scratchpad((__m128i*) ctx[0]->memory, (__m128i*) ctx[0]->state);
+    keccakf((uint64_t *) ctx[0]->state, 24);
+    extra_hashes[ctx[0]->state[0] & 3](ctx[0]->state, 200, output);
+}
--- a/algo/cryptonight/cryptonight_r_av4.c
+++ b/algo/cryptonight/cryptonight_r_av4.c
@ -0,0 +1,143 @@
+/* XMRig
+ * Copyright 2010      Jeff Garzik <jgarzik@pobox.com>
+ * Copyright 2012-2014 pooler      <pooler@litecoinpool.org>
+ * Copyright 2014      Lucas Jones <https://github.com/lucasjones>
+ * Copyright 2014-2016 Wolf9466    <https://github.com/OhGodAPet>
+ * Copyright 2016      Jay D Dee   <jayddee246@gmail.com>
+ * Copyright 2017      fireice-uk  <https://github.com/fireice-uk>
+ * Copyright 2017-2018 XMR-Stak    <https://github.com/fireice-uk>, <https://github.com/psychocrypt>
+ * Copyright 2018      Lee Clagett <https://github.com/vtnerd>
+ * Copyright 2018-2019 SChernykh   <https://github.com/SChernykh>
+ * Copyright 2016-2019 XMRig       <https://github.com/xmrig>, <support@xmrig.com>
+ *
+ *   This program is free software: you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation, either version 3 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <x86intrin.h>
+#include <string.h>
+
+#include "crypto/c_keccak.h"
+#include "cryptonight.h"
+#include "cryptonight_monero.h"
+#include "cryptonight_softaes.h"
+
+
+/*
+ * CryptoNight-R, two hashes per call, software AES (soft_aesenc) with the
+ * interpreted random-math program.
+ *
+ * input  - two messages of `size` bytes each, stored back to back
+ *          (the second one starts at input + size).
+ * output - first digest at output, second digest at output + 32.
+ * ctx    - ctx[0] and ctx[1] each provide state, scratchpad and height for
+ *          one lane.
+ *
+ * VARIANT2_INIT, VARIANT2_SET_ROUNDING_MODE and VARIANT4_SHUFFLE are macros
+ * defined outside this function; they rely on the local names used below.
+ */
+void cryptonight_r_av4(const uint8_t *restrict input, size_t size, uint8_t *restrict output, struct cryptonight_ctx **restrict ctx)
+{
+    keccak(input,        size, ctx[0]->state, 200);
+    keccak(input + size, size, ctx[1]->state, 200);
+
+    const uint8_t* l0 = ctx[0]->memory;
+    const uint8_t* l1 = ctx[1]->memory;
+    uint64_t* h0 = (uint64_t*) ctx[0]->state;
+    uint64_t* h1 = (uint64_t*) ctx[1]->state;
+
+    VARIANT2_INIT(0);
+    VARIANT2_INIT(1);
+    VARIANT2_SET_ROUNDING_MODE();
+    VARIANT4_RANDOM_MATH_INIT(0);
+    VARIANT4_RANDOM_MATH_INIT(1);
+
+    /* Expand each state into its scratchpad. */
+    cn_explode_scratchpad((__m128i*) h0, (__m128i*) l0);
+    cn_explode_scratchpad((__m128i*) h1, (__m128i*) l1);
+
+    /* Per-lane a = state[0..1] ^ state[4..5]. */
+    uint64_t al0 = h0[0] ^ h0[4];
+    uint64_t al1 = h1[0] ^ h1[4];
+    uint64_t ah0 = h0[1] ^ h0[5];
+    uint64_t ah1 = h1[1] ^ h1[5];
+
+    /* bxN0 / bxN1: current and previous b value of lane N. */
+    __m128i bx00 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]);
+    __m128i bx01 = _mm_set_epi64x(h0[9] ^ h0[11], h0[8] ^ h0[10]);
+    __m128i bx10 = _mm_set_epi64x(h1[3] ^ h1[7], h1[2] ^ h1[6]);
+    __m128i bx11 = _mm_set_epi64x(h1[9] ^ h1[11], h1[8] ^ h1[10]);
+
+    uint64_t idx0 = al0;
+    uint64_t idx1 = al1;
+
+    /* 0x80000 iterations; the 0x1FFFF0 mask yields 16-byte aligned offsets below 0x200000. */
+    for (size_t i = 0; __builtin_expect(i < 0x80000, 1); i++) {
+        __m128i cx0       = _mm_load_si128((__m128i *) &l0[idx0 & 0x1FFFF0]);
+        __m128i cx1       = _mm_load_si128((__m128i *) &l1[idx1 & 0x1FFFF0]);
+
+        const __m128i ax0 = _mm_set_epi64x(ah0, al0);
+        const __m128i ax1 = _mm_set_epi64x(ah1, al1);
+
+        /* One software AES round per lane, keyed with that lane's a. */
+        cx0 = soft_aesenc(cx0, ax0);
+        cx1 = soft_aesenc(cx1, ax1);
+
+        VARIANT4_SHUFFLE(l0, idx0 & 0x1FFFF0, ax0, bx00, bx01, cx0);
+        _mm_store_si128((__m128i *) &l0[idx0 & 0x1FFFF0], _mm_xor_si128(bx00, cx0));
+
+        VARIANT4_SHUFFLE(l1, idx1 & 0x1FFFF0, ax1, bx10, bx11, cx1);
+        _mm_store_si128((__m128i *) &l1[idx1 & 0x1FFFF0], _mm_xor_si128(bx10, cx1));
+
+        /* The next addresses come from the low 64 bits of each c. */
+        idx0 = _mm_cvtsi128_si64(cx0);
+        idx1 = _mm_cvtsi128_si64(cx1);
+
+        /* ---- second half, lane 0 (hi/lo/cl/ch are reused for lane 1 below) ---- */
+        uint64_t hi, lo, cl, ch;
+        cl = ((uint64_t*) &l0[idx0 & 0x1FFFF0])[0];
+        ch = ((uint64_t*) &l0[idx0 & 0x1FFFF0])[1];
+
+        VARIANT4_RANDOM_MATH(0, al0, ah0, cl, bx00, bx01);
+        al0 ^= r0[2] | ((uint64_t)(r0[3]) << 32);
+        ah0 ^= r0[0] | ((uint64_t)(r0[1]) << 32);
+
+        /* 64x64 -> 128-bit product: low half returned, high half stored in hi. */
+        lo = _umul128(idx0, cl, &hi);
+        VARIANT4_SHUFFLE(l0, idx0 & 0x1FFFF0, ax0, bx00, bx01, cx0);
+
+        al0 += hi;
+        ah0 += lo;
+
+        ((uint64_t*)&l0[idx0 & 0x1FFFF0])[0] = al0;
+        ((uint64_t*)&l0[idx0 & 0x1FFFF0])[1] = ah0;
+
+        al0 ^= cl;
+        ah0 ^= ch;
+        idx0 = al0;
+
+        /* ---- second half, lane 1 ---- */
+        cl = ((uint64_t*) &l1[idx1 & 0x1FFFF0])[0];
+        ch = ((uint64_t*) &l1[idx1 & 0x1FFFF0])[1];
+
+        VARIANT4_RANDOM_MATH(1, al1, ah1, cl, bx10, bx11);
+        al1 ^= r1[2] | ((uint64_t)(r1[3]) << 32);
+        ah1 ^= r1[0] | ((uint64_t)(r1[1]) << 32);
+
+        lo = _umul128(idx1, cl, &hi);
+        VARIANT4_SHUFFLE(l1, idx1 & 0x1FFFF0, ax1, bx10, bx11, cx1);
+
+        al1 += hi;
+        ah1 += lo;
+
+        ((uint64_t*)&l1[idx1 & 0x1FFFF0])[0] = al1;
+        ((uint64_t*)&l1[idx1 & 0x1FFFF0])[1] = ah1;
+
+        al1 ^= cl;
+        ah1 ^= ch;
+        idx1 = al1;
+
+        /* Shift the b history of both lanes. */
+        bx01 = bx00;
+        bx11 = bx10;
+
+        bx00 = cx0;
+        bx10 = cx1;
+    }
+
+    /* Fold each scratchpad back into its state and finalize. */
+    cn_implode_scratchpad((__m128i*) l0, (__m128i*) h0);
+    cn_implode_scratchpad((__m128i*) l1, (__m128i*) h1);
+
+    keccakf(h0, 24);
+    keccakf(h1, 24);
+
+    /* The low two bits of each first state byte select that lane's final hash. */
+    extra_hashes[ctx[0]->state[0] & 3](ctx[0]->state, 200, output);
+    extra_hashes[ctx[1]->state[0] & 3](ctx[1]->state, 200, output + 32);
+}
--- a/algo/cryptonight/cryptonight_softaes.h
+++ b/algo/cryptonight/cryptonight_softaes.h
@ -4,9 +4,9 @@
 * Copyright 2014      Lucas Jones <https://github.com/lucasjones>
 * Copyright 2014-2016 Wolf9466    <https://github.com/OhGodAPet>
 * Copyright 2016      Jay D Dee   <jayddee246@gmail.com>
- * Copyright 2017      fireice-uk  <https://github.com/fireice-uk>
- * Copyright 2016-2017 XMRig       <support@xmrig.com>
- *
+ * Copyright 2017-2018 XMR-Stak    <https://github.com/fireice-uk>, <https://github.com/psychocrypt>
+ * Copyright 2018-2019 SChernykh   <https://github.com/SChernykh>
+ * Copyright 2016-2019 XMRig       <https://github.com/xmrig>, <support@xmrig.com>
 *
 *   This program is free software: you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
@ -22,13 +22,15 @@
 *   along with this program. If not, see <http://www.gnu.org/licenses/>.
 */

-#ifndef __CRYPTONIGHT_SOFTAES_H__
-#define __CRYPTONIGHT_SOFTAES_H__
+#ifndef XMRIG_CRYPTONIGHT_SOFTAES_H
+#define XMRIG_CRYPTONIGHT_SOFTAES_H
+

 #include <x86intrin.h>
+#include <stdint.h>
+

-extern __m128i soft_aesenc(__m128i in, __m128i key);
-extern __m128i soft_aeskeygenassist(__m128i key, uint8_t rcon);
+#include "crypto/soft_aes.h"


 // This will shift and xor tmp1 into itself as 4 32-bit vals such as
@ -234,4 +236,20 @@ inline uint64_t _umul128(uint64_t multiplier, uint64_t multiplicand, uint64_t *p
 #endif


-#endif /* __CRYPTONIGHT_SOFTAES_H__ */
+/*
+ * Stores a 128-bit value into mem_out[0..1], applying a table-driven tweak
+ * to the high quadword.
+ *
+ * mem_out[0] receives the low quadword of tmp unchanged. For the high
+ * quadword, bits 24..31 select a 2-bit entry of the packed table 0x7531,
+ * which is XORed into bits 28..29 before the quadword is stored in mem_out[1].
+ */
+static inline void cryptonight_monero_tweak(uint64_t* mem_out, __m128i tmp)
+{
+    /* Eight 2-bit entries packed into one 16-bit word. */
+    static const uint16_t table = 0x7531;
+
+    mem_out[0] = EXTRACT64(tmp);
+
+    /* Bring the upper quadword down to the low lane so it can be extracted. */
+    const __m128i upper = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
+    uint64_t vh = EXTRACT64(upper);
+
+    /* Byte 3 of the high quadword; its bits 0, 4 and 5 form the table index (times two). */
+    const uint8_t x     = vh >> 24;
+    const uint8_t index = (((x >> 3) & 6) | (x & 1)) << 1;
+
+    vh ^= ((table >> index) & 0x3) << 28;
+    mem_out[1] = vh;
+}
+
+
+#endif /* XMRIG_CRYPTONIGHT_SOFTAES_H */
--- a/algo/cryptonight/cryptonight_test.h
+++ b/algo/cryptonight/cryptonight_test.h
@ -0,0 +1,129 @@
+/* XMRig
+ * Copyright 2010      Jeff Garzik <jgarzik@pobox.com>
+ * Copyright 2012-2014 pooler      <pooler@litecoinpool.org>
+ * Copyright 2014      Lucas Jones <https://github.com/lucasjones>
+ * Copyright 2014-2016 Wolf9466    <https://github.com/OhGodAPet>
+ * Copyright 2016      Jay D Dee   <jayddee246@gmail.com>
+ * Copyright 2017-2018 XMR-Stak    <https://github.com/fireice-uk>, <https://github.com/psychocrypt>
+ * Copyright 2018      Lee Clagett <https://github.com/vtnerd>
+ * Copyright 2018-2019 SChernykh   <https://github.com/SChernykh>
+ * Copyright 2016-2019 XMRig       <https://github.com/xmrig>, <support@xmrig.com>
+ *
+ *   This program is free software: you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation, either version 3 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef XMRIG_CRYPTONIGHT_TEST_H
+#define XMRIG_CRYPTONIGHT_TEST_H
+
+
+#include <stdint.h>
+
+
+/*
+ * Self-test input: 152 bytes laid out as two 76-byte blobs
+ * (presumably one per lane of the double-hash variants - confirm against the
+ * self-test code).
+ */
+static const uint8_t test_input[152] = {
+    0x03, 0x05, 0xA0, 0xDB, 0xD6, 0xBF, 0x05, 0xCF, 0x16, 0xE5, 0x03, 0xF3, 0xA6, 0x6F, 0x78, 0x00,
+    0x7C, 0xBF, 0x34, 0x14, 0x43, 0x32, 0xEC, 0xBF, 0xC2, 0x2E, 0xD9, 0x5C, 0x87, 0x00, 0x38, 0x3B,
+    0x30, 0x9A, 0xCE, 0x19, 0x23, 0xA0, 0x96, 0x4B, 0x00, 0x00, 0x00, 0x08, 0xBA, 0x93, 0x9A, 0x62,
+    0x72, 0x4C, 0x0D, 0x75, 0x81, 0xFC, 0xE5, 0x76, 0x1E, 0x9D, 0x8A, 0x0E, 0x6A, 0x1C, 0x3F, 0x92,
+    0x4F, 0xDD, 0x84, 0x93, 0xD1, 0x11, 0x56, 0x49, 0xC0, 0x5E, 0xB6, 0x01,
+    0x01, 0x00, 0xFB, 0x8E, 0x8A, 0xC8, 0x05, 0x89, 0x93, 0x23, 0x37, 0x1B, 0xB7, 0x90, 0xDB, 0x19,
+    0x21, 0x8A, 0xFD, 0x8D, 0xB8, 0xE3, 0x75, 0x5D, 0x8B, 0x90, 0xF3, 0x9B, 0x3D, 0x55, 0x06, 0xA9,
+    0xAB, 0xCE, 0x4F, 0xA9, 0x12, 0x24, 0x45, 0x00, 0x00, 0x00, 0x00, 0xEE, 0x81, 0x46, 0xD4, 0x9F,
+    0xA9, 0x3E, 0xE7, 0x24, 0xDE, 0xB5, 0x7D, 0x12, 0xCB, 0xC6, 0xC6, 0xF3, 0xB9, 0x24, 0xD9, 0x46,
+    0x12, 0x7C, 0x7A, 0x97, 0x41, 0x8F, 0x93, 0x48, 0x82, 0x8F, 0x0F, 0x02
+};
+
+
+/* Expected digests for the original CryptoNight (variant 0): 64 bytes, i.e. two 32-byte hashes. */
+static const uint8_t test_output_v0[64] = {
+    0x1A, 0x3F, 0xFB, 0xEE, 0x90, 0x9B, 0x42, 0x0D, 0x91, 0xF7, 0xBE, 0x6E, 0x5F, 0xB5, 0x6D, 0xB7,
+    0x1B, 0x31, 0x10, 0xD8, 0x86, 0x01, 0x1E, 0x87, 0x7E, 0xE5, 0x78, 0x6A, 0xFD, 0x08, 0x01, 0x00,
+    0x1B, 0x60, 0x6A, 0x3F, 0x4A, 0x07, 0xD6, 0x48, 0x9A, 0x1B, 0xCD, 0x07, 0x69, 0x7B, 0xD1, 0x66,
+    0x96, 0xB6, 0x1C, 0x8A, 0xE9, 0x82, 0xF6, 0x1A, 0x90, 0x16, 0x0F, 0x4E, 0x52, 0x82, 0x8A, 0x7F
+};
+
+
+// Cryptonight variant 1 (Monero v7)
+/* Expected digests, 64 bytes (two 32-byte hashes). */
+static const uint8_t test_output_v1[64] = {
+    0xF2, 0x2D, 0x3D, 0x62, 0x03, 0xD2, 0xA0, 0x8B, 0x41, 0xD9, 0x02, 0x72, 0x78, 0xD8, 0xBC, 0xC9,
+    0x83, 0xAC, 0xAD, 0xA9, 0xB6, 0x8E, 0x52, 0xE3, 0xC6, 0x89, 0x69, 0x2A, 0x50, 0xE9, 0x21, 0xD9,
+    0xC9, 0xFA, 0xE8, 0x42, 0x5D, 0x86, 0x88, 0xDC, 0x23, 0x6B, 0xCD, 0xBC, 0x42, 0xFD, 0xB4, 0x2D,
+    0x37, 0x6C, 0x6E, 0xC1, 0x90, 0x50, 0x1A, 0xA8, 0x4B, 0x04, 0xA4, 0xB4, 0xCF, 0x1E, 0xE1, 0x22
+};
+
+
+// Cryptonight variant 2 (Monero v8)
+/* Expected digests, 64 bytes (two 32-byte hashes). */
+static const uint8_t test_output_v2[64] = {
+    0x97, 0x37, 0x82, 0x82, 0xCF, 0x10, 0xE7, 0xAD, 0x03, 0x3F, 0x7B, 0x80, 0x74, 0xC4, 0x0E, 0x14,
+    0xD0, 0x6E, 0x7F, 0x60, 0x9D, 0xDD, 0xDA, 0x78, 0x76, 0x80, 0xB5, 0x8C, 0x05, 0xF4, 0x3D, 0x21,
+    0x87, 0x1F, 0xCD, 0x68, 0x23, 0xF6, 0xA8, 0x79, 0xBB, 0x3F, 0x33, 0x95, 0x1C, 0x8E, 0x8E, 0x89,
+    0x1D, 0x40, 0x43, 0x88, 0x0B, 0x02, 0xDF, 0xA1, 0xBB, 0x3B, 0xE4, 0x98, 0xB5, 0x0E, 0x75, 0x78
+};
+
+
+/* One input record for the "cn/r" self test. */
+struct cn_r_test_input_data
+{
+    uint64_t height;    /* block height associated with this message */
+    size_t size;        /* number of meaningful bytes in data[] (presumably the hashed length - confirm) */
+    uint8_t data[64];   /* message bytes; unused tail bytes are zero-initialized */
+};
+
+
+/*
+ * "cn/r" self-test inputs: ten ASCII messages, one per consecutive height
+ * 1806260..1806269. The second field is the message length in bytes.
+ */
+static const struct cn_r_test_input_data cn_r_test_input[] = {
+    { 1806260, 44, { 0x54, 0x68, 0x69, 0x73, 0x20, 0x69, 0x73, 0x20, 0x61, 0x20, 0x74, 0x65, 0x73, 0x74, 0x20, 0x54, 0x68, 0x69, 0x73, 0x20, 0x69, 0x73, 0x20, 0x61, 0x20, 0x74, 0x65, 0x73, 0x74, 0x20, 0x54, 0x68, 0x69, 0x73, 0x20, 0x69, 0x73, 0x20, 0x61, 0x20, 0x74, 0x65, 0x73, 0x74 } },
+    { 1806261, 50, { 0x4c, 0x6f, 0x72, 0x65, 0x6d, 0x20, 0x69, 0x70, 0x73, 0x75, 0x6d, 0x20, 0x64, 0x6f, 0x6c, 0x6f, 0x72, 0x20, 0x73, 0x69, 0x74, 0x20, 0x61, 0x6d, 0x65, 0x74, 0x2c, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x65, 0x63, 0x74, 0x65, 0x74, 0x75, 0x72, 0x20, 0x61, 0x64, 0x69, 0x70, 0x69, 0x73, 0x63, 0x69, 0x6e, 0x67 } },
+    { 1806262, 48, { 0x65, 0x6c, 0x69, 0x74, 0x2c, 0x20, 0x73, 0x65, 0x64, 0x20, 0x64, 0x6f, 0x20, 0x65, 0x69, 0x75, 0x73, 0x6d, 0x6f, 0x64, 0x20, 0x74, 0x65, 0x6d, 0x70, 0x6f, 0x72, 0x20, 0x69, 0x6e, 0x63, 0x69, 0x64, 0x69, 0x64, 0x75, 0x6e, 0x74, 0x20, 0x75, 0x74, 0x20, 0x6c, 0x61, 0x62, 0x6f, 0x72, 0x65 } },
+    { 1806263, 48, { 0x65, 0x74, 0x20, 0x64, 0x6f, 0x6c, 0x6f, 0x72, 0x65, 0x20, 0x6d, 0x61, 0x67, 0x6e, 0x61, 0x20, 0x61, 0x6c, 0x69, 0x71, 0x75, 0x61, 0x2e, 0x20, 0x55, 0x74, 0x20, 0x65, 0x6e, 0x69, 0x6d, 0x20, 0x61, 0x64, 0x20, 0x6d, 0x69, 0x6e, 0x69, 0x6d, 0x20, 0x76, 0x65, 0x6e, 0x69, 0x61, 0x6d, 0x2c } },
+    { 1806264, 46, { 0x71, 0x75, 0x69, 0x73, 0x20, 0x6e, 0x6f, 0x73, 0x74, 0x72, 0x75, 0x64, 0x20, 0x65, 0x78, 0x65, 0x72, 0x63, 0x69, 0x74, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x75, 0x6c, 0x6c, 0x61, 0x6d, 0x63, 0x6f, 0x20, 0x6c, 0x61, 0x62, 0x6f, 0x72, 0x69, 0x73, 0x20, 0x6e, 0x69, 0x73, 0x69 } },
+    { 1806265, 45, { 0x75, 0x74, 0x20, 0x61, 0x6c, 0x69, 0x71, 0x75, 0x69, 0x70, 0x20, 0x65, 0x78, 0x20, 0x65, 0x61, 0x20, 0x63, 0x6f, 0x6d, 0x6d, 0x6f, 0x64, 0x6f, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x65, 0x71, 0x75, 0x61, 0x74, 0x2e, 0x20, 0x44, 0x75, 0x69, 0x73, 0x20, 0x61, 0x75, 0x74, 0x65 } },
+    { 1806266, 47, { 0x69, 0x72, 0x75, 0x72, 0x65, 0x20, 0x64, 0x6f, 0x6c, 0x6f, 0x72, 0x20, 0x69, 0x6e, 0x20, 0x72, 0x65, 0x70, 0x72, 0x65, 0x68, 0x65, 0x6e, 0x64, 0x65, 0x72, 0x69, 0x74, 0x20, 0x69, 0x6e, 0x20, 0x76, 0x6f, 0x6c, 0x75, 0x70, 0x74, 0x61, 0x74, 0x65, 0x20, 0x76, 0x65, 0x6c, 0x69, 0x74 } },
+    { 1806267, 44, { 0x65, 0x73, 0x73, 0x65, 0x20, 0x63, 0x69, 0x6c, 0x6c, 0x75, 0x6d, 0x20, 0x64, 0x6f, 0x6c, 0x6f, 0x72, 0x65, 0x20, 0x65, 0x75, 0x20, 0x66, 0x75, 0x67, 0x69, 0x61, 0x74, 0x20, 0x6e, 0x75, 0x6c, 0x6c, 0x61, 0x20, 0x70, 0x61, 0x72, 0x69, 0x61, 0x74, 0x75, 0x72, 0x2e } },
+    { 1806268, 47, { 0x45, 0x78, 0x63, 0x65, 0x70, 0x74, 0x65, 0x75, 0x72, 0x20, 0x73, 0x69, 0x6e, 0x74, 0x20, 0x6f, 0x63, 0x63, 0x61, 0x65, 0x63, 0x61, 0x74, 0x20, 0x63, 0x75, 0x70, 0x69, 0x64, 0x61, 0x74, 0x61, 0x74, 0x20, 0x6e, 0x6f, 0x6e, 0x20, 0x70, 0x72, 0x6f, 0x69, 0x64, 0x65, 0x6e, 0x74, 0x2c } },
+    { 1806269, 62, { 0x73, 0x75, 0x6e, 0x74, 0x20, 0x69, 0x6e, 0x20, 0x63, 0x75, 0x6c, 0x70, 0x61, 0x20, 0x71, 0x75, 0x69, 0x20, 0x6f, 0x66, 0x66, 0x69, 0x63, 0x69, 0x61, 0x20, 0x64, 0x65, 0x73, 0x65, 0x72, 0x75, 0x6e, 0x74, 0x20, 0x6d, 0x6f, 0x6c, 0x6c, 0x69, 0x74, 0x20, 0x61, 0x6e, 0x69, 0x6d, 0x20, 0x69, 0x64, 0x20, 0x65, 0x73, 0x74, 0x20, 0x6c, 0x61, 0x62, 0x6f, 0x72, 0x75, 0x6d, 0x2e } },
+};
+
+
+// "cn/r"
+/*
+ * Expected "cn/r" digests: ten rows of 32 bytes (presumably row i belongs to
+ * cn_r_test_input[i] - confirm against the self-test code).
+ */
+static const uint8_t test_output_r[] = {
+    0xf7, 0x59, 0x58, 0x8a, 0xd5, 0x7e, 0x75, 0x84, 0x67, 0x29, 0x54, 0x43, 0xa9, 0xbd, 0x71, 0x49, 0x0a, 0xbf, 0xf8, 0xe9, 0xda, 0xd1, 0xb9, 0x5b, 0x6b, 0xf2, 0xf5, 0xd0, 0xd7, 0x83, 0x87, 0xbc,
+    0x5b, 0xb8, 0x33, 0xde, 0xca, 0x2b, 0xdd, 0x72, 0x52, 0xa9, 0xcc, 0xd7, 0xb4, 0xce, 0x0b, 0x6a, 0x48, 0x54, 0x51, 0x57, 0x94, 0xb5, 0x6c, 0x20, 0x72, 0x62, 0xf7, 0xa5, 0xb9, 0xbd, 0xb5, 0x66,
+    0x1e, 0xe6, 0x72, 0x8d, 0xa6, 0x0f, 0xbd, 0x8d, 0x7d, 0x55, 0xb2, 0xb1, 0xad, 0xe4, 0x87, 0xa3, 0xcf, 0x52, 0xa2, 0xc3, 0xac, 0x6f, 0x52, 0x0d, 0xb1, 0x2c, 0x27, 0xd8, 0x92, 0x1f, 0x6c, 0xab,
+    0x69, 0x69, 0xfe, 0x2d, 0xdf, 0xb7, 0x58, 0x43, 0x8d, 0x48, 0x04, 0x9f, 0x30, 0x2f, 0xc2, 0x10, 0x8a, 0x4f, 0xcc, 0x93, 0xe3, 0x76, 0x69, 0x17, 0x0e, 0x6d, 0xb4, 0xb0, 0xb9, 0xb4, 0xc4, 0xcb,
+    0x7f, 0x30, 0x48, 0xb4, 0xe9, 0x0d, 0x0c, 0xbe, 0x7a, 0x57, 0xc0, 0x39, 0x4f, 0x37, 0x33, 0x8a, 0x01, 0xfa, 0xe3, 0xad, 0xfd, 0xc0, 0xe5, 0x12, 0x6d, 0x86, 0x3a, 0x89, 0x5e, 0xb0, 0x4e, 0x02,
+    0x1d, 0x29, 0x04, 0x43, 0xa4, 0xb5, 0x42, 0xaf, 0x04, 0xa8, 0x2f, 0x6b, 0x24, 0x94, 0xa6, 0xee, 0x7f, 0x20, 0xf2, 0x75, 0x4c, 0x58, 0xe0, 0x84, 0x90, 0x32, 0x48, 0x3a, 0x56, 0xe8, 0xe2, 0xef,
+    0xc4, 0x3c, 0xc6, 0x56, 0x74, 0x36, 0xa8, 0x6a, 0xfb, 0xd6, 0xaa, 0x9e, 0xaa, 0x7c, 0x27, 0x6e, 0x98, 0x06, 0x83, 0x03, 0x34, 0xb6, 0x14, 0xb2, 0xbe, 0xe2, 0x3c, 0xc7, 0x66, 0x34, 0xf6, 0xfd,
+    0x87, 0xbe, 0x24, 0x79, 0xc0, 0xc4, 0xe8, 0xed, 0xfd, 0xfa, 0xa5, 0x60, 0x3e, 0x93, 0xf4, 0x26, 0x5b, 0x3f, 0x82, 0x24, 0xc1, 0xc5, 0x94, 0x6f, 0xeb, 0x42, 0x48, 0x19, 0xd1, 0x89, 0x90, 0xa4,
+    0xdd, 0x9d, 0x6a, 0x6d, 0x8e, 0x47, 0x46, 0x5c, 0xce, 0xac, 0x08, 0x77, 0xef, 0x88, 0x9b, 0x93, 0xe7, 0xeb, 0xa9, 0x79, 0x55, 0x7e, 0x39, 0x35, 0xd7, 0xf8, 0x6d, 0xce, 0x11, 0xb0, 0x70, 0xf3,
+    0x75, 0xc6, 0xf2, 0xae, 0x49, 0xa2, 0x05, 0x21, 0xde, 0x97, 0x28, 0x5b, 0x43, 0x1e, 0x71, 0x71, 0x25, 0x84, 0x7f, 0xb8, 0x93, 0x5e, 0xd8, 0x4a, 0x61, 0xe7, 0xf8, 0xd3, 0x6a, 0x2c, 0x3d, 0x8e,
+};
+
+
+#ifndef XMRIG_NO_AEON
+/* Expected CryptoNight-Lite (variant 0) digests, 64 bytes (two 32-byte hashes). */
+static const uint8_t test_output_v0_lite[64] = {
+    0x36, 0x95, 0xB4, 0xB5, 0x3B, 0xB0, 0x03, 0x58, 0xB0, 0xAD, 0x38, 0xDC, 0x16, 0x0F, 0xEB, 0x9E,
+    0x00, 0x4E, 0xEC, 0xE0, 0x9B, 0x83, 0xA7, 0x2E, 0xF6, 0xBA, 0x98, 0x64, 0xD3, 0x51, 0x0C, 0x88,
+    0x28, 0xA2, 0x2B, 0xAD, 0x3F, 0x93, 0xD1, 0x40, 0x8F, 0xCA, 0x47, 0x2E, 0xB5, 0xAD, 0x1C, 0xBE,
+    0x75, 0xF2, 0x1D, 0x05, 0x3C, 0x8C, 0xE5, 0xB3, 0xAF, 0x10, 0x5A, 0x57, 0x71, 0x3E, 0x21, 0xDD
+};
+
+
+// AEON v7
+/* Expected CryptoNight-Lite variant 1 digests, 64 bytes (two 32-byte hashes). */
+static const uint8_t test_output_v1_lite[64] = {
+    0x6D, 0x8C, 0xDC, 0x44, 0x4E, 0x9B, 0xBB, 0xFD, 0x68, 0xFC, 0x43, 0xFC, 0xD4, 0x85, 0x5B, 0x22,
+    0x8C, 0x8A, 0x1B, 0xD9, 0x1D, 0x9D, 0x00, 0x28, 0x5B, 0xEC, 0x02, 0xB7, 0xCA, 0x2D, 0x67, 0x41,
+    0x87, 0xC4, 0xE5, 0x70, 0x65, 0x3E, 0xB4, 0xC2, 0xB4, 0x2B, 0x7A, 0x0D, 0x54, 0x65, 0x59, 0x45,
+    0x2D, 0xFA, 0xB5, 0x73, 0xB8, 0x2E, 0xC5, 0x2F, 0x15, 0x2B, 0x7F, 0xF9, 0x8E, 0x79, 0x44, 0x6F
+};
+#endif
+
+
+#endif /* XMRIG_CRYPTONIGHT_TEST_H */
--- a/algo/cryptonight/variant4_random_math.h
+++ b/algo/cryptonight/variant4_random_math.h
@ -0,0 +1,449 @@
+#ifndef VARIANT4_RANDOM_MATH_H
+#define VARIANT4_RANDOM_MATH_H
+
+
+#include <stdint.h>
+#include <string.h>
+#include <stdbool.h>
+
+
+#include "crypto/c_blake256.h"
+
+
+// Tunable limits for the random program generator (v4_random_math_init).
+// Latencies are counted in cycles of the abstract CPU model used by the generator.
+enum V4_Settings
+{
+    // Generate code with minimal theoretical latency = 45 cycles, which is equivalent to 15 multiplications
+    TOTAL_LATENCY = 15 * 3,
+    
+    // Always generate at least 60 instructions
+    NUM_INSTRUCTIONS_MIN = 60,
+
+    // Never generate more than 70 instructions (final RET instruction doesn't count here)
+    NUM_INSTRUCTIONS_MAX = 70,
+
+    // Available ALUs for MUL
+    // Modern CPUs typically have only 1 ALU which can do multiplications
+    ALU_COUNT_MUL = 1,
+
+    // Total available ALUs
+    // Modern CPUs have 4 ALUs, but we use only 3 because random math executes together with other main loop code
+    ALU_COUNT = 3,
+};
+
+// Opcodes understood by the random math interpreter (v4_random_math).
+// In the comments below "a" is the destination register and "b" is the source register.
+enum V4_InstructionList
+{
+    MUL,    // a*b
+    ADD,    // a+b + C, C is an unsigned 32-bit constant
+    SUB,    // a-b
+    ROR,    // rotate right "a" by "b & 31" bits
+    ROL,    // rotate left "a" by "b & 31" bits
+    XOR,    // a^b
+    RET,    // finish execution
+    // Number of real opcodes (RET excluded); used as the size of the per-opcode tables in v4_random_math_init
+    V4_INSTRUCTION_COUNT = RET,
+};
+
+// V4_InstructionDefinition is used to generate code from random data
+// Every random sequence of bytes is a valid code
+//
+// One random byte is split into three bit fields (low to high): opcode, dst_index, src_index
+//
+// There are 9 registers in total:
+// - 4 variable registers
+// - 5 constant registers initialized from loop variables
+// This is why dst_index is 2 bits: only the 4 variable registers can be written
+// src_index is 3 bits, so a random byte can only select registers 0-7 as source;
+// register 8 is substituted by the generator when ADD/SUB/XOR would use the same register twice
+enum V4_InstructionDefinition
+{
+    V4_OPCODE_BITS = 3,
+    V4_DST_INDEX_BITS = 2,
+    V4_SRC_INDEX_BITS = 3,
+};
+
+// One decoded instruction of the generated random program
+struct V4_Instruction
+{
+    uint8_t opcode;     // one of V4_InstructionList
+    uint8_t dst_index;  // index of the register that is read and overwritten
+    uint8_t src_index;  // index of the register that is only read
+    uint32_t C;         // constant added by ADD; the generator stores 0 for every other opcode
+};
+
+// Force inlining of small helpers; falls back to plain "inline" on unknown compilers
+#ifndef FORCEINLINE
+#if defined(__GNUC__)
+#define FORCEINLINE __attribute__((always_inline)) inline
+#elif defined(_MSC_VER)
+#define FORCEINLINE __forceinline
+#else
+#define FORCEINLINE inline
+#endif
+#endif
+
+// Optimizer hint for code paths that can never execute; expands to nothing on unknown compilers
+#ifndef UNREACHABLE_CODE
+#if defined(__GNUC__)
+#define UNREACHABLE_CODE __builtin_unreachable()
+#elif defined(_MSC_VER)
+#define UNREACHABLE_CODE __assume(false)
+#else
+#define UNREACHABLE_CODE
+#endif
+#endif
+
+// Little-endian conversion helpers. They are identity macros here, so the code assumes a little-endian host.
+// The argument is parenthesized so that the macros stay safe when used inside larger expressions.
+#define SWAP32LE(x) (x)
+#define SWAP64LE(x) (x)
+
+// Hashes "length" bytes of "data" with blake256 and writes the digest to "hash"
+#define hash_extra_blake(data, length, hash) blake256_hash((uint8_t*)(hash), (uint8_t*)(data), (length))
+
+// Random math interpreter's loop is fully unrolled and inlined to achieve 100% branch prediction on CPU:
+// every switch-case will point to the same destination on every iteration of Cryptonight main loop
+//
+// This is about as fast as it can get without using low-level machine code generation
+//
+// Executes the program in "code" on the register file "r" (modified in place).
+// - code: instruction array; execution stops at the first RET or after 70 instructions, whichever comes first,
+//         so the array must be RET-terminated or hold at least 70 entries
+// - r:    9 registers of 32 bits each; every instruction reads r[src_index] and updates r[dst_index]
+// All arithmetic is unsigned 32-bit and wraps around.
+static void v4_random_math(const struct V4_Instruction* code, uint32_t r[9])
+{
+#define REG_BITS 32
+#define V4_EXEC(i) \
+    { \
+        const struct V4_Instruction* op = code + i; \
+        const uint32_t src = r[op->src_index]; \
+        uint32_t *dst = r + op->dst_index; \
+        switch (op->opcode) \
+        { \
+        case MUL: \
+            *dst *= src; \
+            break; \
+        case ADD: \
+            *dst += src + op->C; \
+            break; \
+        case SUB: \
+            *dst -= src; \
+            break; \
+        case ROR: \
+            { \
+                const uint32_t shift = src % REG_BITS; \
+                *dst = (*dst >> shift) | (*dst << ((REG_BITS - shift) % REG_BITS)); \
+            } \
+            break; \
+        case ROL: \
+            { \
+                const uint32_t shift = src % REG_BITS; \
+                *dst = (*dst << shift) | (*dst >> ((REG_BITS - shift) % REG_BITS)); \
+            } \
+            break; \
+        case XOR: \
+            *dst ^= src; \
+            break; \
+        case RET: \
+            return; \
+        default: \
+            UNREACHABLE_CODE; \
+            break; \
+        } \
+    }
+
+// Expands to ten consecutive V4_EXEC steps starting at instruction index j
+#define V4_EXEC_10(j) \
+    V4_EXEC(j + 0) \
+    V4_EXEC(j + 1) \
+    V4_EXEC(j + 2) \
+    V4_EXEC(j + 3) \
+    V4_EXEC(j + 4) \
+    V4_EXEC(j + 5) \
+    V4_EXEC(j + 6) \
+    V4_EXEC(j + 7) \
+    V4_EXEC(j + 8) \
+    V4_EXEC(j + 9)
+
+    // Generated program can have 60 + a few more (usually 2-3) instructions to achieve required latency
+    // I've checked all block heights < 10,000,000 and here is the distribution of program sizes:
+    //
+    // 60      27960
+    // 61      105054
+    // 62      2452759
+    // 63      5115997
+    // 64      1022269
+    // 65      1109635
+    // 66      153145
+    // 67      8550
+    // 68      4529
+    // 69      102
+
+    // Unroll 70 instructions here
+    V4_EXEC_10(0);      // instructions 0-9
+    V4_EXEC_10(10);     // instructions 10-19
+    V4_EXEC_10(20);     // instructions 20-29
+    V4_EXEC_10(30);     // instructions 30-39
+    V4_EXEC_10(40);     // instructions 40-49
+    V4_EXEC_10(50);     // instructions 50-59
+    V4_EXEC_10(60);     // instructions 60-69
+
+#undef V4_EXEC_10
+#undef V4_EXEC
+#undef REG_BITS
+}
+
+// Makes sure that "bytes_needed" bytes can be read from "data" starting at "*data_index".
+// When the request would run past "data_size", the whole buffer is replaced in place by its own
+// blake256 digest and the read position is rewound to the start of the buffer.
+static FORCEINLINE void check_data(size_t* data_index, const size_t bytes_needed, int8_t* data, const size_t data_size)
+{
+    // Enough random bytes are still left: nothing to do
+    if (*data_index + bytes_needed <= data_size)
+    {
+        return;
+    }
+
+    hash_extra_blake(data, data_size, (char*) data);
+    *data_index = 0;
+}
+
+// Generates as many random math operations as possible with given latency and ALU restrictions
+// "code" array must have space for NUM_INSTRUCTIONS_MAX+1 instructions
+//
+// The program is derived deterministically from "height": the same height always yields the same code.
+// - code:   output array, filled with the generated instructions followed by a terminating RET
+// - height: block height used to seed the random data
+// Returns the number of generated instructions, not counting the final RET.
+static int v4_random_math_init(struct V4_Instruction* code, const uint64_t height)
+{
+    // MUL is 3 cycles, 3-way addition and rotations are 2 cycles, SUB/XOR are 1 cycle
+    // These latencies match real-life instruction latencies for Intel CPUs starting from Sandy Bridge and up to Skylake/Coffee lake
+    //
+    // AMD Ryzen has the same latencies except 1-cycle ROR/ROL, so it'll be a bit faster than Intel Sandy Bridge and newer processors
+    // Surprisingly, Intel Nehalem also has 1-cycle ROR/ROL, so it'll also be faster than Intel Sandy Bridge and newer processors
+    // AMD Bulldozer has 4 cycles latency for MUL (slower than Intel) and 1 cycle for ROR/ROL (faster than Intel), so average performance will be the same
+    // Source: https://www.agner.org/optimize/instruction_tables.pdf
+    const int op_latency[V4_INSTRUCTION_COUNT] = { 3, 2, 1, 2, 2, 1 };
+
+    // Instruction latencies for theoretical ASIC implementation
+    const int asic_op_latency[V4_INSTRUCTION_COUNT] = { 3, 1, 1, 1, 1, 1 };
+
+    // Available ALUs for each instruction
+    const int op_ALUs[V4_INSTRUCTION_COUNT] = { ALU_COUNT_MUL, ALU_COUNT, ALU_COUNT, ALU_COUNT, ALU_COUNT, ALU_COUNT };
+
+    // Seed buffer: bytes 0-7 hold the height, byte 20 holds the marker -38 (0xDA as an unsigned byte), the rest is zero
+    // SWAP64LE is an identity macro in this file, so the byte layout of "height" is the host's
+    int8_t data[32];
+    memset(data, 0, sizeof(data));
+    uint64_t tmp = SWAP64LE(height);
+    memcpy(data, &tmp, sizeof(uint64_t));
+    data[20] = -38;
+
+    // Set data_index past the last byte in data
+    // to trigger full data update with blake hash
+    // before we start using it
+    size_t data_index = sizeof(data);
+
+    int code_size;
+
+    // There is a small chance (1.8%) that register R8 won't be used in the generated program
+    // So we keep track of it and try again if it's not used
+    bool r8_used;
+    do {
+        int latency[9];
+        int asic_latency[9];
+
+        // Tracks previous instruction and value of the source operand for registers R0-R3 throughout code execution
+        // byte 0: current value of the destination register
+        // byte 1: instruction opcode
+        // byte 2: current value of the source register
+        //
+        // Registers R4-R8 are constant and are treated as having the same value because when we do
+        // the same operation twice with two constant source registers, it can be optimized into a single operation
+        uint32_t inst_data[9] = { 0, 1, 2, 3, 0xFFFFFF, 0xFFFFFF, 0xFFFFFF, 0xFFFFFF, 0xFFFFFF };
+
+        bool alu_busy[TOTAL_LATENCY + 1][ALU_COUNT];
+        bool is_rotation[V4_INSTRUCTION_COUNT];
+        bool rotated[4];
+        int rotate_count = 0;
+
+        memset(latency, 0, sizeof(latency));
+        memset(asic_latency, 0, sizeof(asic_latency));
+        memset(alu_busy, 0, sizeof(alu_busy));
+        memset(is_rotation, 0, sizeof(is_rotation));
+        memset(rotated, 0, sizeof(rotated));
+        is_rotation[ROR] = true;
+        is_rotation[ROL] = true;
+
+        int num_retries = 0;
+        code_size = 0;
+
+        int total_iterations = 0;
+        r8_used = false;
+
+        // Generate random code to achieve minimal required latency for our abstract CPU
+        // Try to get this latency for all 4 registers
+        while (((latency[0] < TOTAL_LATENCY) || (latency[1] < TOTAL_LATENCY) || (latency[2] < TOTAL_LATENCY) || (latency[3] < TOTAL_LATENCY)) && (num_retries < 64))
+        {
+            // Fail-safe to guarantee loop termination
+            ++total_iterations;
+            if (total_iterations > 256)
+                break;
+
+            check_data(&data_index, 1, data, sizeof(data));
+
+            const uint8_t c = ((uint8_t*)data)[data_index++];
+
+            // MUL = opcodes 0-2
+            // ADD = opcode 3
+            // SUB = opcode 4
+            // ROR/ROL = opcode 5, shift direction is selected randomly
+            // XOR = opcodes 6-7
+            uint8_t opcode = c & ((1 << V4_OPCODE_BITS) - 1);
+            if (opcode == 5)
+            {
+                // One more random byte decides the direction: its sign bit selects ROL, otherwise ROR
+                check_data(&data_index, 1, data, sizeof(data));
+                opcode = (data[data_index++] >= 0) ? ROR : ROL;
+            }
+            else if (opcode >= 6)
+            {
+                opcode = XOR;
+            }
+            else
+            {
+                opcode = (opcode <= 2) ? MUL : (opcode - 2);
+            }
+
+            uint8_t dst_index = (c >> V4_OPCODE_BITS) & ((1 << V4_DST_INDEX_BITS) - 1);
+            uint8_t src_index = (c >> (V4_OPCODE_BITS + V4_DST_INDEX_BITS)) & ((1 << V4_SRC_INDEX_BITS) - 1);
+
+            const int a = dst_index;
+            int b = src_index;
+
+            // Don't do ADD/SUB/XOR with the same register
+            if (((opcode == ADD) || (opcode == SUB) || (opcode == XOR)) && (a == b))
+            {
+                // a is always < 4, so we don't need to check bounds here
+                b = 8;
+                src_index = b;
+            }
+
+            // Don't do rotation with the same destination twice because it's equal to a single rotation
+            if (is_rotation[opcode] && rotated[a])
+            {
+                continue;
+            }
+
+            // Don't do the same instruction (except MUL) with the same source value twice because all other cases can be optimized:
+            // 2xADD(a, b, C) = ADD(a, b*2, C1+C2), same for SUB and rotations
+            // 2xXOR(a, b) = NOP
+            if ((opcode != MUL) && ((inst_data[a] & 0xFFFF00) == (opcode << 8) + ((inst_data[b] & 255) << 16)))
+            {
+                continue;
+            }
+
+            // Find which ALU is available (and when) for this instruction
+            int next_latency = (latency[a] > latency[b]) ? latency[a] : latency[b];
+            int alu_index = -1;
+            while (next_latency < TOTAL_LATENCY)
+            {
+                for (int i = op_ALUs[opcode] - 1; i >= 0; --i)
+                {
+                    if (!alu_busy[next_latency][i])
+                    {
+                        // ADD is implemented as two 1-cycle instructions on a real CPU, so do an additional availability check
+                        if ((opcode == ADD) && alu_busy[next_latency + 1][i])
+                        {
+                            continue;
+                        }
+
+                        // Rotation can only start when previous rotation is finished, so do an additional availability check
+                        if (is_rotation[opcode] && (next_latency < rotate_count * op_latency[opcode]))
+                        {
+                            continue;
+                        }
+
+                        alu_index = i;
+                        break;
+                    }
+                }
+                if (alu_index >= 0)
+                {
+                    break;
+                }
+                ++next_latency;
+            }
+
+            // Don't generate instructions that leave some register unchanged for more than 7 cycles
+            if (next_latency > latency[a] + 7)
+            {
+                continue;
+            }
+
+            next_latency += op_latency[opcode];
+
+            // If no ALU was found above, next_latency already equals TOTAL_LATENCY, so this test fails
+            // and alu_index (-1) is never used as an array index
+            if (next_latency <= TOTAL_LATENCY)
+            {
+                if (is_rotation[opcode])
+                {
+                    ++rotate_count;
+                }
+
+                // Mark ALU as busy only for the first cycle when it starts executing the instruction because ALUs are fully pipelined
+                alu_busy[next_latency - op_latency[opcode]][alu_index] = true;
+                latency[a] = next_latency;
+
+                // ASIC is supposed to have enough ALUs to run as many independent instructions per cycle as possible, so latency calculation for ASIC is simple
+                asic_latency[a] = ((asic_latency[a] > asic_latency[b]) ? asic_latency[a] : asic_latency[b]) + asic_op_latency[opcode];
+
+                rotated[a] = is_rotation[opcode];
+
+                inst_data[a] = code_size + (opcode << 8) + ((inst_data[b] & 255) << 16);
+
+                code[code_size].opcode = opcode;
+                code[code_size].dst_index = dst_index;
+                code[code_size].src_index = src_index;
+                code[code_size].C = 0;
+
+                if (src_index == 8)
+                {
+                    r8_used = true;
+                }
+
+                if (opcode == ADD)
+                {
+                    // ADD instruction is implemented as two 1-cycle instructions on a real CPU, so mark ALU as busy for the next cycle too
+                    alu_busy[next_latency - op_latency[opcode] + 1][alu_index] = true;
+
+                    // ADD instruction requires 4 more random bytes for 32-bit constant "C" in "a = a + b + C"
+                    check_data(&data_index, sizeof(uint32_t), data, sizeof(data));
+                    uint32_t t;
+                    memcpy(&t, data + data_index, sizeof(uint32_t));
+                    code[code_size].C = SWAP32LE(t);
+                    data_index += sizeof(uint32_t);
+                }
+
+                ++code_size;
+                if (code_size >= NUM_INSTRUCTIONS_MIN)
+                {
+                    break;
+                }
+            }
+            else
+            {
+                ++num_retries;
+            }
+        }
+
+        // ASIC has more execution resources and can extract as much parallelism from the code as possible
+        // We need to add a few more MUL and ROR instructions to achieve minimal required latency for ASIC
+        // Get this latency for at least 1 of the 4 registers
+        const int prev_code_size = code_size;
+        while ((code_size < NUM_INSTRUCTIONS_MAX) && (asic_latency[0] < TOTAL_LATENCY) && (asic_latency[1] < TOTAL_LATENCY) && (asic_latency[2] < TOTAL_LATENCY) && (asic_latency[3] < TOTAL_LATENCY))
+        {
+            // Pick the registers with the lowest and the highest ASIC latency among R0-R3
+            int min_idx = 0;
+            int max_idx = 0;
+            for (int i = 1; i < 4; ++i)
+            {
+                if (asic_latency[i] < asic_latency[min_idx]) min_idx = i;
+                if (asic_latency[i] > asic_latency[max_idx]) max_idx = i;
+            }
+
+            // Padding instructions cycle through ROR, MUL, MUL; these are not taken from the random data
+            const uint8_t pattern[3] = { ROR, MUL, MUL };
+            const uint8_t opcode = pattern[(code_size - prev_code_size) % 3];
+            latency[min_idx] = latency[max_idx] + op_latency[opcode];
+            asic_latency[min_idx] = asic_latency[max_idx] + asic_op_latency[opcode];
+
+            code[code_size].opcode = opcode;
+            code[code_size].dst_index = min_idx;
+            code[code_size].src_index = max_idx;
+            code[code_size].C = 0;
+            ++code_size;
+        }
+
+    // There is ~98.15% chance that loop condition is false, so this loop will execute only 1 iteration most of the time
+    // It never does more than 4 iterations for all block heights < 10,000,000
+    }  while (!r8_used || (code_size < NUM_INSTRUCTIONS_MIN) || (code_size > NUM_INSTRUCTIONS_MAX));
+
+    // It's guaranteed that NUM_INSTRUCTIONS_MIN <= code_size <= NUM_INSTRUCTIONS_MAX here
+    // Add final instruction to stop the interpreter
+    code[code_size].opcode = RET;
+    code[code_size].dst_index = 0;
+    code[code_size].src_index = 0;
+    code[code_size].C = 0;
+
+    return code_size;
+}
+
+#endif
--- a/cmake/asm.cmake
+++ b/cmake/asm.cmake
@ -0,0 +1,27 @@
+# Optional static library with the hand-written assembly implementations.
+# It is built only when WITH_ASM is ON, XMRIG_ARM is not set and pointers are 8 bytes wide (64-bit build).
+#
+# Variables set by this file:
+#   XMRIG_ASM_LIBRARY - name of the static assembly library target, empty when assembly is disabled
+#   XMRIG_ASM_SOURCES - C source of the runtime code generator, empty when assembly is disabled
+# When assembly is disabled, XMRIG_NO_ASM is defined for the compiler instead.
+if (WITH_ASM AND NOT XMRIG_ARM AND CMAKE_SIZEOF_VOID_P EQUAL 8)
+    set(XMRIG_ASM_LIBRARY "xmrig-asm")
+
+    enable_language(ASM)
+
+    # GCC on Windows gets its own main loop file from the win64 directory; the CryptonightR template is shared
+    if (WIN32 AND CMAKE_C_COMPILER_ID MATCHES GNU)
+        set(XMRIG_ASM_FILES
+            "crypto/asm/win64/cn_main_loop.S"
+            "crypto/asm/CryptonightR_template.S"
+        )
+    else()
+        set(XMRIG_ASM_FILES
+            "crypto/asm/cn_main_loop.S"
+            "crypto/asm/CryptonightR_template.S"
+        )
+    endif()
+
+    # NOTE(review): this sets a source property literally named "C" with no value, which looks like a no-op.
+    # If the intent was to choose the compiler for these files, the property would be LANGUAGE -- confirm before changing.
+    set_property(SOURCE ${XMRIG_ASM_FILES} PROPERTY C)
+
+    add_library(${XMRIG_ASM_LIBRARY} STATIC ${XMRIG_ASM_FILES})
+    set(XMRIG_ASM_SOURCES "crypto/CryptonightR_gen.c")
+    # The library contains no C sources, so the linker language has to be given explicitly
+    set_property(TARGET ${XMRIG_ASM_LIBRARY} PROPERTY LINKER_LANGUAGE C)
+else()
+    set(XMRIG_ASM_SOURCES "")
+    set(XMRIG_ASM_LIBRARY "")
+    # CMake treats both -D and /D flags passed to add_definitions() as preprocessor definitions
+    add_definitions(/DXMRIG_NO_ASM)
+endif()
--- a/cpu.c
+++ b/cpu.c
@ -4,8 +4,9 @@
 * Copyright 2014      Lucas Jones <https://github.com/lucasjones>
 * Copyright 2014-2016 Wolf9466    <https://github.com/OhGodAPet>
 * Copyright 2016      Jay D Dee   <jayddee246@gmail.com>
- * Copyright 2016-2017 XMRig       <support@xmrig.com>
- *
+ * Copyright 2017-2018 XMR-Stak    <https://github.com/fireice-uk>, <https://github.com/psychocrypt>
+ * Copyright 2018-2019 SChernykh   <https://github.com/SChernykh>
+ * Copyright 2016-2019 XMRig       <https://github.com/xmrig>, <support@xmrig.com>
 *
 *   This program is free software: you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
@ -31,6 +32,7 @@
 #endif

 #include "cpu.h"
+#include "options.h"


 #ifndef BUILD_TEST
@ -68,6 +70,15 @@ void cpu_init_common() {
    if (data.flags[CPU_FEATURE_BMI2]) {
        cpu_info.flags |= CPU_FLAG_BMI2;
    }
+
+#   ifndef XMRIG_NO_ASM
+    if (data.vendor == VENDOR_AMD) {
+        cpu_info.assembly = (data.ext_family >= 23) ? ASM_RYZEN : ASM_BULLDOZER;
+    }
+    else if (data.vendor == VENDOR_INTEL) {
+        cpu_info.assembly = ASM_INTEL;
+    }
+#   endif
 }
 #endif

--- a/cpu.h
+++ b/cpu.h
@ -4,8 +4,9 @@
 * Copyright 2014      Lucas Jones <https://github.com/lucasjones>
 * Copyright 2014-2016 Wolf9466    <https://github.com/OhGodAPet>
 * Copyright 2016      Jay D Dee   <jayddee246@gmail.com>
- * Copyright 2016-2017 XMRig       <support@xmrig.com>
- *
+ * Copyright 2017-2018 XMR-Stak    <https://github.com/fireice-uk>, <https://github.com/psychocrypt>
+ * Copyright 2018-2019 SChernykh   <https://github.com/SChernykh>
+ * Copyright 2016-2019 XMRig       <https://github.com/xmrig>, <support@xmrig.com>
 *
 *   This program is free software: you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
@ -21,8 +22,8 @@
 *   along with this program. If not, see <http://www.gnu.org/licenses/>.
 */

-#ifndef __CPU_H__
-#define __CPU_H__
+#ifndef XMRIG_CPU_H
+#define XMRIG_CPU_H

 #include <stdbool.h>

@ -34,6 +35,7 @@ struct cpu_info {
    int l2_cache;
    int l3_cache;
    char brand[64];
+    int assembly;
 };

 extern struct cpu_info cpu_info;
@ -50,4 +52,4 @@ void cpu_init();
 int get_optimal_threads_count(int algo, bool double_hash, int max_cpu_usage);
 int affine_to_cpu_mask(int id, unsigned long mask);

-#endif /* __CPU_H__ */
+#endif /* XMRIG_CPU_H */
--- a/cpu_stub.c
+++ b/cpu_stub.c
@ -24,7 +24,11 @@
 #include <cpuid.h>
 #include <string.h>
 #include <stdbool.h>
+#include <stdint.h>
+
+
 #include "cpu.h"
+#include "options.h"


 #define VENDOR_ID                  (0)
@ -53,7 +57,7 @@ static inline void cpuid(int level, int output[4]) {


 static void cpu_brand_string(char* s) {
-    int cpu_info[4] = { 0 };
+    int32_t cpu_info[4] = { 0 };
    cpuid(VENDOR_ID, cpu_info);

    if (cpu_info[EAX_Reg] >= 4) {
@ -68,7 +72,7 @@ static void cpu_brand_string(char* s) {

 static bool has_aes_ni()
 {
-    int cpu_info[4] = { 0 };
+    int32_t cpu_info[4] = { 0 };
    cpuid(PROCESSOR_INFO, cpu_info);

    return cpu_info[ECX_Reg] & bit_AES;
@ -76,7 +80,7 @@ static bool has_aes_ni()


 static bool has_bmi2() {
-    int cpu_info[4] = { 0 };
+    int32_t cpu_info[4] = { 0 };
    cpuid(EXTENDED_FEATURES, cpu_info);

    return cpu_info[EBX_Reg] & bit_BMI2;
@ -93,6 +97,24 @@ void cpu_init_common() {

    if (has_aes_ni()) {
        cpu_info.flags |= CPU_FLAG_AES;
+
+#       ifndef XMRIG_NO_ASM
+        char vendor[13] = { 0 };
+        int32_t data[4] = { 0 };
+
+        cpuid(0, data);
+
+        memcpy(vendor + 0, &data[1], 4);
+        memcpy(vendor + 4, &data[3], 4);
+        memcpy(vendor + 8, &data[2], 4);
+
+        if (memcmp(vendor, "GenuineIntel", 12) == 0) {
+            cpu_info.assembly = ASM_INTEL;
+        }
+        else if (memcmp(vendor, "AuthenticAMD", 12) == 0) {
+            cpu_info.assembly = ASM_RYZEN;
+        }
+#       endif
    }

    if (has_bmi2()) {
--- a/crypto/CryptonightR_gen.c
+++ b/crypto/CryptonightR_gen.c
@ -0,0 +1,146 @@
+/* XMRig
+ * Copyright 2010      Jeff Garzik <jgarzik@pobox.com>
+ * Copyright 2012-2014 pooler      <pooler@litecoinpool.org>
+ * Copyright 2014      Lucas Jones <https://github.com/lucasjones>
+ * Copyright 2014-2016 Wolf9466    <https://github.com/OhGodAPet>
+ * Copyright 2016      Jay D Dee   <jayddee246@gmail.com>
+ * Copyright 2017-2018 XMR-Stak    <https://github.com/fireice-uk>, <https://github.com/psychocrypt>
+ * Copyright 2018      Lee Clagett <https://github.com/vtnerd>
+ * Copyright 2018-2019 SChernykh   <https://github.com/SChernykh>
+ * Copyright 2016-2019 XMRig       <https://github.com/xmrig>, <support@xmrig.com>
+ *
+ *   This program is free software: you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation, either version 3 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <string.h>
+
+#include "algo/cryptonight/cryptonight_monero.h"
+#include "crypto/asm/CryptonightR_template.h"
+#include "persistent_memory.h"
+
+
+/* Appends the bytes located between two code labels to the output buffer.
+ *
+ * p  - in/out write cursor; advanced by the number of bytes copied
+ * p1 - address of the first byte to copy
+ * p2 - address one past the last byte to copy
+ *
+ * Nothing is copied and the cursor is left alone when p2 does not lie after p1.
+ */
+static inline void add_code(uint8_t **p, void (*p1)(), void (*p2)())
+{
+    const uint8_t *first = (const uint8_t*)(p1);
+    const uint8_t *last  = (const uint8_t*)(p2);
+    const ptrdiff_t length = last - first;
+
+    if (length <= 0) {
+        return;
+    }
+
+    memcpy(*p, (const void *) first, length);
+    *p += length;
+}
+
+
+/* Emits machine code for a generated random math program.
+ *
+ * p                - in/out write cursor into the output buffer; advanced past the emitted code
+ * code             - RET-terminated program produced by v4_random_math_init
+ * code_size        - number of instructions in "code"; unused, emission stops at the RET terminator
+ * instructions     - table of code labels; instruction "c" occupies the bytes between entries c and c + 1
+ * instructions_mov - same layout, holding the extra code emitted before a rotation whose source register changed
+ * is_64_bit        - true when the templates operate on 64-bit registers
+ * ASM              - assembly flavour selected for the current CPU
+ */
+static inline void add_random_math(uint8_t **p, const struct V4_Instruction* code, int code_size, const void_func* instructions, const void_func* instructions_mov, bool is_64_bit, enum Assembly ASM)
+{
+    (void) code_size;
+
+    // Source register used by the most recent rotation, (uint32_t)(-1) when it is no longer valid
+    uint32_t prev_rot_src = (uint32_t)(-1);
+
+    for (int i = 0;; ++i) {
+        const struct V4_Instruction inst = code[i];
+        if (inst.opcode == RET) {
+            break;
+        }
+
+        // Rebuild the byte encoding used to index the template tables (inverse of the decoding in v4_random_math_init)
+        uint8_t opcode = (inst.opcode == MUL) ? inst.opcode : (inst.opcode + 2);
+        uint8_t dst_index = inst.dst_index;
+        uint8_t src_index = inst.src_index;
+
+        const uint32_t a = inst.dst_index;
+        const uint32_t b = inst.src_index;
+        // Source register 8 does not fit into the 3-bit field, it is encoded as "source == destination"
+        const uint8_t c = opcode | (dst_index << V4_OPCODE_BITS) | (((src_index == 8) ? dst_index : src_index) << (V4_OPCODE_BITS + V4_DST_INDEX_BITS));
+
+        switch (inst.opcode) {
+        case ROR:
+        case ROL:
+            // The extra code is only needed when the rotation source differs from the previous one
+            if (b != prev_rot_src) {
+                prev_rot_src = b;
+                add_code(p, instructions_mov[c], instructions_mov[c + 1]);
+            }
+            break;
+        }
+
+        // Writing to the remembered rotation source invalidates it
+        if (a == prev_rot_src) {
+            prev_rot_src = (uint32_t)(-1);
+        }
+
+        void_func begin = instructions[c];
+
+        // The comparison must be "==": an assignment here would apply the rewrite on every CPU and overwrite ASM
+        if ((ASM == ASM_BULLDOZER) && (inst.opcode == MUL) && !is_64_bit) {
+            // AMD Bulldozer has latency 4 for 32-bit IMUL and 6 for 64-bit IMUL
+            // Always use 32-bit IMUL for AMD Bulldozer in 32-bit mode - skip prefix 0x48 and change 0x49 to 0x41
+            uint8_t* prefix = (uint8_t*) begin;
+
+            if (*prefix == 0x49) {
+                **p = 0x41;
+                *p += 1;
+            }
+
+            begin = (void_func)(prefix + 1);
+        }
+
+        add_code(p, begin, instructions[c + 1]);
+
+        if (inst.opcode == ADD) {
+            // Patch the 32-bit constant into the instruction just emitted; in 64-bit mode it sits 3 bytes before the end.
+            // memcpy is used because the target address is not aligned for a uint32_t store.
+            const uint32_t constant = inst.C;
+            memcpy(*p - sizeof(uint32_t) - (is_64_bit ? 3 : 0), &constant, sizeof(constant));
+            if (is_64_bit) {
+                prev_rot_src = (uint32_t)(-1);
+            }
+        }
+    }
+}
+
+
+/* Assembles the single-hash main loop for the given random program into "machine_code".
+ *
+ * Layout: template part 1, random math code, template part 2, template part 3.
+ * The last 4 bytes of part 2 are patched with the distance back to the main loop label.
+ * NOTE(review): those 4 bytes are presumably the rel32 of the loop-back jump -- confirm against CryptonightR_template.inc.
+ * The caller must provide a writable buffer that is large enough for the generated code.
+ */
+void v4_compile_code(const struct V4_Instruction* code, int code_size, void* machine_code, enum Assembly ASM)
+{
+    uint8_t* const start = machine_code;
+    uint8_t* cursor = start;
+
+    add_code(&cursor, CryptonightR_template_part1, CryptonightR_template_part2);
+    add_random_math(&cursor, code, code_size, instructions, instructions_mov, false, ASM);
+    add_code(&cursor, CryptonightR_template_part2, CryptonightR_template_part3);
+
+    // Offset of the loop label inside the template minus the number of bytes emitted so far
+    const ptrdiff_t loop_offset = ((const uint8_t*)CryptonightR_template_mainloop) - ((const uint8_t*)CryptonightR_template_part1);
+    const ptrdiff_t emitted = cursor - start;
+    *(int*)(cursor - 4) = (int)(loop_offset - emitted);
+
+    add_code(&cursor, CryptonightR_template_part3, CryptonightR_template_end);
+
+    flush_instruction_cache(machine_code, cursor - start);
+}
+
+
+/* Assembles the double-hash main loop for the given random program into "machine_code".
+ *
+ * Layout: template part 1, random math code, part 2, the same random math code again, part 3, part 4.
+ * The last 4 bytes of part 3 are patched with the distance back to the main loop label.
+ * NOTE(review): those 4 bytes are presumably the rel32 of the loop-back jump -- confirm against CryptonightR_template.inc.
+ * The caller must provide a writable buffer that is large enough for the generated code.
+ */
+void v4_compile_code_double(const struct V4_Instruction* code, int code_size, void* machine_code, enum Assembly ASM)
+{
+    uint8_t* const start = (uint8_t*) machine_code;
+    uint8_t* cursor = start;
+
+    add_code(&cursor, CryptonightR_template_double_part1, CryptonightR_template_double_part2);
+    add_random_math(&cursor, code, code_size, instructions, instructions_mov, false, ASM);
+    add_code(&cursor, CryptonightR_template_double_part2, CryptonightR_template_double_part3);
+    add_random_math(&cursor, code, code_size, instructions, instructions_mov, false, ASM);
+    add_code(&cursor, CryptonightR_template_double_part3, CryptonightR_template_double_part4);
+
+    // Offset of the loop label inside the template minus the number of bytes emitted so far
+    const ptrdiff_t loop_offset = ((const uint8_t*)CryptonightR_template_double_mainloop) - ((const uint8_t*)CryptonightR_template_double_part1);
+    const ptrdiff_t emitted = cursor - start;
+    *(int*)(cursor - 4) = (int)(loop_offset - emitted);
+
+    add_code(&cursor, CryptonightR_template_double_part4, CryptonightR_template_double_end);
+
+    flush_instruction_cache(machine_code, cursor - start);
+}
+
+
+/* Assembles the software-AES main loop for the given random program into "machine_code".
+ *
+ * Layout: template part 1, random math code, template part 2, template part 3.
+ * Part 2 ends with the conditional jump back to the main loop label; its last 4 bytes are patched
+ * so that the jump still reaches the label after the random math code has been inserted.
+ * The caller must provide a writable buffer that is large enough for the generated code.
+ */
+void v4_soft_aes_compile_code(const struct V4_Instruction* code, int code_size, void* machine_code, enum Assembly ASM)
+{
+    uint8_t* const start = machine_code;
+    uint8_t* cursor = start;
+
+    add_code(&cursor, CryptonightR_soft_aes_template_part1, CryptonightR_soft_aes_template_part2);
+    add_random_math(&cursor, code, code_size, instructions, instructions_mov, false, ASM);
+    add_code(&cursor, CryptonightR_soft_aes_template_part2, CryptonightR_soft_aes_template_part3);
+
+    // Offset of the loop label inside the template minus the number of bytes emitted so far
+    const ptrdiff_t loop_offset = ((const uint8_t*)CryptonightR_soft_aes_template_mainloop) - ((const uint8_t*)CryptonightR_soft_aes_template_part1);
+    const ptrdiff_t emitted = cursor - start;
+    *(int*)(cursor - 4) = (int)(loop_offset - emitted);
+
+    add_code(&cursor, CryptonightR_soft_aes_template_part3, CryptonightR_soft_aes_template_end);
+
+    flush_instruction_cache(machine_code, cursor - start);
+}
--- a/crypto/asm/CryptonightR_soft_aes_template.inc
+++ b/crypto/asm/CryptonightR_soft_aes_template.inc
@ -0,0 +1,279 @@
+PUBLIC FN_PREFIX(CryptonightR_soft_aes_template_part1)
+PUBLIC FN_PREFIX(CryptonightR_soft_aes_template_mainloop)
+PUBLIC FN_PREFIX(CryptonightR_soft_aes_template_part2)
+PUBLIC FN_PREFIX(CryptonightR_soft_aes_template_part3)
+PUBLIC FN_PREFIX(CryptonightR_soft_aes_template_end)
+
+/*
+ * Part 1: prologue and loop setup. This code is a template: CryptonightR_gen.c copies it
+ * piece by piece into a buffer and inserts generated code between the main loop and part 2.
+ *
+ * rcx holds the context pointer on entry and is stored at [rsp+8]. After the 8 pushes and
+ * "sub rsp, 232" the frame is 296 bytes deep, so that slot becomes [rsp+304]; the later
+ * [rsp+320] and [rsp+328] accesses also land in the caller-provided area above the return address.
+ * NOTE(review): register usage looks like the Microsoft x64 calling convention -- confirm.
+ *
+ * Frame layout used below:
+ *   [rsp+16 .. rsp+143]  saved xmm6-xmm13
+ *   [rsp+144 .. rsp+159] four dwords copied from [rcx+96 .. rcx+108]
+ *   [rsp+160 .. rsp+191] scratch slots used by the main loop
+ */
+ALIGN(64)
+FN_PREFIX(CryptonightR_soft_aes_template_part1):
+	mov	QWORD PTR [rsp+8], rcx
+	push	rbx
+	push	rbp
+	push	rsi
+	push	rdi
+	push	r12
+	push	r13
+	push	r14
+	push	r15
+	sub	rsp, 232
+
+	mov	eax, [rcx+96]
+	mov	ebx, [rcx+100]
+	mov	esi, [rcx+104]
+	mov	edx, [rcx+108]
+	mov [rsp+144], eax
+	mov [rsp+148], ebx
+	mov [rsp+152], esi
+	mov [rsp+156], edx
+
+	mov	rax, QWORD PTR [rcx+48]
+	mov	r10, rcx
+	xor	rax, QWORD PTR [rcx+16]
+	mov	r8, QWORD PTR [rcx+32]
+	xor	r8, QWORD PTR [rcx]
+	mov	r9, QWORD PTR [rcx+40]
+	xor	r9, QWORD PTR [rcx+8]
+	movq	xmm4, rax
+	mov	rdx, QWORD PTR [rcx+56]
+	xor	rdx, QWORD PTR [rcx+24]
+	mov	r11, QWORD PTR [rcx+224]
+	mov	rcx, QWORD PTR [rcx+88]
+	xor	rcx, QWORD PTR [r10+72]
+	mov	rax, QWORD PTR [r10+80]
+	movq	xmm0, rdx
+	xor	rax, QWORD PTR [r10+64]
+
+	movaps	XMMWORD PTR [rsp+16], xmm6
+	movaps	XMMWORD PTR [rsp+32], xmm7
+	movaps	XMMWORD PTR [rsp+48], xmm8
+	movaps	XMMWORD PTR [rsp+64], xmm9
+	movaps	XMMWORD PTR [rsp+80], xmm10
+	movaps	XMMWORD PTR [rsp+96], xmm11
+	movaps	XMMWORD PTR [rsp+112], xmm12
+	movaps	XMMWORD PTR [rsp+128], xmm13
+
+	movq	xmm5, rax
+
+	mov	rax, r8
+	punpcklqdq xmm4, xmm0
+	and	eax, 2097136
+	movq	xmm10, QWORD PTR [r10+96]
+	movq	xmm0, rcx
+	mov	rcx, QWORD PTR [r10+104]
+	xorps	xmm9, xmm9
+	mov	QWORD PTR [rsp+328], rax
+	movq	xmm12, r11
+	mov	QWORD PTR [rsp+320], r9
+	punpcklqdq xmm5, xmm0
+	movq xmm13, rcx
+	/* r12d is the iteration counter: 524288 passes through the main loop */
+	mov r12d, 524288
+
+
+/*
+ * Main loop body, first half. Falls through into the generated random math code, which
+ * CryptonightR_gen.c inserts right before the part 2 label.
+ *
+ * The iteration counter is parked in xmm11 because r12 is reused as the pointer to the
+ * lookup tables loaded from [r10+272].
+ * NOTE(review): the byte-indexed table lookups are presumably the software AES round -- confirm.
+ * 2097136 is 0x1FFFF0: it keeps offsets 16-byte aligned and below 2 MiB.
+ */
+	ALIGN(64)
+FN_PREFIX(CryptonightR_soft_aes_template_mainloop):
+	movd xmm11, r12d
+	mov	r12, QWORD PTR [r10+272]
+	lea	r13, QWORD PTR [rax+r11]
+	mov	esi, DWORD PTR [r13]
+	movq	xmm0, r9
+	mov	r10d, DWORD PTR [r13+4]
+	movq	xmm7, r8
+	mov	ebp, DWORD PTR [r13+12]
+	mov	r14d, DWORD PTR [r13+8]
+	mov	rdx, QWORD PTR [rsp+328]
+	movzx	ecx, sil
+	shr	esi, 8
+	punpcklqdq xmm7, xmm0
+	mov	r15d, DWORD PTR [r12+rcx*4]
+	movzx	ecx, r10b
+	shr	r10d, 8
+	mov	edi, DWORD PTR [r12+rcx*4]
+	movzx	ecx, r14b
+	shr	r14d, 8
+	mov	ebx, DWORD PTR [r12+rcx*4]
+	movzx	ecx, bpl
+	shr	ebp, 8
+	mov	r9d, DWORD PTR [r12+rcx*4]
+	movzx	ecx, r10b
+	shr	r10d, 8
+	xor	r15d, DWORD PTR [r12+rcx*4+1024]
+	movzx	ecx, r14b
+	shr	r14d, 8
+	mov	eax, r14d
+	shr	eax, 8
+	xor	edi, DWORD PTR [r12+rcx*4+1024]
+	add	eax, 256
+	movzx	ecx, bpl
+	shr	ebp, 8
+	xor	ebx, DWORD PTR [r12+rcx*4+1024]
+	movzx	ecx, sil
+	shr	esi, 8
+	xor	r9d, DWORD PTR [r12+rcx*4+1024]
+	add	r12, 2048
+	movzx	ecx, r10b
+	shr	r10d, 8
+	add	r10d, 256
+	mov	r11d, DWORD PTR [r12+rax*4]
+	xor	r11d, DWORD PTR [r12+rcx*4]
+	xor	r11d, r9d
+	movzx	ecx, sil
+	mov	r10d, DWORD PTR [r12+r10*4]
+	shr	esi, 8
+	add	esi, 256
+	xor	r10d, DWORD PTR [r12+rcx*4]
+	movzx	ecx, bpl
+	xor	r10d, ebx
+	shr	ebp, 8
+	movd	xmm1, r11d
+	add	ebp, 256
+	movq	r11, xmm12
+	mov	r9d, DWORD PTR [r12+rcx*4]
+	xor	r9d, DWORD PTR [r12+rsi*4]
+	mov	eax, DWORD PTR [r12+rbp*4]
+	xor	r9d, edi
+	movzx	ecx, r14b
+	movd	xmm0, r10d
+	movd	xmm2, r9d
+	xor	eax, DWORD PTR [r12+rcx*4]
+	mov	rcx, rdx
+	xor	eax, r15d
+	punpckldq xmm2, xmm1
+	xor	rcx, 16
+	movd	xmm6, eax
+	mov	rax, rdx
+	punpckldq xmm6, xmm0
+	xor	rax, 32
+	punpckldq xmm6, xmm2
+	xor	rdx, 48
+	movdqu	xmm2, XMMWORD PTR [rcx+r11]
+	pxor xmm6, xmm2
+	pxor	xmm6, xmm7
+	paddq	xmm2, xmm4
+	movdqu	xmm1, XMMWORD PTR [rax+r11]
+	movdqu	xmm0, XMMWORD PTR [rdx+r11]
+	pxor xmm6, xmm1
+	pxor xmm6, xmm0
+	paddq	xmm0, xmm5
+	movdqu	XMMWORD PTR [rcx+r11], xmm0
+	movdqu	XMMWORD PTR [rax+r11], xmm2
+	movq rcx, xmm13
+	paddq	xmm1, xmm7
+	movdqu	XMMWORD PTR [rdx+r11], xmm1
+	movq	rdi, xmm6
+	mov	r10, rdi
+	and	r10d, 2097136
+	movdqa	xmm0, xmm6
+	pxor	xmm0, xmm4
+	movdqu	XMMWORD PTR [r13], xmm0
+
+	mov ebx, [rsp+144]
+	mov ebp, [rsp+152]
+	add ebx, [rsp+148]
+	add ebp, [rsp+156]
+	shl rbp, 32
+	or rbx, rbp
+
+	xor rbx, QWORD PTR [r10+r11]
+	lea	r14, QWORD PTR [r10+r11]
+	mov	rbp, QWORD PTR [r14+8]
+
+	/* Spill the values that part 2 reloads, then keep the real stack pointer in r10 */
+	mov [rsp+160], rbx
+	mov [rsp+168], rdi
+	mov [rsp+176], rbp
+	mov [rsp+184], r10
+	mov r10, rsp
+
+	/* Load the four dwords kept at [rsp+144..156] into ebx, esi, edi and ebp */
+	mov ebx, [rsp+144]
+	mov esi, [rsp+148]
+	mov edi, [rsp+152]
+	mov ebp, [rsp+156]
+
+	/*
+	 * esp is used as a plain data register from here until part 2 restores it from r10.
+	 * NOTE(review): esp, r15d, eax, edx and r9d are presumably the five constant registers
+	 * of the random math program -- confirm against the instruction templates.
+	 */
+	movd esp, xmm7
+	movaps xmm0, xmm7
+	psrldq xmm0, 8
+	movd r15d, xmm0
+	movd eax, xmm4
+	movd edx, xmm5
+	movaps xmm0, xmm5
+	psrldq xmm0, 8
+	movd r9d, xmm0
+
+
+/*
+ * Part 2: second half of the main loop body; runs right after the generated random math code.
+ * Restores the stack pointer from r10, writes ebx/esi/edi/ebp back to [rsp+144..156],
+ * reloads the values spilled at [rsp+160..184] and finishes the iteration.
+ * The "jne" at the end is the last instruction of this part; CryptonightR_gen.c patches
+ * its final 4 bytes after copying.
+ */
+FN_PREFIX(CryptonightR_soft_aes_template_part2):
+	mov rsp, r10
+	mov [rsp+144], ebx
+	mov [rsp+148], esi
+	mov [rsp+152], edi
+	mov [rsp+156], ebp
+
+	mov edi, edi
+	shl rbp, 32
+	or rbp, rdi
+	xor r8, rbp
+
+	mov ebx, ebx
+	shl rsi, 32
+	or rsi, rbx
+	xor QWORD PTR [rsp+320], rsi
+
+	mov rbx, [rsp+160]
+	mov rdi, [rsp+168]
+	mov rbp, [rsp+176]
+	mov r10, [rsp+184]
+
+	mov	r9, r10
+	xor	r9, 16
+	mov	rcx, r10
+	xor	rcx, 32
+	xor	r10, 48
+	mov	rax, rbx
+	mul	rdi
+	movdqu	xmm2, XMMWORD PTR [r9+r11]
+	movdqu	xmm1, XMMWORD PTR [rcx+r11]
+	pxor xmm6, xmm2
+	pxor xmm6, xmm1
+	paddq	xmm1, xmm7
+	add	r8, rdx
+	movdqu	xmm0, XMMWORD PTR [r10+r11]
+	pxor xmm6, xmm0
+	paddq	xmm0, xmm5
+	paddq	xmm2, xmm4
+	movdqu	XMMWORD PTR [r9+r11], xmm0
+	movdqa	xmm5, xmm4
+	mov	r9, QWORD PTR [rsp+320]
+	movdqa	xmm4, xmm6
+	add	r9, rax
+	movdqu	XMMWORD PTR [rcx+r11], xmm2
+	movdqu	XMMWORD PTR [r10+r11], xmm1
+	/* [rsp+304] is the slot where part 1 stored the context pointer (rcx) */
+	mov	r10, QWORD PTR [rsp+304]
+	/* Bring the iteration counter back from xmm11 */
+	movd r12d, xmm11
+	mov	QWORD PTR [r14], r8
+	xor	r8, rbx
+	mov	rax, r8
+	mov	QWORD PTR [r14+8], r9
+	and	eax, 2097136
+	xor	r9, rbp
+	mov	QWORD PTR [rsp+320], r9
+	mov	QWORD PTR [rsp+328], rax
+	sub	r12d, 1
+	jne	FN_PREFIX(CryptonightR_soft_aes_template_mainloop)
+
+
+/*
+ * Part 3: epilogue. Restores xmm6-xmm13, releases the 232-byte frame, pops the
+ * registers in the reverse order of the pushes in part 1 and returns.
+ * The "_end" label only marks where the copied code stops.
+ */
+FN_PREFIX(CryptonightR_soft_aes_template_part3):
+	movaps	xmm6, XMMWORD PTR [rsp+16]
+	movaps	xmm7, XMMWORD PTR [rsp+32]
+	movaps	xmm8, XMMWORD PTR [rsp+48]
+	movaps	xmm9, XMMWORD PTR [rsp+64]
+	movaps	xmm10, XMMWORD PTR [rsp+80]
+	movaps	xmm11, XMMWORD PTR [rsp+96]
+	movaps	xmm12, XMMWORD PTR [rsp+112]
+	movaps	xmm13, XMMWORD PTR [rsp+128]
+
+	add	rsp, 232
+	pop	r15
+	pop	r14
+	pop	r13
+	pop	r12
+	pop	rdi
+	pop	rsi
+	pop	rbp
+	pop	rbx
+	ret
+FN_PREFIX(CryptonightR_soft_aes_template_end):
--- a/crypto/asm/CryptonightR_template.S
+++ b/crypto/asm/CryptonightR_template.S
--- a/crypto/asm/CryptonightR_template.h
+++ b/crypto/asm/CryptonightR_template.h
--- a/crypto/asm/CryptonightR_template.inc
+++ b/crypto/asm/CryptonightR_template.inc
@ -0,0 +1,531 @@
+PUBLIC FN_PREFIX(CryptonightR_template_part1)	/* section labels of the single-hash template; PUBLIC and FN_PREFIX are not defined here and are expected from the including file */
+PUBLIC FN_PREFIX(CryptonightR_template_mainloop)
+PUBLIC FN_PREFIX(CryptonightR_template_part2)
+PUBLIC FN_PREFIX(CryptonightR_template_part3)
+PUBLIC FN_PREFIX(CryptonightR_template_end)
+PUBLIC FN_PREFIX(CryptonightR_template_double_part1)	/* section labels of the two-hash (double) template */
+PUBLIC FN_PREFIX(CryptonightR_template_double_mainloop)
+PUBLIC FN_PREFIX(CryptonightR_template_double_part2)
+PUBLIC FN_PREFIX(CryptonightR_template_double_part3)
+PUBLIC FN_PREFIX(CryptonightR_template_double_part4)
+PUBLIC FN_PREFIX(CryptonightR_template_double_end)
+
+ALIGN(64)
+FN_PREFIX(CryptonightR_template_part1):	/* Prologue of the single-hash template: saves registers and loads the working state from the structure whose address arrives in rcx. */
+	mov	QWORD PTR [rsp+16], rbx	/* rbx, rbp, rsi are stored above the entry rsp instead of being pushed; assumes the caller owns/reserved that area - TODO confirm for every caller */
+	mov	QWORD PTR [rsp+24], rbp
+	mov	QWORD PTR [rsp+32], rsi
+	push	r10
+	push	r11
+	push	r12
+	push	r13
+	push	r14
+	push	r15
+	push	rdi
+	sub	rsp, 64	/* 4 x 16 bytes: save area for xmm6-xmm9 */
+	mov	r12, rcx
+	mov	r8, QWORD PTR [r12+32]
+	mov	rdx, r12	/* rdx keeps the structure pointer; r12 is reused as scratch below */
+	xor	r8, QWORD PTR [r12]	/* r8 = [ctx+32] ^ [ctx+0] */
+	mov	r15, QWORD PTR [r12+40]
+	mov	r9, r8
+	xor	r15, QWORD PTR [r12+8]	/* r15 = [ctx+40] ^ [ctx+8] */
+	mov	r11, QWORD PTR [r12+224]	/* r11 = pointer stored at ctx+224; base of every masked memory access below (presumably the scratchpad) */
+	mov	r12, QWORD PTR [r12+56]
+	xor	r12, QWORD PTR [rdx+24]
+	mov	rax, QWORD PTR [rdx+48]
+	xor	rax, QWORD PTR [rdx+16]
+	movaps	XMMWORD PTR [rsp+48], xmm6
+	movq	xmm0, r12
+	movaps	XMMWORD PTR [rsp+32], xmm7
+	movaps	XMMWORD PTR [rsp+16], xmm8
+	movaps	XMMWORD PTR [rsp], xmm9
+	mov	r12, QWORD PTR [rdx+88]
+	xor	r12, QWORD PTR [rdx+72]
+	movq	xmm6, rax
+	mov	rax, QWORD PTR [rdx+80]
+	xor	rax, QWORD PTR [rdx+64]
+	punpcklqdq xmm6, xmm0	/* xmm6 = { [ctx+48]^[ctx+16], [ctx+56]^[ctx+24] } */
+	and	r9d, 2097136	/* 2097136 = 0x1FFFF0: offset is 16-byte aligned and below 2 MiB */
+	movq	xmm0, r12
+	movq	xmm7, rax
+	punpcklqdq xmm7, xmm0	/* xmm7 = { [ctx+80]^[ctx+64], [ctx+88]^[ctx+72] } */
+	mov r10d, r9d
+	movq	xmm9, rsp	/* the real stack pointer is parked in xmm9 ... */
+	mov rsp, r8	/* ... because rsp holds plain data from here until part3: nothing may touch the stack in between */
+	mov	r8d, 524288	/* loop counter: 524288 (0x80000) iterations */
+
+	mov	ebx, [rdx+96]	/* ebx, esi, edi, ebp = the four 32-bit values at ctx+96 .. ctx+108 */
+	mov	esi, [rdx+100]
+	mov	edi, [rdx+104]
+	mov	ebp, [rdx+108]
+	ALIGN(64)
+FN_PREFIX(CryptonightR_template_mainloop):	/* First half of one iteration: AES round on the 16-byte block at r11+r9, then mixing of the other three 16-byte blocks of the same 64-byte group. */
+	movdqa	xmm5, XMMWORD PTR [r9+r11]
+	movq	xmm0, r15
+	movq	xmm4, rsp
+	punpcklqdq xmm4, xmm0	/* xmm4 = { rsp, r15 }: the round key (rsp is a data register here) */
+	lea	rdx, QWORD PTR [r9+r11]	/* rdx = address of the block, written back further down */
+
+	aesenc	xmm5, xmm4
+
+	mov	r13d, r9d
+	mov	eax, r9d
+	xor	r9d, 48	/* r13, rax, r9 = offset ^ 16, ^ 32, ^ 48 */
+	xor	r13d, 16
+	xor	eax, 32
+	movdqu	xmm0, XMMWORD PTR [r9+r11]
+	movaps xmm3, xmm0
+	movdqu	xmm2, XMMWORD PTR [r13+r11]
+	movdqu	xmm1, XMMWORD PTR [rax+r11]
+	pxor xmm0, xmm2
+	pxor xmm5, xmm1
+	pxor xmm5, xmm0	/* xmm5 ^= all three neighbouring blocks */
+
+	movq	r12, xmm5
+	movd	r10d, xmm5
+	and	r10d, 2097136	/* r10 = masked offset used by the second half */
+
+	paddq	xmm3, xmm7
+	paddq	xmm2, xmm6
+	paddq	xmm1, xmm4
+	movdqu	XMMWORD PTR [r13+r11], xmm3	/* the three blocks are stored back rotated: ^48 -> ^16, ^16 -> ^32, ^32 -> ^48 */
+	movdqu	XMMWORD PTR [rax+r11], xmm2
+	movdqu	XMMWORD PTR [r9+r11], xmm1
+
+	movdqa	xmm0, xmm5
+	pxor	xmm0, xmm6
+	movdqu	XMMWORD PTR [rdx], xmm0	/* the original location receives xmm5 ^ xmm6 */
+
+	lea	r13d, [ebx+esi]
+	lea	edx, [edi+ebp]
+	shl rdx, 32
+	or	r13, rdx	/* r13 = (ebx+esi) | ((edi+ebp) << 32), both sums truncated to 32 bits */
+
+	movd eax, xmm6	/* NOTE(review): eax, edx and r9d are loaded here but overwritten in part2 before any use; presumably inputs for code inserted between the two sections - confirm against the generator */
+	movd edx, xmm7
+	pextrd r9d, xmm7, 2
+
+	xor	r13, QWORD PTR [r10+r11]
+	mov	r14, QWORD PTR [r10+r11+8]
+FN_PREFIX(CryptonightR_template_part2):	/* Second half of one iteration: folds ebx/esi/edi/ebp into the state, 64x64 multiply, second block shuffle, write-back and loop branch. */
+	lea	rcx, [r10+r11]
+
+	mov eax, edi
+	mov edx, ebp
+	shl rdx, 32
+	or rax, rdx
+	xor rsp, rax	/* rsp ^= edi | (ebp << 32); rsp is still a data register */
+
+	mov eax, ebx
+	mov edx, esi
+	shl rdx, 32
+	or rax, rdx
+	xor r15, rax	/* r15 ^= ebx | (esi << 32) */
+
+	mov	rax, r13
+	mul	r12	/* rdx:rax = r13 * r12 (unsigned) */
+	add	r15, rax	/* low half goes to r15, high half to rsp */
+	add	rsp, rdx
+
+	mov	r9d, r10d
+	mov	r12d, r10d
+	xor	r9d, 16
+	xor	r12d, 32
+	xor	r10d, 48
+	movdqa	xmm1, XMMWORD PTR [r12+r11]
+	movaps xmm3, xmm1
+	movdqa	xmm2, XMMWORD PTR [r9+r11]
+	movdqa	xmm0, XMMWORD PTR [r10+r11]
+	pxor xmm1, xmm2
+	pxor xmm5, xmm0
+	pxor xmm5, xmm1	/* xmm5 ^= the three neighbouring blocks of the second location */
+	paddq	xmm3, xmm4
+	paddq	xmm2, xmm6
+	paddq	xmm0, xmm7
+	movdqu	XMMWORD PTR [r9+r11], xmm0
+	movdqu	XMMWORD PTR [r12+r11], xmm2
+	movdqu	XMMWORD PTR [r10+r11], xmm3
+
+	movdqa	xmm7, xmm6	/* xmm7 takes the previous xmm6; xmm6 takes xmm5 a few lines below */
+	mov	QWORD PTR [rcx], rsp
+	xor	rsp, r13
+	mov	r9d, esp
+	mov	QWORD PTR [rcx+8], r15
+	and	r9d, 2097136	/* r9 = masked offset for the next iteration */
+	xor	r15, r14
+	movdqa	xmm6, xmm5
+	dec	r8d
+	jnz	FN_PREFIX(CryptonightR_template_mainloop)
+
+FN_PREFIX(CryptonightR_template_part3):	/* Epilogue: undoes the part1 prologue in reverse order. */
+	movq	rsp, xmm9	/* the stack pointer becomes valid again here */
+
+	mov	rbx, QWORD PTR [rsp+136]	/* 136 = 64 (xmm area) + 56 (seven pushes) + 16: the slot rbx was stored to on entry */
+	mov	rbp, QWORD PTR [rsp+144]
+	mov	rsi, QWORD PTR [rsp+152]
+	movaps	xmm6, XMMWORD PTR [rsp+48]
+	movaps	xmm7, XMMWORD PTR [rsp+32]
+	movaps	xmm8, XMMWORD PTR [rsp+16]
+	movaps	xmm9, XMMWORD PTR [rsp]
+	add	rsp, 64
+	pop	rdi
+	pop	r15
+	pop	r14
+	pop	r13
+	pop	r12
+	pop	r11
+	pop	r10
+	ret	0
+FN_PREFIX(CryptonightR_template_end):	/* label just past the last instruction of the single-hash template */
+
+ALIGN(64)
+FN_PREFIX(CryptonightR_template_double_part1):	/* Prologue of the two-hash template: rcx and rdx each point to one state structure; both are unpacked into registers. */
+	mov	QWORD PTR [rsp+24], rbx	/* rbx is stored above the entry rsp instead of being pushed; assumes the caller owns that slot - TODO confirm */
+	push	rbp
+	push	rsi
+	push	rdi
+	push	r12
+	push	r13
+	push	r14
+	push	r15
+	sub	rsp, 320	/* locals at rsp+0..rsp+159, xmm6-xmm15 save area at rsp+160..rsp+319 */
+	mov	r14, QWORD PTR [rcx+32]
+	mov	r8, rcx	/* r8 = first structure (rcx is reused as scratch) */
+	xor	r14, QWORD PTR [rcx]	/* r14 = [a+32] ^ [a+0] */
+	mov	r12, QWORD PTR [rcx+40]
+	mov	ebx, r14d
+	mov	rsi, QWORD PTR [rcx+224]	/* rsi = pointer at a+224: memory base for the first hash */
+	and	ebx, 2097136	/* 0x1FFFF0 offset mask */
+	xor	r12, QWORD PTR [rcx+8]	/* r12 = [a+40] ^ [a+8] */
+	mov	rcx, QWORD PTR [rcx+56]
+	xor	rcx, QWORD PTR [r8+24]
+	mov	rax, QWORD PTR [r8+48]
+	xor	rax, QWORD PTR [r8+16]
+	mov	r15, QWORD PTR [rdx+32]
+	xor	r15, QWORD PTR [rdx]	/* r15 = [b+32] ^ [b+0] */
+	movq	xmm0, rcx
+	mov	rcx, QWORD PTR [r8+88]
+	xor	rcx, QWORD PTR [r8+72]
+	mov	r13, QWORD PTR [rdx+40]
+	mov	rdi, QWORD PTR [rdx+224]	/* rdi = pointer at b+224: memory base for the second hash */
+	xor	r13, QWORD PTR [rdx+8]	/* r13 = [b+40] ^ [b+8] */
+	movaps	XMMWORD PTR [rsp+160], xmm6
+	movaps	XMMWORD PTR [rsp+176], xmm7
+	movaps	XMMWORD PTR [rsp+192], xmm8
+	movaps	XMMWORD PTR [rsp+208], xmm9
+	movaps	XMMWORD PTR [rsp+224], xmm10
+	movaps	XMMWORD PTR [rsp+240], xmm11
+	movaps	XMMWORD PTR [rsp+256], xmm12
+	movaps	XMMWORD PTR [rsp+272], xmm13
+	movaps	XMMWORD PTR [rsp+288], xmm14
+	movaps	XMMWORD PTR [rsp+304], xmm15
+	movq	xmm7, rax
+	mov	rax, QWORD PTR [r8+80]
+	xor	rax, QWORD PTR [r8+64]
+
+	movaps xmm1, XMMWORD PTR [rdx+96]	/* aligned loads: both structures must be 16-byte aligned at offset 96 */
+	movaps xmm2, XMMWORD PTR [r8+96]
+	movaps XMMWORD PTR [rsp], xmm1	/* rsp+0..15 = four dwords from b+96, rsp+16..31 = four dwords from a+96 */
+	movaps XMMWORD PTR [rsp+16], xmm2
+
+	mov	r8d, r15d
+	punpcklqdq xmm7, xmm0	/* xmm7 = { [a+48]^[a+16], [a+56]^[a+24] } */
+	movq	xmm0, rcx
+	mov	rcx, QWORD PTR [rdx+56]
+	xor	rcx, QWORD PTR [rdx+24]
+	movq	xmm9, rax
+	mov	QWORD PTR [rsp+128], rsi
+	mov	rax, QWORD PTR [rdx+48]
+	xor	rax, QWORD PTR [rdx+16]
+	punpcklqdq xmm9, xmm0	/* xmm9 = { [a+80]^[a+64], [a+88]^[a+72] } */
+	movq	xmm0, rcx
+	mov	rcx, QWORD PTR [rdx+88]
+	xor	rcx, QWORD PTR [rdx+72]
+	movq	xmm8, rax
+	mov	QWORD PTR [rsp+136], rdi
+	mov	rax, QWORD PTR [rdx+80]
+	xor	rax, QWORD PTR [rdx+64]
+	punpcklqdq xmm8, xmm0	/* xmm8 = { [b+48]^[b+16], [b+56]^[b+24] } */
+	and	r8d, 2097136
+	movq	xmm0, rcx
+	mov	r11d, 524288	/* loop counter: 524288 iterations */
+	movq	xmm10, rax
+	punpcklqdq xmm10, xmm0	/* xmm10 = { [b+80]^[b+64], [b+88]^[b+72] } */
+	
+	movq xmm14, QWORD PTR [rsp+128]	/* xmm14 / xmm15 keep the two base pointers, since rsi and rdi get reused inside the loop */
+	movq xmm15, QWORD PTR [rsp+136]
+
+	ALIGN(64)
+FN_PREFIX(CryptonightR_template_double_mainloop):	/* Phase 1 of one iteration: AES round plus 64-byte-group shuffle for both hashes (rsi+rbx = first hash, rdi+r8 = second), then setup for the first multiply. */
+	movdqu	xmm6, XMMWORD PTR [rbx+rsi]
+	movq	xmm0, r12
+	mov	ecx, ebx	/* ecx keeps the unmodified offset for the write-back below */
+	movq	xmm3, r14
+	punpcklqdq xmm3, xmm0	/* xmm3 = { r14, r12 }: round key of the first hash */
+	xor	ebx, 16
+	aesenc	xmm6, xmm3
+	movq	xmm4, r15
+	movdqu	xmm0, XMMWORD PTR [rbx+rsi]
+	pxor	xmm6, xmm0
+	xor	ebx, 48	/* ebx = offset ^ 32 */
+	paddq	xmm0, xmm7
+	movdqu	xmm1, XMMWORD PTR [rbx+rsi]
+	pxor	xmm6, xmm1
+	movdqu	XMMWORD PTR [rbx+rsi], xmm0
+	paddq	xmm1, xmm3
+	xor	ebx, 16	/* ebx = offset ^ 48 */
+	mov	eax, ebx
+	xor	rax, 32	/* rax = offset ^ 16 */
+	movdqu	xmm0, XMMWORD PTR [rbx+rsi]
+	pxor	xmm6, xmm0
+	movq	rdx, xmm6
+	movdqu	XMMWORD PTR [rbx+rsi], xmm1
+	paddq	xmm0, xmm9
+	movdqu	XMMWORD PTR [rax+rsi], xmm0
+	movdqa	xmm0, xmm6
+	pxor	xmm0, xmm7
+	movdqu	XMMWORD PTR [rcx+rsi], xmm0	/* original location of the first hash receives xmm6 ^ xmm7 */
+	mov	esi, edx	/* rsi stops being the base pointer here (the base stays available in xmm14) */
+	movdqu	xmm5, XMMWORD PTR [r8+rdi]
+	and	esi, 2097136	/* esi = next masked offset of the first hash */
+	mov	ecx, r8d
+	movq	xmm0, r13
+	punpcklqdq xmm4, xmm0	/* xmm4 = { r15, r13 }: round key of the second hash */
+	xor	r8d, 16
+	aesenc	xmm5, xmm4
+	movdqu	xmm0, XMMWORD PTR [r8+rdi]
+	pxor	xmm5, xmm0
+	xor	r8d, 48
+	paddq	xmm0, xmm8
+	movdqu	xmm1, XMMWORD PTR [r8+rdi]
+	pxor	xmm5, xmm1
+	movdqu	XMMWORD PTR [r8+rdi], xmm0
+	paddq	xmm1, xmm4
+	xor	r8d, 16
+	mov	eax, r8d
+	xor	rax, 32
+	movdqu	xmm0, XMMWORD PTR [r8+rdi]
+	pxor	xmm5, xmm0
+	movdqu	XMMWORD PTR [r8+rdi], xmm1
+	paddq	xmm0, xmm10
+	movdqu	XMMWORD PTR [rax+rdi], xmm0
+	movdqa	xmm0, xmm5
+	pxor	xmm0, xmm8
+	movdqu	XMMWORD PTR [rcx+rdi], xmm0	/* original location of the second hash receives xmm5 ^ xmm8 */
+	movq	rdi, xmm5
+	movq	rcx, xmm14	/* rcx = base pointer of the first hash */
+	mov	ebp, edi
+	mov	r8, QWORD PTR [rcx+rsi]
+	mov	r10, QWORD PTR [rcx+rsi+8]
+	lea	r9, QWORD PTR [rcx+rsi]
+	xor	esi, 16
+
+	movq xmm0, rsp	/* park rsp and five more registers in xmm registers / stack slots; part2 restores them */
+	movq xmm1, rsi
+	movq xmm2, rdi
+	movq xmm11, rbp
+	movq xmm12, r15
+	movq xmm13, rdx
+	mov [rsp+104], rcx
+	mov [rsp+112], r9
+
+	mov ebx, DWORD PTR [rsp+16]	/* ebx, esi, edi, ebp = the four dwords kept at rsp+16..rsp+31 */
+	mov esi, DWORD PTR [rsp+20]
+	mov edi, DWORD PTR [rsp+24]
+	mov ebp, DWORD PTR [rsp+28]
+
+	lea	eax, [ebx+esi]
+	lea	edx, [edi+ebp]
+	shl rdx, 32
+	or	rax, rdx
+	xor r8, rax	/* r8 ^= (ebx+esi) | ((edi+ebp) << 32) */
+
+	movd esp, xmm3	/* this overwrites the stack pointer (saved in xmm0): no stack access until part2 reloads rsp */
+	pextrd r15d, xmm3, 2
+	movd eax, xmm7	/* NOTE(review): esp, r15d, eax, edx, r9d are set here but reloaded or overwritten in part2 before any use; presumably inputs for code inserted between the sections - confirm against the generator */
+	movd edx, xmm9
+	pextrd r9d, xmm9, 2
+
+FN_PREFIX(CryptonightR_template_double_part2):	/* Phase 2: finishes the first hash (multiply, second shuffle, write-back) and prepares the multiply operand of the second hash. */
+
+	mov eax, edi
+	mov edx, ebp
+	shl rdx, 32
+	or rax, rdx
+	xor r14, rax	/* r14 ^= edi | (ebp << 32) */
+
+	mov eax, ebx
+	mov edx, esi
+	shl rdx, 32
+	or rax, rdx
+	xor r12, rax	/* r12 ^= ebx | (esi << 32) */
+
+	movq rsp, xmm0	/* stack pointer valid again */
+	mov DWORD PTR [rsp+16], ebx	/* write the four dwords back to their slots at rsp+16..rsp+31 */
+	mov DWORD PTR [rsp+20], esi
+	mov DWORD PTR [rsp+24], edi
+	mov DWORD PTR [rsp+28], ebp
+
+	movq rsi, xmm1	/* restore the registers parked at the end of the previous section */
+	movq rdi, xmm2
+	movq rbp, xmm11
+	movq r15, xmm12
+	movq rdx, xmm13
+	mov rcx, [rsp+104]
+	mov r9, [rsp+112]
+
+	mov rbx, r8
+	mov	rax, r8
+	mul	rdx	/* rdx:rax = r8 * rdx (unsigned) */
+	and	ebp, 2097136
+	mov	r8, rax
+	movdqu	xmm1, XMMWORD PTR [rcx+rsi]
+	pxor	xmm6, xmm1
+	xor	esi, 48
+	paddq	xmm1, xmm7
+	movdqu	xmm2, XMMWORD PTR [rsi+rcx]
+	pxor	xmm6, xmm2
+	paddq	xmm2, xmm3
+	movdqu	XMMWORD PTR [rsi+rcx], xmm1
+	xor	esi, 16
+	mov	eax, esi
+	mov	rsi, rcx	/* rsi = base pointer of the first hash again */
+	movdqu	xmm0, XMMWORD PTR [rax+rcx]
+	pxor	xmm6, xmm0
+	movdqu	XMMWORD PTR [rax+rcx], xmm2
+	paddq	xmm0, xmm9
+	add	r12, r8	/* low half of the product goes to r12, high half to r14 */
+	xor	rax, 32
+	add	r14, rdx
+	movdqa	xmm9, xmm7	/* rotate the first hash vectors: xmm9 <- xmm7 <- xmm6 */
+	movdqa	xmm7, xmm6
+	movdqu	XMMWORD PTR [rax+rcx], xmm0
+	mov	QWORD PTR [r9+8], r12
+	xor	r12, r10
+	mov	QWORD PTR [r9], r14
+	movq rcx, xmm15	/* rcx = base pointer of the second hash */
+	xor	r14, rbx
+	mov	r10d, ebp
+	mov	ebx, r14d
+	xor	ebp, 16
+	and	ebx, 2097136	/* ebx = masked offset of the first hash for the next iteration */
+	mov	r8, QWORD PTR [r10+rcx]
+	mov	r9, QWORD PTR [r10+rcx+8]
+
+	movq xmm0, rsp	/* park registers again; part3 restores them */
+	movq xmm1, rbx
+	movq xmm2, rsi
+	movq xmm11, rdi
+	movq xmm12, rbp
+	movq xmm13, r15
+	mov [rsp+104], rcx
+	mov [rsp+112], r9
+
+	mov ebx, DWORD PTR [rsp]	/* ebx, esi, edi, ebp = the four dwords kept at rsp+0..rsp+15 */
+	mov esi, DWORD PTR [rsp+4]
+	mov edi, DWORD PTR [rsp+8]
+	mov ebp, DWORD PTR [rsp+12]
+
+	lea	eax, [ebx+esi]
+	lea	edx, [edi+ebp]
+	shl rdx, 32
+	or	rax, rdx
+
+	xor r8, rax
+	movq xmm3, r8	/* copy of the operand; part3 reads it back from xmm3 after the multiply */
+
+	movd esp, xmm4	/* the stack pointer is overwritten again (saved in xmm0) until part3 reloads it */
+	pextrd r15d, xmm4, 2
+	movd eax, xmm8	/* NOTE(review): same pattern as at the end of the mainloop section; these values are not used by the code that follows here */
+	movd edx, xmm10
+	pextrd r9d, xmm10, 2
+
+FN_PREFIX(CryptonightR_template_double_part3):	/* Phase 3: finishes the second hash (multiply, second shuffle, write-back) and branches back to the main loop. */
+
+	movq r15, xmm13
+
+	mov eax, edi
+	mov edx, ebp
+	shl rdx, 32
+	or rax, rdx
+	xor r15, rax	/* r15 ^= edi | (ebp << 32) */
+
+	mov eax, ebx
+	mov edx, esi
+	shl rdx, 32
+	or rax, rdx
+	xor r13, rax	/* r13 ^= ebx | (esi << 32) */
+
+	movq rsp, xmm0	/* stack pointer valid again */
+	mov DWORD PTR [rsp], ebx	/* write the four dwords back to their slots at rsp+0..rsp+15 */
+	mov DWORD PTR [rsp+4], esi
+	mov DWORD PTR [rsp+8], edi
+	mov DWORD PTR [rsp+12], ebp
+
+	movq rbx, xmm1	/* restore the registers parked at the end of part2 */
+	movq rsi, xmm2
+	movq rdi, xmm11
+	movq rbp, xmm12
+	mov rcx, [rsp+104]
+	mov r9, [rsp+112]
+
+	mov rax, r8
+	mul	rdi	/* rdx:rax = r8 * rdi (unsigned) */
+	mov	rdi, rcx	/* rdi = base pointer of the second hash again */
+	mov	r8, rax
+	movdqu	xmm1, XMMWORD PTR [rbp+rcx]
+	pxor xmm5, xmm1
+	xor	ebp, 48
+	paddq	xmm1, xmm8
+	add	r13, r8	/* low half of the product goes to r13, high half to r15 */
+	movdqu	xmm2, XMMWORD PTR [rbp+rcx]
+	pxor xmm5, xmm2
+	add	r15, rdx
+	movdqu	XMMWORD PTR [rbp+rcx], xmm1
+	paddq	xmm2, xmm4
+	xor	ebp, 16
+	mov	eax, ebp
+	xor	rax, 32
+	movdqu	xmm0, XMMWORD PTR [rbp+rcx]
+	pxor xmm5, xmm0
+	movdqu	XMMWORD PTR [rbp+rcx], xmm2
+	paddq	xmm0, xmm10
+	movdqu	XMMWORD PTR [rax+rcx], xmm0
+	movq rax, xmm3	/* rax = the multiply operand saved in part2 */
+	movdqa	xmm10, xmm8	/* rotate the second hash vectors: xmm10 <- xmm8 <- xmm5 */
+	mov	QWORD PTR [r10+rcx], r15
+	movdqa	xmm8, xmm5
+	xor	r15, rax
+	mov	QWORD PTR [r10+rcx+8], r13
+	mov	r8d, r15d
+	xor	r13, r9
+	and	r8d, 2097136	/* r8 = masked offset of the second hash for the next iteration */
+	dec r11d
+	jnz	FN_PREFIX(CryptonightR_template_double_mainloop)
+
+FN_PREFIX(CryptonightR_template_double_part4):	/* Epilogue: undoes the double_part1 prologue. */
+
+	mov	rbx, QWORD PTR [rsp+400]	/* 400 = 320 (locals) + 56 (seven pushes) + 24: the slot rbx was stored to on entry */
+	movaps	xmm6, XMMWORD PTR [rsp+160]
+	movaps	xmm7, XMMWORD PTR [rsp+176]
+	movaps	xmm8, XMMWORD PTR [rsp+192]
+	movaps	xmm9, XMMWORD PTR [rsp+208]
+	movaps	xmm10, XMMWORD PTR [rsp+224]
+	movaps	xmm11, XMMWORD PTR [rsp+240]
+	movaps	xmm12, XMMWORD PTR [rsp+256]
+	movaps	xmm13, XMMWORD PTR [rsp+272]
+	movaps	xmm14, XMMWORD PTR [rsp+288]
+	movaps	xmm15, XMMWORD PTR [rsp+304]
+	add	rsp, 320
+	pop	r15
+	pop	r14
+	pop	r13
+	pop	r12
+	pop	rdi
+	pop	rsi
+	pop	rbp
+	ret	0
+FN_PREFIX(CryptonightR_template_double_end):	/* label just past the last instruction of the two-hash template */
--- a/crypto/asm/cn2/cnv2_double_main_loop_sandybridge.inc
+++ b/crypto/asm/cn2/cnv2_double_main_loop_sandybridge.inc
@ -0,0 +1,410 @@
+	mov	rax, rsp	/* Setup of the two-hash loop body (this file is #included under a label by cn_main_loop.S). rcx / rdx point to the two state structures; rax keeps the entry rsp for the xmm6-xmm8 saves below. */
+	push	rbx
+	push	rbp
+	push	rsi
+	push	rdi
+	push	r12
+	push	r13
+	push	r14
+	push	r15
+	sub	rsp, 184
+
+	stmxcsr DWORD PTR [rsp+272]	/* save the caller MXCSR; 272 = 184 + 64 (eight pushes) + 24, i.e. entry rsp+24 */
+	mov DWORD PTR [rsp+276], 24448	/* 24448 = 0x5F80: the default 0x1F80 with bit 14 set, i.e. rounding-control bits 14:13 = 10b (round up per the Intel SDM) */
+	ldmxcsr DWORD PTR [rsp+276]
+
+	mov	r13, QWORD PTR [rcx+224]	/* r13 = pointer at a+224: memory base for the first hash */
+	mov	r9, rdx	/* r9 = second structure */
+	mov	r10, QWORD PTR [rcx+32]
+	mov	r8, rcx	/* r8 = first structure */
+	xor	r10, QWORD PTR [rcx]	/* r10 = [a+32] ^ [a+0] */
+	mov	r14d, 524288	/* loop counter: 524288 iterations */
+	mov	r11, QWORD PTR [rcx+40]
+	xor	r11, QWORD PTR [rcx+8]	/* r11 = [a+40] ^ [a+8] */
+	mov	rsi, QWORD PTR [rdx+224]	/* rsi = pointer at b+224: memory base for the second hash */
+	mov	rdx, QWORD PTR [rcx+56]
+	xor	rdx, QWORD PTR [rcx+24]
+	mov	rdi, QWORD PTR [r9+32]
+	xor	rdi, QWORD PTR [r9]	/* rdi = [b+32] ^ [b+0] */
+	mov	rbp, QWORD PTR [r9+40]
+	xor	rbp, QWORD PTR [r9+8]	/* rbp = [b+40] ^ [b+8] */
+	movq	xmm0, rdx
+	movaps	XMMWORD PTR [rax-88], xmm6	/* rax-88 equals rsp+160 after the pushes and the sub above */
+	movaps	XMMWORD PTR [rax-104], xmm7
+	movaps	XMMWORD PTR [rax-120], xmm8
+	movaps	XMMWORD PTR [rsp+112], xmm9
+	movaps	XMMWORD PTR [rsp+96], xmm10
+	movaps	XMMWORD PTR [rsp+80], xmm11
+	movaps	XMMWORD PTR [rsp+64], xmm12
+	movaps	XMMWORD PTR [rsp+48], xmm13
+	movaps	XMMWORD PTR [rsp+32], xmm14
+	movaps	XMMWORD PTR [rsp+16], xmm15
+	mov	rdx, r10
+	movq	xmm4, QWORD PTR [r8+96]
+	and	edx, 2097136	/* 0x1FFFF0 offset mask; edx = first offset of the first hash */
+	mov	rax, QWORD PTR [rcx+48]
+	xorps	xmm13, xmm13	/* xmm13 stays zero; the loop copies it to clear conversion targets */
+	xor	rax, QWORD PTR [rcx+16]
+	mov	rcx, QWORD PTR [rcx+88]
+	xor	rcx, QWORD PTR [r8+72]
+	movq	xmm5, QWORD PTR [r8+104]
+	movq	xmm7, rax
+
+	mov eax, 1
+	shl rax, 52
+	movq xmm14, rax
+	punpcklqdq xmm14, xmm14	/* xmm14 = { 1<<52, 1<<52 } */
+
+	mov eax, 1023
+	shl rax, 52
+	movq xmm12, rax
+	punpcklqdq xmm12, xmm12	/* xmm12 = { 1023<<52, 1023<<52 } (bit pattern of the double 1.0 in both lanes) */
+
+	mov	rax, QWORD PTR [r8+80]
+	xor	rax, QWORD PTR [r8+64]
+	punpcklqdq xmm7, xmm0	/* xmm7 = { [a+48]^[a+16], [a+56]^[a+24] } */
+	movq	xmm0, rcx
+	mov	rcx, QWORD PTR [r9+56]
+	xor	rcx, QWORD PTR [r9+24]
+	movq	xmm3, rax
+	mov	rax, QWORD PTR [r9+48]
+	xor	rax, QWORD PTR [r9+16]
+	punpcklqdq xmm3, xmm0	/* xmm3 = { [a+80]^[a+64], [a+88]^[a+72] } */
+	movq	xmm0, rcx
+	mov	QWORD PTR [rsp], r13	/* spill the first base pointer; r13 is temporarily reused inside the loop */
+	mov	rcx, QWORD PTR [r9+88]
+	xor	rcx, QWORD PTR [r9+72]
+	movq	xmm6, rax
+	mov	rax, QWORD PTR [r9+80]
+	xor	rax, QWORD PTR [r9+64]
+	punpcklqdq xmm6, xmm0	/* xmm6 = { [b+48]^[b+16], [b+56]^[b+24] } */
+	movq	xmm0, rcx
+	mov	QWORD PTR [rsp+256], r10	/* r10 / r11 of the first hash live in memory at rsp+256 / rsp+264 (entry rsp+8 / +16) between iterations */
+	mov	rcx, rdi
+	mov	QWORD PTR [rsp+264], r11
+	movq	xmm8, rax
+	and	ecx, 2097136	/* ecx = first offset of the second hash */
+	punpcklqdq xmm8, xmm0	/* xmm8 = { [b+80]^[b+64], [b+88]^[b+72] } */
+	movq	xmm0, QWORD PTR [r9+96]
+	punpcklqdq xmm4, xmm0	/* xmm4 = { [a+96], [b+96] } */
+	movq	xmm0, QWORD PTR [r9+104]
+	lea	r8, QWORD PTR [rcx+rsi]
+	movdqu	xmm11, XMMWORD PTR [r8]	/* preload the first block of the second hash */
+	punpcklqdq xmm5, xmm0	/* xmm5 = { [a+104], [b+104] } */
+	lea	r9, QWORD PTR [rdx+r13]
+	movdqu	xmm15, XMMWORD PTR [r9]	/* preload the first block of the first hash */
+
+	ALIGN(64)
+main_loop_double_sandybridge:	/* One iteration, first part: AES round and block shuffle for each hash, multiply of the first hash, then the packed floating-point division estimate for both hashes. */
+	movdqu	xmm9, xmm15
+	mov eax, edx
+	mov ebx, edx
+	xor eax, 16
+	xor ebx, 32
+	xor edx, 48
+
+	movq	xmm0, r11
+	movq	xmm2, r10
+	punpcklqdq xmm2, xmm0	/* xmm2 = { r10, r11 }: round key of the first hash */
+	aesenc	xmm9, xmm2
+
+	movdqu	xmm0, XMMWORD PTR [rax+r13]
+	movdqu	xmm1, XMMWORD PTR [rbx+r13]
+	paddq	xmm0, xmm7
+	paddq	xmm1, xmm2
+	movdqu	XMMWORD PTR [rbx+r13], xmm0
+	movdqu	xmm0, XMMWORD PTR [rdx+r13]
+	movdqu	XMMWORD PTR [rdx+r13], xmm1
+	paddq	xmm0, xmm3
+	movdqu	XMMWORD PTR [rax+r13], xmm0
+
+	movq	r11, xmm9
+	mov	edx, r11d
+	and	edx, 2097136	/* edx = second offset of the first hash */
+	movdqa	xmm0, xmm9
+	pxor	xmm0, xmm7
+	movdqu	XMMWORD PTR [r9], xmm0	/* original location of the first hash receives xmm9 ^ xmm7 */
+
+	lea	rbx, QWORD PTR [rdx+r13]
+	mov	r10, QWORD PTR [rdx+r13]
+
+	movdqu	xmm10, xmm11
+	movq	xmm0, rbp
+	movq	xmm11, rdi
+	punpcklqdq xmm11, xmm0	/* xmm11 = { rdi, rbp }: round key of the second hash */
+	aesenc	xmm10, xmm11
+
+	mov eax, ecx
+	mov r12d, ecx
+	xor eax, 16
+	xor r12d, 32
+	xor ecx, 48
+
+	movdqu	xmm0, XMMWORD PTR [rax+rsi]
+	paddq	xmm0, xmm6
+	movdqu	xmm1, XMMWORD PTR [r12+rsi]
+	movdqu	XMMWORD PTR [r12+rsi], xmm0
+	paddq	xmm1, xmm11
+	movdqu	xmm0, XMMWORD PTR [rcx+rsi]
+	movdqu	XMMWORD PTR [rcx+rsi], xmm1
+	paddq	xmm0, xmm8
+	movdqu	XMMWORD PTR [rax+rsi], xmm0
+
+	movq	rcx, xmm10
+	and	ecx, 2097136	/* ecx = second offset of the second hash */
+
+	movdqa	xmm0, xmm10
+	pxor	xmm0, xmm6
+	movdqu	XMMWORD PTR [r8], xmm0	/* original location of the second hash receives xmm10 ^ xmm6 */
+	mov r12, QWORD PTR [rcx+rsi]
+
+	mov	r9, QWORD PTR [rbx+8]
+
+	xor edx, 16
+	mov r8d, edx
+	mov r15d, edx
+
+	movq	rdx, xmm5
+	shl	rdx, 32
+	movq	rax, xmm4
+	xor	rdx, rax
+	xor	r10, rdx	/* r10 ^= low qword of xmm4 ^ (low qword of xmm5 << 32) */
+	mov	rax, r10
+	mul	r11	/* rdx:rax = r10 * r11 (unsigned) */
+	mov r11d, r8d
+	xor r11d, 48
+	movq xmm0, rdx
+	xor rdx, [r11+r13]
+	movq xmm1, rax
+	xor rax, [r11+r13+8]
+	punpcklqdq xmm0, xmm1	/* xmm0 = { high half, low half } of the product, taken before the xor with memory */
+
+	pxor xmm0, XMMWORD PTR [r8+r13]
+	xor	r8d, 32
+	movdqu	xmm1, XMMWORD PTR [r11+r13]
+	paddq	xmm0, xmm7
+	paddq	xmm1, xmm2
+	movdqu	XMMWORD PTR [r11+r13], xmm0
+	movdqu	xmm0, XMMWORD PTR [r8+r13]
+	movdqu	XMMWORD PTR [r8+r13], xmm1
+	paddq	xmm0, xmm3
+	movdqu	XMMWORD PTR [r15+r13], xmm0
+
+	mov	r11, QWORD PTR [rsp+256]	/* reload the spilled state of the first hash */
+	add	r11, rdx
+	mov	rdx, QWORD PTR [rsp+264]
+	add	rdx, rax
+	mov	QWORD PTR [rbx], r11
+	xor	r11, r10
+	mov	QWORD PTR [rbx+8], rdx
+	xor	rdx, r9
+	mov	QWORD PTR [rsp+256], r11
+	and	r11d, 2097136
+	mov	QWORD PTR [rsp+264], rdx
+	mov	QWORD PTR [rsp+8], r11	/* rsp+8 = offset of the first hash for the next iteration */
+	lea	r15, QWORD PTR [r11+r13]
+	movdqu xmm15, XMMWORD PTR [r11+r13]	/* preload the next block of the first hash */
+	lea	r13, QWORD PTR [rsi+rcx]	/* r13 now holds an address of the second hash; the base is reloaded from [rsp] at the end of the iteration */
+	movdqa	xmm0, xmm5
+	psrldq	xmm0, 8
+	movaps	xmm2, xmm13
+	movq	r10, xmm0
+	psllq	xmm5, 1
+	shl	r10, 32
+	movdqa	xmm0, xmm9
+	psrldq	xmm0, 8
+	movdqa	xmm1, xmm10
+	movq	r11, xmm0	/* r11 / r8 = high qwords of the two AES results: the dividends */
+	psrldq	xmm1, 8
+	movq	r8, xmm1
+	psrldq	xmm4, 8
+	movaps	xmm0, xmm13
+	movq	rax, xmm4
+	xor	r10, rax
+	movaps	xmm1, xmm13
+	xor	r10, r12	/* r10 = multiply operand of the second hash */
+	lea	rax, QWORD PTR [r11+1]
+	shr	rax, 1
+	movdqa	xmm3, xmm9
+	punpcklqdq xmm3, xmm10	/* xmm3 = { low qword of xmm9, low qword of xmm10 } */
+	paddq	xmm5, xmm3
+	movq	rdx, xmm5
+	psrldq	xmm5, 8
+	cvtsi2sd xmm2, rax
+	or	edx, -2147483647	/* 0x80000001: forces the divisor odd with bit 31 set; the 32-bit write also clears the upper half of rdx */
+	lea	rax, QWORD PTR [r8+1]
+	shr	rax, 1
+	movq	r9, xmm5
+	cvtsi2sd xmm0, rax
+	or	r9d, -2147483647
+	cvtsi2sd xmm1, rdx
+	unpcklpd xmm2, xmm0	/* xmm2 = both halved dividends as doubles */
+	movaps	xmm0, xmm13
+	cvtsi2sd xmm0, r9
+	unpcklpd xmm1, xmm0	/* xmm1 = both divisors as doubles */
+	divpd	xmm2, xmm1
+	paddq	xmm2, xmm14	/* adds 1 to each exponent field, i.e. doubles both quotients, matching the halving of the dividends */
+	cvttsd2si rax, xmm2
+	psrldq	xmm2, 8
+	mov	rbx, rax
+	imul	rax, rdx
+	sub	r11, rax	/* r11 = dividend - estimate * divisor */
+	js	div_fix_1_sandybridge	/* negative remainder: take the correction path */
+div_fix_1_ret_sandybridge:	/* Rest of the iteration: second division check, packed square root with its checks, multiply of the second hash, state rotation and loop branch. */
+
+	cvttsd2si rdx, xmm2
+	mov	rax, rdx
+	imul	rax, r9
+	movd	xmm2, r11d
+	movd	xmm4, ebx
+	sub	r8, rax
+	js	div_fix_2_sandybridge
+div_fix_2_ret_sandybridge:
+
+	movd	xmm1, r8d
+	movd	xmm0, edx
+	punpckldq xmm2, xmm1
+	punpckldq xmm4, xmm0
+	punpckldq xmm4, xmm2	/* per lane: low dword = quotient, high dword = remainder */
+	paddq	xmm3, xmm4
+	movdqa	xmm0, xmm3
+	psrlq	xmm0, 12
+	paddq	xmm0, xmm12
+	sqrtpd	xmm1, xmm0	/* square roots of ((xmm3 >> 12) + (1023<<52)) reinterpreted as doubles, both lanes */
+	movq	r9, xmm1
+	movdqa xmm5, xmm1
+	psrlq xmm5, 19	/* xmm5 = raw sqrt bits >> 19 in both lanes */
+	test	r9, 524287
+	je	sqrt_fix_1_sandybridge	/* taken when the low 19 bits of the low-lane result are all zero */
+sqrt_fix_1_ret_sandybridge:
+
+	movq r9, xmm10
+	psrldq	xmm1, 8
+	movq	r8, xmm1
+	test	r8, 524287
+	je	sqrt_fix_2_sandybridge	/* same check for the high lane */
+sqrt_fix_2_ret_sandybridge:
+
+	mov r12d, ecx
+	mov r8d, ecx
+	xor r12d, 16
+	xor r8d, 32
+	xor ecx, 48
+	mov	rax, r10
+	mul	r9	/* rdx:rax = r10 * r9 (unsigned); r9 = low qword of xmm10 */
+	movq xmm0, rax
+	movq xmm3, rdx
+	punpcklqdq xmm3, xmm0	/* xmm3 = { high half, low half } of the product */
+
+	movdqu	xmm0, XMMWORD PTR [r12+rsi]
+	pxor xmm0, xmm3
+	movdqu	xmm1, XMMWORD PTR [r8+rsi]
+	xor rdx, [r8+rsi]
+	xor rax, [r8+rsi+8]
+	movdqu	xmm3, XMMWORD PTR [rcx+rsi]
+	paddq	xmm0, xmm6
+	paddq	xmm1, xmm11
+	paddq	xmm3, xmm8
+	movdqu	XMMWORD PTR [r8+rsi], xmm0
+	movdqu	XMMWORD PTR [rcx+rsi], xmm1
+	movdqu	XMMWORD PTR [r12+rsi], xmm3
+
+	add	rdi, rdx
+	mov	QWORD PTR [r13], rdi
+	xor	rdi, r10
+	mov	ecx, edi
+	and	ecx, 2097136	/* ecx = offset of the second hash for the next iteration */
+	lea	r8, QWORD PTR [rcx+rsi]
+
+	mov rdx, QWORD PTR [r13+8]	
+	add	rbp, rax
+	mov	QWORD PTR [r13+8], rbp
+	movdqu xmm11, XMMWORD PTR [rcx+rsi]	/* preload the next block of the second hash */
+	xor	rbp, rdx
+	mov	r13, QWORD PTR [rsp]	/* r13 = base pointer of the first hash again */
+	movdqa	xmm3, xmm7	/* rotate the per-hash vectors: xmm3 <- xmm7 <- xmm9 and xmm8 <- xmm6 <- xmm10 */
+	mov	rdx, QWORD PTR [rsp+8]
+	movdqa	xmm8, xmm6
+	mov	r10, QWORD PTR [rsp+256]
+	movdqa	xmm7, xmm9
+	mov	r11, QWORD PTR [rsp+264]
+	movdqa	xmm6, xmm10
+	mov	r9, r15
+	dec r14d
+	jne	main_loop_double_sandybridge
+
+	ldmxcsr DWORD PTR [rsp+272]	/* Epilogue: restore the caller MXCSR saved during setup, then the xmm and integer registers. */
+	movaps	xmm13, XMMWORD PTR [rsp+48]
+	lea	r11, QWORD PTR [rsp+184]	/* r11 = address of the last pushed register (r15) */
+	movaps	xmm6, XMMWORD PTR [r11-24]
+	movaps	xmm7, XMMWORD PTR [r11-40]
+	movaps	xmm8, XMMWORD PTR [r11-56]
+	movaps	xmm9, XMMWORD PTR [r11-72]
+	movaps	xmm10, XMMWORD PTR [r11-88]
+	movaps	xmm11, XMMWORD PTR [r11-104]
+	movaps	xmm12, XMMWORD PTR [r11-120]
+	movaps	xmm14, XMMWORD PTR [rsp+32]
+	movaps	xmm15, XMMWORD PTR [rsp+16]
+	mov	rsp, r11
+	pop	r15
+	pop	r14
+	pop	r13
+	pop	r12
+	pop	rdi
+	pop	rsi
+	pop	rbp
+	pop	rbx
+	jmp cnv2_double_mainloop_asm_sandybridge_endp	/* skip the out-of-line correction paths; the including file supplies the final ret */
+
+div_fix_1_sandybridge:	/* Out-of-line correction paths of the two-hash loop. This one: the quotient estimate in rbx was too large, so lower it by one and add the divisor (rdx) back to the remainder. */
+	dec	rbx
+	add	r11, rdx
+	jmp	div_fix_1_ret_sandybridge
+
+div_fix_2_sandybridge:	/* same correction for the second hash: quotient in rdx, remainder in r8, divisor in r9 */
+	dec	rdx
+	add	r8, r9
+	jmp	div_fix_2_ret_sandybridge
+
+sqrt_fix_1_sandybridge:	/* recomputes the low lane of xmm5 from the raw sqrt bits in r9; the high lane is preserved through xmm0 */
+	movq	r8, xmm3	/* r8 = low qword of the sqrt input sum */
+	movdqa xmm0, xmm5
+	psrldq xmm0, 8
+	dec	r9
+	mov r11d, -1022
+	shl r11, 32
+	mov	rax, r9
+	shr	r9, 19
+	shr	rax, 20
+	mov	rdx, r9
+	sub	rdx, rax
+	lea	rdx, [rdx+r11+1]
+	add	rax, r11
+	imul	rdx, rax
+	sub	rdx, r8
+	adc	r9, 0	/* the result is (raw-1)>>19, plus one when the subtraction above borrows */
+	movq xmm5, r9
+	punpcklqdq xmm5, xmm0
+	jmp	sqrt_fix_1_ret_sandybridge
+
+sqrt_fix_2_sandybridge:	/* same recomputation for the high lane, from the raw bits in r8 */
+	psrldq	xmm3, 8
+	movq	r11, xmm3	/* r11 = high qword of the sqrt input sum */
+	dec	r8
+	mov ebx, -1022
+	shl rbx, 32
+	mov	rax, r8
+	shr	r8, 19
+	shr	rax, 20
+	mov	rdx, r8
+	sub	rdx, rax
+	lea	rdx, [rdx+rbx+1]
+	add	rax, rbx
+	imul	rdx, rax
+	sub	rdx, r11
+	adc	r8, 0
+	movq xmm0, r8
+	punpcklqdq xmm5, xmm0	/* keeps the low lane of xmm5 and replaces the high lane with r8 */
+	jmp	sqrt_fix_2_ret_sandybridge
+
+cnv2_double_mainloop_asm_sandybridge_endp:	/* end of the included body; execution continues in the including file */
--- a/crypto/asm/cn2/cnv2_main_loop_bulldozer.inc
+++ b/crypto/asm/cn2/cnv2_main_loop_bulldozer.inc
@ -0,0 +1,180 @@
+	mov	QWORD PTR [rsp+16], rbx	/* Setup of the single-hash loop body (this file is #included under a label by cn_main_loop.S). rcx points to the state structure; rbx, rbp, rsi are stored above the entry rsp instead of being pushed. */
+	mov	QWORD PTR [rsp+24], rbp
+	mov	QWORD PTR [rsp+32], rsi
+	push	rdi
+	push	r12
+	push	r13
+	push	r14
+	push	r15
+	sub	rsp, 64
+
+	stmxcsr DWORD PTR [rsp]	/* save the caller MXCSR at rsp+0 */
+	mov DWORD PTR [rsp+4], 24448	/* 24448 = 0x5F80: the default 0x1F80 with bit 14 set, i.e. rounding-control bits 14:13 = 10b (round up per the Intel SDM) */
+	ldmxcsr DWORD PTR [rsp+4]
+
+	mov	rax, QWORD PTR [rcx+48]
+	mov	r9, rcx	/* r9 = structure pointer (rcx is reused as scratch) */
+	xor	rax, QWORD PTR [rcx+16]
+	mov	ebp, 524288	/* loop counter: 524288 iterations */
+	mov	r8, QWORD PTR [rcx+32]
+	xor	r8, QWORD PTR [rcx]	/* r8 = [ctx+32] ^ [ctx+0] */
+	mov	r11, QWORD PTR [rcx+40]
+	mov	r10, r8
+	mov	rdx, QWORD PTR [rcx+56]
+	movq	xmm3, rax
+	xor	rdx, QWORD PTR [rcx+24]
+	xor	r11, QWORD PTR [rcx+8]	/* r11 = [ctx+40] ^ [ctx+8] */
+	mov	rbx, QWORD PTR [rcx+224]	/* rbx = pointer at ctx+224: base of every masked memory access (presumably the scratchpad) */
+	mov	rax, QWORD PTR [r9+80]
+	xor	rax, QWORD PTR [r9+64]
+	movq	xmm0, rdx
+	mov	rcx, QWORD PTR [rcx+88]
+	xor	rcx, QWORD PTR [r9+72]
+	mov	rdi, QWORD PTR [r9+104]
+	and	r10d, 2097136	/* 0x1FFFF0 offset mask */
+	movaps	XMMWORD PTR [rsp+48], xmm6
+	movq	xmm4, rax
+	movaps	XMMWORD PTR [rsp+32], xmm7
+	movaps	XMMWORD PTR [rsp+16], xmm8
+	xorps	xmm8, xmm8	/* xmm8 stays zero for the whole loop */
+	mov ax, 1023	/* only the low 16 bits of rax are written; the shift by 52 discards every bit above bit 11, so stale upper bits do not matter */
+	shl rax, 52
+	movq xmm7, rax	/* xmm7 = 1023<<52; not read again in this file */
+	mov	r15, QWORD PTR [r9+96]
+	punpcklqdq xmm3, xmm0	/* xmm3 = { [ctx+48]^[ctx+16], [ctx+56]^[ctx+24] } */
+	movq	xmm0, rcx
+	punpcklqdq xmm4, xmm0	/* xmm4 = { [ctx+80]^[ctx+64], [ctx+88]^[ctx+72] } */
+
+	ALIGN(64)
+cnv2_main_loop_bulldozer:	/* One iteration: AES round and block shuffle, integer division and square root, 64x64 multiply, second shuffle, write-back. */
+	movdqa	xmm5, XMMWORD PTR [r10+rbx]
+	movq xmm6, r8
+	pinsrq xmm6, r11, 1	/* xmm6 = { r8, r11 }: the round key */
+	lea	rdx, QWORD PTR [r10+rbx]
+	lea	r9, QWORD PTR [rdi+rdi]	/* r9 = 2 * rdi (rdi carries the previous square-root result) */
+	shl	rdi, 32
+
+	mov	ecx, r10d
+	mov	eax, r10d
+	xor	ecx, 16
+	xor	eax, 32
+	xor	r10d, 48
+	aesenc	xmm5, xmm6
+	movdqa	xmm2, XMMWORD PTR [rcx+rbx]
+	movdqa	xmm1, XMMWORD PTR [rax+rbx]
+	movdqa	xmm0, XMMWORD PTR [r10+rbx]
+	paddq	xmm2, xmm3
+	paddq	xmm1, xmm6
+	paddq	xmm0, xmm4
+	movdqa	XMMWORD PTR [rcx+rbx], xmm0	/* the three neighbouring blocks are stored back rotated */
+	movdqa	XMMWORD PTR [rax+rbx], xmm2
+	movdqa	XMMWORD PTR [r10+rbx], xmm1
+
+	movaps	xmm1, xmm8
+	mov	rsi, r15
+	xor	rsi, rdi	/* rsi = r15 ^ (previous sqrt result << 32) */
+
+	mov edi, 1023
+	shl rdi, 52	/* rdi = 1023<<52, added to the sqrt input below */
+
+	movq	r14, xmm5
+	pextrq rax, xmm5, 1	/* rax = high qword of the AES result: the dividend */
+
+	movdqa	xmm0, xmm5
+	pxor	xmm0, xmm3
+	mov	r10, r14
+	and	r10d, 2097136	/* r10 = second masked offset */
+	movdqa	XMMWORD PTR [rdx], xmm0	/* original location receives xmm5 ^ xmm3 */
+	xor	rsi, QWORD PTR [r10+rbx]
+	lea	r12, QWORD PTR [r10+rbx]
+	mov	r13, QWORD PTR [r10+rbx+8]
+
+	add	r9d, r14d
+	or	r9d, -2147483647	/* 0x80000001: forces the divisor odd with bit 31 set; 32-bit writes keep the upper half of r9 zero */
+	xor	edx, edx
+	div	r9	/* unsigned rdx:rax / r9 with rdx = 0 */
+	mov	eax, eax	/* keep only the low 32 bits of the quotient */
+	shl	rdx, 32
+	lea	r15, [rax+rdx]	/* r15 = quotient (low dword) | remainder << 32 */
+	lea	rax, [r14+r15]
+	shr	rax, 12
+	add	rax, rdi
+	movq	xmm0, rax
+	sqrtsd	xmm1, xmm0
+	movq	rdi, xmm1
+	test	rdi, 524287
+	je	sqrt_fixup_bulldozer	/* taken when the low 19 bits of the raw result are all zero */
+	shr	rdi, 19
+
+sqrt_fixup_bulldozer_ret:
+	mov	rax, rsi
+	mul	r14	/* rdx:rax = rsi * r14 (unsigned) */
+	movq xmm1, rax
+	movq xmm0, rdx
+	punpcklqdq xmm0, xmm1	/* xmm0 = { high half, low half } of the product */
+
+	mov	r9d, r10d
+	mov	ecx, r10d
+	xor	r9d, 16
+	xor	ecx, 32
+	xor	r10d, 48
+	movdqa	xmm1, XMMWORD PTR [rcx+rbx]
+	xor rdx, [rcx+rbx]
+	xor rax, [rcx+rbx+8]
+	movdqa	xmm2, XMMWORD PTR [r9+rbx]
+	pxor xmm2, xmm0
+	paddq xmm4, XMMWORD PTR [r10+rbx]
+	paddq	xmm2, xmm3
+	paddq	xmm1, xmm6
+	movdqa	XMMWORD PTR [r9+rbx], xmm4
+	movdqa	XMMWORD PTR [rcx+rbx], xmm2
+	movdqa	XMMWORD PTR [r10+rbx], xmm1
+
+	movdqa	xmm4, xmm3	/* rotate: xmm4 <- xmm3 <- xmm5 */
+	add	r8, rdx
+	add	r11, rax
+	mov	QWORD PTR [r12], r8
+	xor	r8, rsi
+	mov	QWORD PTR [r12+8], r11
+	mov	r10, r8
+	xor	r11, r13
+	and	r10d, 2097136	/* r10 = masked offset for the next iteration */
+	movdqa	xmm3, xmm5
+	dec	ebp
+	jne	cnv2_main_loop_bulldozer
+
+	ldmxcsr DWORD PTR [rsp]	/* Epilogue: restore the caller MXCSR saved during setup, then the saved registers. */
+	movaps	xmm6, XMMWORD PTR [rsp+48]
+	lea	r11, QWORD PTR [rsp+64]	/* r11 = address of the last pushed register (r15) */
+	mov	rbx, QWORD PTR [r11+56]	/* r11+56 = rsp+120 = 64 (locals) + 40 (five pushes) + 16: the slot rbx was stored to on entry */
+	mov	rbp, QWORD PTR [r11+64]
+	mov	rsi, QWORD PTR [r11+72]
+	movaps	xmm8, XMMWORD PTR [r11-48]
+	movaps	xmm7, XMMWORD PTR [rsp+32]
+	mov	rsp, r11
+	pop	r15
+	pop	r14
+	pop	r13
+	pop	r12
+	pop	rdi
+	jmp cnv2_main_loop_bulldozer_endp	/* skip the out-of-line correction path; the including file supplies the final ret */
+
+sqrt_fixup_bulldozer:	/* Correction path for the square root: recomputes rdi from the raw sqrtsd bits when their low 19 bits are all zero. */
+	movq r9, xmm5
+	add r9, r15	/* r9 = the integer sqrt input (low qword of xmm5 + r15), before the shift by 12 */
+	dec	rdi
+	mov edx, -1022
+	shl rdx, 32
+	mov	rax, rdi
+	shr	rdi, 19
+	shr	rax, 20
+	mov	rcx, rdi
+	sub	rcx, rax
+	lea	rcx, [rcx+rdx+1]
+	add	rax, rdx
+	imul	rcx, rax
+	sub	rcx, r9
+	adc	rdi, 0	/* the result is (raw-1)>>19, plus one when the subtraction above borrows */
+	jmp	sqrt_fixup_bulldozer_ret
+
+cnv2_main_loop_bulldozer_endp:	/* end of the included body; execution continues in the including file */
--- a/crypto/asm/cn2/cnv2_main_loop_ivybridge.inc
+++ b/crypto/asm/cn2/cnv2_main_loop_ivybridge.inc
@ -0,0 +1,186 @@
+	mov	 QWORD PTR [rsp+24], rbx	/* Setup of the single-hash loop body (this file is #included under a label by cn_main_loop.S). rcx points to the state structure; rbx is stored above the entry rsp instead of being pushed. */
+	push	 rbp
+	push	 rsi
+	push	 rdi
+	push	 r12
+	push	 r13
+	push	 r14
+	push	 r15
+	sub	 rsp, 80
+
+	stmxcsr DWORD PTR [rsp]	/* save the caller MXCSR at rsp+0 */
+	mov DWORD PTR [rsp+4], 24448	/* 24448 = 0x5F80: the default 0x1F80 with bit 14 set, i.e. rounding-control bits 14:13 = 10b (round up per the Intel SDM) */
+	ldmxcsr DWORD PTR [rsp+4]
+
+	mov	 rax, QWORD PTR [rcx+48]
+	mov	 r9, rcx	/* r9 = structure pointer (rcx is reused as scratch) */
+	xor	 rax, QWORD PTR [rcx+16]
+	mov	 esi, 524288	/* loop counter: 524288 iterations */
+	mov	 r8, QWORD PTR [rcx+32]
+	mov	 r13d, -2147483647	/* r13d = 0x80000001, OR-ed into every divisor inside the loop */
+	xor	 r8, QWORD PTR [rcx]	/* r8 = [ctx+32] ^ [ctx+0] */
+	mov	 r11, QWORD PTR [rcx+40]
+	mov	 r10, r8
+	mov	 rdx, QWORD PTR [rcx+56]
+	movq	 xmm4, rax
+	xor	 rdx, QWORD PTR [rcx+24]
+	xor	 r11, QWORD PTR [rcx+8]	/* r11 = [ctx+40] ^ [ctx+8] */
+	mov	 rbx, QWORD PTR [rcx+224]	/* rbx = pointer at ctx+224: base of every masked memory access (presumably the scratchpad) */
+	mov	 rax, QWORD PTR [r9+80]
+	xor	 rax, QWORD PTR [r9+64]
+	movq	 xmm0, rdx
+	mov	 rcx, QWORD PTR [rcx+88]
+	xor	 rcx, QWORD PTR [r9+72]
+	movq	 xmm3, QWORD PTR [r9+104]	/* xmm3 carries the square-root value from one iteration to the next */
+	movaps	 XMMWORD PTR [rsp+64], xmm6
+	movaps	 XMMWORD PTR [rsp+48], xmm7
+	movaps	 XMMWORD PTR [rsp+32], xmm8
+	and	 r10d, 2097136	/* 0x1FFFF0 offset mask */
+	movq	 xmm5, rax
+
+	xor eax, eax
+	mov QWORD PTR [rsp+16], rax	/* zero qword at rsp+16; it is the operand of the psubq inside the loop */
+
+	mov ax, 1023	/* rax is zero here, so rax = 1023 */
+	shl rax, 52
+	movq xmm8, rax	/* xmm8 = 1023<<52 (bit pattern of the double 1.0) */
+	mov r15, QWORD PTR [r9+96]
+	punpcklqdq xmm4, xmm0	/* xmm4 = { [ctx+48]^[ctx+16], [ctx+56]^[ctx+24] } */
+	movq	 xmm0, rcx
+	punpcklqdq xmm5, xmm0	/* xmm5 = { [ctx+80]^[ctx+64], [ctx+88]^[ctx+72] } */
+	movdqu	 xmm6, XMMWORD PTR [r10+rbx]	/* preload the first block */
+
+	ALIGN(64)
+main_loop_ivybridge:	/* One iteration: AES round and block shuffle, integer division and square root, 64x64 multiply, second shuffle, write-back and preload of the next block. */
+	lea	 rdx, QWORD PTR [r10+rbx]
+	mov	 ecx, r10d
+	mov	 eax, r10d
+	mov rdi, r15
+	xor	 ecx, 16
+	xor	 eax, 32
+	xor	 r10d, 48
+	movq	 xmm0, r11
+	movq	 xmm7, r8
+	punpcklqdq xmm7, xmm0	/* xmm7 = { r8, r11 }: the round key */
+	aesenc	 xmm6, xmm7
+	movq	 rbp, xmm6
+	mov	 r9, rbp
+	and	 r9d, 2097136	/* r9 = second masked offset */
+	movdqu	 xmm2, XMMWORD PTR [rcx+rbx]
+	movdqu	 xmm1, XMMWORD PTR [rax+rbx]
+	movdqu	 xmm0, XMMWORD PTR [r10+rbx]
+	paddq	 xmm1, xmm7
+	paddq	 xmm0, xmm5
+	paddq	 xmm2, xmm4
+	movdqu	 XMMWORD PTR [rcx+rbx], xmm0	/* the three neighbouring blocks are stored back rotated */
+	movdqu	 XMMWORD PTR [rax+rbx], xmm2
+	movdqu	 XMMWORD PTR [r10+rbx], xmm1
+	mov r10, r9
+	xor r10d, 32
+	movq	 rcx, xmm3	/* rcx = previous square-root value */
+	mov	 rax, rcx
+	shl	 rax, 32
+	xor	 rdi, rax	/* rdi = r15 ^ (previous sqrt value << 32) */
+	movdqa	 xmm0, xmm6
+	pxor	 xmm0, xmm4
+	movdqu	 XMMWORD PTR [rdx], xmm0	/* original location receives xmm6 ^ xmm4 */
+	xor	 rdi, QWORD PTR [r9+rbx]
+	lea	 r14, QWORD PTR [r9+rbx]
+	mov	 r12, QWORD PTR [r14+8]
+	xor	 edx, edx
+	lea	 r9d, DWORD PTR [ecx+ecx]
+	add	 r9d, ebp
+	movdqa	 xmm0, xmm6
+	psrldq	 xmm0, 8
+	or	 r9d, r13d	/* divisor |= 0x80000001; 32-bit writes keep the upper half of r9 zero */
+	movq	 rax, xmm0	/* rax = high qword of the AES result: the dividend */
+	div	 r9	/* unsigned rdx:rax / r9 with rdx = 0 */
+	xorps xmm3, xmm3
+	mov	 eax, eax	/* keep only the low 32 bits of the quotient */
+	shl	 rdx, 32
+	add	 rdx, rax
+	lea	 r9, QWORD PTR [rdx+rbp]	/* r9 = sqrt input before the shift; also read by the correction path */
+	mov r15, rdx	/* r15 = quotient (low dword) | remainder << 32 */
+	mov	 rax, r9
+	shr	 rax, 12
+	movq	 xmm0, rax
+	paddq	 xmm0, xmm8
+	sqrtsd	 xmm3, xmm0
+	psubq	 xmm3, XMMWORD PTR [rsp+16]	/* the low qword at rsp+16 is zero, so the low lane is unchanged; NOTE(review): only 8 of the 16 bytes read here are initialised, but the high lane of xmm3 is never read in this file */
+	movq	 rdx, xmm3
+	test	 edx, 524287
+	je	 sqrt_fixup_ivybridge	/* taken when the low 19 bits of the raw result are all zero */
+	psrlq	 xmm3, 19
+sqrt_fixup_ivybridge_ret:
+
+	mov	 ecx, r10d
+	mov	 rax, rdi
+	mul	 rbp	/* rdx:rax = rdi * rbp (unsigned) */
+	movq xmm2, rdx
+	xor rdx, [rcx+rbx]
+	add	 r8, rdx
+	mov	 QWORD PTR [r14], r8
+	xor	 r8, rdi
+	mov edi, r8d
+	and edi, 2097136	/* edi = masked offset for the next iteration */
+	movq xmm0, rax
+	xor rax, [rcx+rbx+8]
+	add	 r11, rax
+	mov	 QWORD PTR [r14+8], r11
+	punpcklqdq xmm2, xmm0	/* xmm2 = { high half, low half } of the product, taken before the xor with memory */
+
+	mov	 r9d, r10d
+	xor	 r9d, 48
+	xor	 r10d, 16
+	pxor	 xmm2, XMMWORD PTR [r9+rbx]
+	movdqu	 xmm0, XMMWORD PTR [r10+rbx]
+	paddq	 xmm0, xmm5
+	movdqu	 xmm1, XMMWORD PTR [rcx+rbx]
+	paddq	 xmm2, xmm4
+	paddq	 xmm1, xmm7
+	movdqa	 xmm5, xmm4	/* rotate: xmm5 <- xmm4 <- xmm6 */
+	movdqu	 XMMWORD PTR [r9+rbx], xmm0
+	movdqa	 xmm4, xmm6
+	movdqu	 XMMWORD PTR [rcx+rbx], xmm2
+	movdqu	 XMMWORD PTR [r10+rbx], xmm1
+	movdqu xmm6, [rdi+rbx]	/* preload the block for the next iteration */
+	mov	 r10d, edi
+	xor	 r11, r12
+	dec rsi
+	jne	 main_loop_ivybridge
+
+	ldmxcsr DWORD PTR [rsp]	/* Epilogue: restore the caller MXCSR saved during setup, then the saved registers. */
+	mov	 rbx, QWORD PTR [rsp+160]	/* 160 = 80 (locals) + 56 (seven pushes) + 24: the slot rbx was stored to on entry */
+	movaps	 xmm6, XMMWORD PTR [rsp+64]
+	movaps	 xmm7, XMMWORD PTR [rsp+48]
+	movaps	 xmm8, XMMWORD PTR [rsp+32]
+	add	 rsp, 80
+	pop	 r15
+	pop	 r14
+	pop	 r13
+	pop	 r12
+	pop	 rdi
+	pop	 rsi
+	pop	 rbp
+	jmp cnv2_main_loop_ivybridge_endp	/* skip the out-of-line correction path; the including file supplies the final ret */
+
+sqrt_fixup_ivybridge:	/* Correction path for the square root: recomputes the value from the raw sqrtsd bits in rdx when their low 19 bits are all zero; r9 holds the integer sqrt input. */
+	dec	 rdx
+	mov r13d, -1022	/* r13 is borrowed as scratch and reset to 0x80000001 below */
+	shl r13, 32
+	mov	 rax, rdx
+	shr	 rdx, 19
+	shr	 rax, 20
+	mov	 rcx, rdx
+	sub	 rcx, rax
+	add	 rax, r13
+	not r13
+	sub	 rcx, r13	/* subtracting ~r13 adds r13 + 1 */
+	mov	 r13d, -2147483647
+	imul	 rcx, rax
+	sub	 rcx, r9
+	adc	 rdx, 0	/* the result is (raw-1)>>19, plus one when the subtraction above borrows */
+	movq	 xmm3, rdx
+	jmp	 sqrt_fixup_ivybridge_ret
+
+cnv2_main_loop_ivybridge_endp:	/* end of the included body; execution continues in the including file */
--- a/crypto/asm/cn2/cnv2_main_loop_ryzen.inc
+++ b/crypto/asm/cn2/cnv2_main_loop_ryzen.inc
@ -0,0 +1,179 @@
+	mov	QWORD PTR [rsp+16], rbx	/* Setup of the single-hash loop body (this file is #included under a label by cn_main_loop.S). rcx points to the state structure; rbx, rbp, rsi are stored above the entry rsp instead of being pushed. */
+	mov	QWORD PTR [rsp+24], rbp
+	mov	QWORD PTR [rsp+32], rsi
+	push	rdi
+	push	r12
+	push	r13
+	push	r14
+	push	r15
+	sub	rsp, 64
+
+	stmxcsr DWORD PTR [rsp]	/* save the caller MXCSR at rsp+0 */
+	mov DWORD PTR [rsp+4], 24448	/* 24448 = 0x5F80: the default 0x1F80 with bit 14 set, i.e. rounding-control bits 14:13 = 10b (round up per the Intel SDM) */
+	ldmxcsr DWORD PTR [rsp+4]
+
+	mov	rax, QWORD PTR [rcx+48]
+	mov	r9, rcx	/* r9 = structure pointer (rcx is reused as scratch) */
+	xor	rax, QWORD PTR [rcx+16]
+	mov	ebp, 524288	/* loop counter: 524288 iterations */
+	mov	r8, QWORD PTR [rcx+32]
+	xor	r8, QWORD PTR [rcx]	/* r8 = [ctx+32] ^ [ctx+0] */
+	mov	r11, QWORD PTR [rcx+40]
+	mov	r10, r8
+	mov	rdx, QWORD PTR [rcx+56]
+	movq	xmm3, rax
+	xor	rdx, QWORD PTR [rcx+24]
+	xor	r11, QWORD PTR [rcx+8]	/* r11 = [ctx+40] ^ [ctx+8] */
+	mov	rbx, QWORD PTR [rcx+224]	/* rbx = pointer at ctx+224: base of every masked memory access (presumably the scratchpad) */
+	mov	rax, QWORD PTR [r9+80]
+	xor	rax, QWORD PTR [r9+64]
+	movq	xmm0, rdx
+	mov	rcx, QWORD PTR [rcx+88]
+	xor	rcx, QWORD PTR [r9+72]
+	mov	rdi, QWORD PTR [r9+104]
+	and	r10d, 2097136	/* 0x1FFFF0 offset mask */
+	movaps	XMMWORD PTR [rsp+48], xmm6
+	movq	xmm4, rax
+	movaps	XMMWORD PTR [rsp+32], xmm7
+	movaps	XMMWORD PTR [rsp+16], xmm8
+	xorps	xmm8, xmm8	/* xmm8 stays zero for the whole loop */
+	mov ax, 1023	/* only the low 16 bits of rax are written; the shift by 52 discards every bit above bit 11, so stale upper bits do not matter */
+	shl rax, 52
+	movq xmm7, rax	/* xmm7 = 1023<<52 (bit pattern of the double 1.0), added to the sqrt input in the loop */
+	mov	r15, QWORD PTR [r9+96]
+	punpcklqdq xmm3, xmm0	/* xmm3 = { [ctx+48]^[ctx+16], [ctx+56]^[ctx+24] } */
+	movq	xmm0, rcx
+	punpcklqdq xmm4, xmm0	/* xmm4 = { [ctx+80]^[ctx+64], [ctx+88]^[ctx+72] } */
+
+	ALIGN(64)
+main_loop_ryzen:	/* One iteration: AES round and block shuffle, integer division and square root, 64x64 multiply, second shuffle, write-back. */
+	movdqa	xmm5, XMMWORD PTR [r10+rbx]
+	movq	xmm0, r11
+	movq	xmm6, r8
+	punpcklqdq xmm6, xmm0	/* xmm6 = { r8, r11 }: the round key */
+	lea	rdx, QWORD PTR [r10+rbx]
+	lea	r9, QWORD PTR [rdi+rdi]	/* r9 = 2 * rdi (rdi carries the previous square-root result) */
+	shl	rdi, 32
+
+	mov	ecx, r10d
+	mov	eax, r10d
+	xor	ecx, 16
+	xor	eax, 32
+	xor	r10d, 48
+	aesenc	xmm5, xmm6
+	movdqa	xmm2, XMMWORD PTR [rcx+rbx]
+	movdqa	xmm1, XMMWORD PTR [rax+rbx]
+	movdqa	xmm0, XMMWORD PTR [r10+rbx]
+	paddq	xmm2, xmm3
+	paddq	xmm1, xmm6
+	paddq	xmm0, xmm4
+	movdqa	XMMWORD PTR [rcx+rbx], xmm0	/* the three neighbouring blocks are stored back rotated */
+	movdqa	XMMWORD PTR [rax+rbx], xmm2
+	movdqa	XMMWORD PTR [r10+rbx], xmm1
+
+	movaps	xmm1, xmm8
+	mov	rsi, r15
+	xor	rsi, rdi	/* rsi = r15 ^ (previous sqrt result << 32) */
+	movq	r14, xmm5
+	movdqa	xmm0, xmm5
+	pxor	xmm0, xmm3
+	mov	r10, r14
+	and	r10d, 2097136	/* r10 = second masked offset */
+	movdqa	XMMWORD PTR [rdx], xmm0	/* original location receives xmm5 ^ xmm3 */
+	xor	rsi, QWORD PTR [r10+rbx]
+	lea	r12, QWORD PTR [r10+rbx]
+	mov	r13, QWORD PTR [r10+rbx+8]
+
+	add	r9d, r14d
+	or	r9d, -2147483647	/* 0x80000001: forces the divisor odd with bit 31 set; 32-bit writes keep the upper half of r9 zero */
+	xor	edx, edx
+	movdqa	xmm0, xmm5
+	psrldq	xmm0, 8
+	movq	rax, xmm0	/* rax = high qword of the AES result: the dividend */
+
+	div	r9	/* unsigned rdx:rax / r9 with rdx = 0 */
+	movq xmm0, rax
+	movq xmm1, rdx
+	punpckldq xmm0, xmm1	/* low qword of xmm0 = quotient (low dword) | remainder << 32 */
+	movq r15, xmm0
+	paddq xmm0, xmm5
+	movdqa xmm2, xmm0	/* xmm2 keeps the unshifted sqrt input for the correction path */
+	psrlq xmm0, 12
+	paddq	xmm0, xmm7
+	sqrtsd	xmm1, xmm0
+	movq	rdi, xmm1
+	test	rdi, 524287
+	je	sqrt_fixup_ryzen	/* taken when the low 19 bits of the raw result are all zero */
+	shr	rdi, 19
+
+sqrt_fixup_ryzen_ret:
+	mov	rax, rsi
+	mul	r14	/* rdx:rax = rsi * r14 (unsigned) */
+	movq xmm1, rax
+	movq xmm0, rdx
+	punpcklqdq xmm0, xmm1	/* xmm0 = { high half, low half } of the product */
+
+	mov	r9d, r10d
+	mov	ecx, r10d
+	xor	r9d, 16
+	xor	ecx, 32
+	xor	r10d, 48
+	movdqa	xmm1, XMMWORD PTR [rcx+rbx]
+	xor rdx, [rcx+rbx]
+	xor rax, [rcx+rbx+8]
+	movdqa	xmm2, XMMWORD PTR [r9+rbx]
+	pxor xmm2, xmm0
+	paddq xmm4, XMMWORD PTR [r10+rbx]
+	paddq	xmm2, xmm3
+	paddq	xmm1, xmm6
+	movdqa	XMMWORD PTR [r9+rbx], xmm4
+	movdqa	XMMWORD PTR [rcx+rbx], xmm2
+	movdqa	XMMWORD PTR [r10+rbx], xmm1
+
+	movdqa	xmm4, xmm3	/* rotate: xmm4 <- xmm3 <- xmm5 */
+	add	r8, rdx
+	add	r11, rax
+	mov	QWORD PTR [r12], r8
+	xor	r8, rsi
+	mov	QWORD PTR [r12+8], r11
+	mov	r10, r8
+	xor	r11, r13
+	and	r10d, 2097136	/* r10 = masked offset for the next iteration */
+	movdqa	xmm3, xmm5
+	dec	ebp
+	jne	main_loop_ryzen
+
+	ldmxcsr DWORD PTR [rsp]	/* Epilogue: restore the caller MXCSR saved during setup, then the saved registers. */
+	movaps	xmm6, XMMWORD PTR [rsp+48]
+	lea	r11, QWORD PTR [rsp+64]	/* r11 = address of the last pushed register (r15) */
+	mov	rbx, QWORD PTR [r11+56]	/* r11+56 = rsp+120 = 64 (locals) + 40 (five pushes) + 16: the slot rbx was stored to on entry */
+	mov	rbp, QWORD PTR [r11+64]
+	mov	rsi, QWORD PTR [r11+72]
+	movaps	xmm8, XMMWORD PTR [r11-48]
+	movaps	xmm7, XMMWORD PTR [rsp+32]
+	mov	rsp, r11
+	pop	r15
+	pop	r14
+	pop	r13
+	pop	r12
+	pop	rdi
+	jmp cnv2_main_loop_ryzen_endp	/* skip the out-of-line correction path; the including file supplies the final ret */
+
+sqrt_fixup_ryzen:	/* Correction path for the square root: recomputes rdi from the raw sqrtsd bits when their low 19 bits are all zero. */
+	movq r9, xmm2	/* r9 = the unshifted sqrt input saved in xmm2 by the main loop */
+	dec	rdi
+	mov edx, -1022
+	shl rdx, 32
+	mov	rax, rdi
+	shr	rdi, 19
+	shr	rax, 20
+	mov	rcx, rdi
+	sub	rcx, rax
+	lea	rcx, [rcx+rdx+1]
+	add	rax, rdx
+	imul	rcx, rax
+	sub	rcx, r9
+	adc	rdi, 0	/* the result is (raw-1)>>19, plus one when the subtraction above borrows */
+	jmp	sqrt_fixup_ryzen_ret
+
+cnv2_main_loop_ryzen_endp:	/* end of the included body; execution continues in the including file */
--- a/crypto/asm/cn_main_loop.S
+++ b/crypto/asm/cn_main_loop.S
@ -0,0 +1,54 @@
+/*
+ * Entry points for the CryptoNight variant 2 main loops. This copy handles
+ * Apple and other non-Apple targets; a separate win64/cn_main_loop.S exists.
+ * The file relies on the C preprocessor (#ifdef / #define / #include).
+ */
+/*
+ * ALIGN(x): the argument is ignored, every use expands to a fixed directive.
+ * NOTE(review): presumably ".align 6" is read as a power-of-two exponent on
+ * Apple (2^6 = 64) and ".align 64" as a byte count elsewhere -- confirm.
+ */
+#ifdef __APPLE__
+#   define ALIGN(x) .align 6
+#else
+#   define ALIGN(x) .align 64
+#endif
+.intel_syntax noprefix
+/* FN_PREFIX(fn): prepends an underscore to the symbol name on Apple only. */
+#ifdef __APPLE__
+#   define FN_PREFIX(fn) _ ## fn
+.text
+#else
+#   define FN_PREFIX(fn) fn
+.section .text
+#endif
+/* Exported symbols, one per main-loop flavour defined below. */
+.global FN_PREFIX(cnv2_mainloop_ivybridge_asm)
+.global FN_PREFIX(cnv2_mainloop_ryzen_asm)
+.global FN_PREFIX(cnv2_mainloop_bulldozer_asm)
+.global FN_PREFIX(cnv2_double_mainloop_sandybridge_asm)
+
+ALIGN(64)
+/*
+ * Variant 2 main loop, Ivy Bridge flavour (per the symbol name). The loop body
+ * comes from the included .inc file; this label only wraps it.
+ */
+FN_PREFIX(cnv2_mainloop_ivybridge_asm):
+	/* Reserve 48 bytes of stack; released again right before ret.
+	 * NOTE(review): presumably scratch space the included body writes to -- confirm. */
+	sub rsp, 48
+	/* Copy rdi into rcx before the body runs.
+	 * NOTE(review): presumably the body expects its argument in rcx (Microsoft
+	 * x64 convention) while the caller passes it in rdi (System V) -- confirm. */
+	mov rcx, rdi
+	#include "cn2/cnv2_main_loop_ivybridge.inc"
+	add rsp, 48
+	ret 0
+	/* Never executed (follows ret). 3735929054 == 0xDEADC0DE.
+	 * NOTE(review): presumably an end-of-routine marker -- confirm. */
+	mov eax, 3735929054
+
+ALIGN(64)
+/*
+ * Variant 2 main loop, Ryzen flavour (per the symbol name). The loop body
+ * comes from the included .inc file; this label only wraps it.
+ */
+FN_PREFIX(cnv2_mainloop_ryzen_asm):
+	/* Reserve 48 bytes of stack; released again right before ret.
+	 * NOTE(review): presumably scratch space the included body writes to -- confirm. */
+	sub rsp, 48
+	/* Copy rdi into rcx before the body runs.
+	 * NOTE(review): presumably the body expects its argument in rcx (Microsoft
+	 * x64 convention) while the caller passes it in rdi (System V) -- confirm. */
+	mov rcx, rdi
+	#include "cn2/cnv2_main_loop_ryzen.inc"
+	add rsp, 48
+	ret 0
+	/* Never executed (follows ret). 3735929054 == 0xDEADC0DE.
+	 * NOTE(review): presumably an end-of-routine marker -- confirm. */
+	mov eax, 3735929054
+
+ALIGN(64)
+/*
+ * Variant 2 main loop, Bulldozer flavour (per the symbol name). The loop body
+ * comes from the included .inc file; this label only wraps it.
+ */
+FN_PREFIX(cnv2_mainloop_bulldozer_asm):
+	/* Reserve 48 bytes of stack; released again right before ret.
+	 * NOTE(review): presumably scratch space the included body writes to -- confirm. */
+	sub rsp, 48
+	/* Copy rdi into rcx before the body runs.
+	 * NOTE(review): presumably the body expects its argument in rcx (Microsoft
+	 * x64 convention) while the caller passes it in rdi (System V) -- confirm. */
+	mov rcx, rdi
+	#include "cn2/cnv2_main_loop_bulldozer.inc"
+	add rsp, 48
+	ret 0
+	/* Never executed (follows ret). 3735929054 == 0xDEADC0DE.
+	 * NOTE(review): presumably an end-of-routine marker -- confirm. */
+	mov eax, 3735929054
+
+ALIGN(64)
+/*
+ * Variant 2 "double" main loop, Sandy Bridge flavour (per the symbol name).
+ * Unlike the single-loop wrappers it forwards two registers to the body.
+ */
+FN_PREFIX(cnv2_double_mainloop_sandybridge_asm):
+	/* Reserve 48 bytes of stack; released again right before ret.
+	 * NOTE(review): presumably scratch space the included body writes to -- confirm. */
+	sub rsp, 48
+	/* Copy rdi -> rcx and rsi -> rdx before the body runs.
+	 * NOTE(review): presumably maps the first two System V argument registers
+	 * onto the first two Microsoft x64 ones expected by the body -- confirm. */
+	mov rcx, rdi
+	mov rdx, rsi
+	#include "cn2/cnv2_double_main_loop_sandybridge.inc"
+	add rsp, 48
+	ret 0
+	/* Never executed (follows ret). 3735929054 == 0xDEADC0DE.
+	 * NOTE(review): presumably an end-of-routine marker -- confirm. */
+	mov eax, 3735929054
--- a/crypto/asm/win64/cn_main_loop.S
+++ b/crypto/asm/win64/cn_main_loop.S
@ -0,0 +1,31 @@
+/*
+ * win64 copy of the CryptoNight variant 2 entry points. Symbols are exported
+ * without any prefix and the shared loop bodies are pulled in from ../cn2/.
+ */
+/* ALIGN(x): the argument is ignored, every use expands to ".align 64". */
+#define ALIGN(x) .align 64
+.intel_syntax noprefix
+.section .text
+/* Exported symbols, one per main-loop flavour defined below. */
+.global cnv2_mainloop_ivybridge_asm
+.global cnv2_mainloop_ryzen_asm
+.global cnv2_mainloop_bulldozer_asm
+.global cnv2_double_mainloop_sandybridge_asm
+
+ALIGN(64)
+/*
+ * Ivy Bridge flavour (per the symbol name). The body is included directly at
+ * the entry label: no stack adjustment and no register moves around it.
+ */
+cnv2_mainloop_ivybridge_asm:
+	#include "../cn2/cnv2_main_loop_ivybridge.inc"
+	ret 0
+	/* Never executed (follows ret). 3735929054 == 0xDEADC0DE.
+	 * NOTE(review): presumably an end-of-routine marker -- confirm. */
+	mov eax, 3735929054
+
+ALIGN(64)
+/*
+ * Ryzen flavour (per the symbol name). The body is included directly at the
+ * entry label: no stack adjustment and no register moves around it.
+ */
+cnv2_mainloop_ryzen_asm:
+	#include "../cn2/cnv2_main_loop_ryzen.inc"
+	ret 0
+	/* Never executed (follows ret). 3735929054 == 0xDEADC0DE.
+	 * NOTE(review): presumably an end-of-routine marker -- confirm. */
+	mov eax, 3735929054
+
+ALIGN(64)
+/*
+ * Bulldozer flavour (per the symbol name). The body is included directly at
+ * the entry label: no stack adjustment and no register moves around it.
+ */
+cnv2_mainloop_bulldozer_asm:
+	#include "../cn2/cnv2_main_loop_bulldozer.inc"
+	ret 0
+	/* Never executed (follows ret). 3735929054 == 0xDEADC0DE.
+	 * NOTE(review): presumably an end-of-routine marker -- confirm. */
+	mov eax, 3735929054
+
+ALIGN(64)
+/*
+ * "Double" main loop, Sandy Bridge flavour (per the symbol name). The body is
+ * included directly at the entry label: no stack adjustment and no register
+ * moves around it.
+ */
+cnv2_double_mainloop_sandybridge_asm:
+	#include "../cn2/cnv2_double_main_loop_sandybridge.inc"
+	ret 0
+	/* Never executed (follows ret). 3735929054 == 0xDEADC0DE.
+	 * NOTE(review): presumably an end-of-routine marker -- confirm. */
+	mov eax, 3735929054
--- a/crypto/soft_aes.c
+++ b/crypto/soft_aes.c
@ -1,212 +0,0 @@
-/*
-  * This program is free software: you can redistribute it and/or modify
-  * it under the terms of the GNU General Public License as published by
-  * the Free Software Foundation, either version 3 of the License, or
-  * any later version.
-  *
-  * This program is distributed in the hope that it will be useful,
-  * but WITHOUT ANY WARRANTY; without even the implied warranty of
-  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-  * GNU General Public License for more details.
-  *
-  * You should have received a copy of the GNU General Public License
-  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
-  *
-  * Additional permission under GNU GPL version 3 section 7
-  *
-  * If you modify this Program, or any covered work, by linking or combining
-  * it with OpenSSL (or a modified version of that library), containing parts
-  * covered by the terms of OpenSSL License and SSLeay License, the licensors
-  * of this Program grant you additional permission to convey the resulting work.
-  *
-  */
-
-/*
- * The orginal author of this AES implementation is Karl Malbrain.
- */
-
-#ifdef __GNUC__
-#include <x86intrin.h>
-#else
-#include <intrin.h>
-#endif // __GNUC__
-
-#include <inttypes.h>
-
-#define TABLE_ALIGN     32
-#define WPOLY           0x011b
-#define N_COLS          4
-#define AES_BLOCK_SIZE  16
-#define RC_LENGTH       (5 * (AES_BLOCK_SIZE / 4 - 2))
-
-#if defined(_MSC_VER)
-#define ALIGN __declspec(align(TABLE_ALIGN))
-#elif defined(__GNUC__)
-#define ALIGN __attribute__ ((aligned(16)))
-#else
-#define ALIGN
-#endif
-
-#define rf1(r,c) (r)
-#define word_in(x,c) (*((uint32_t*)(x)+(c)))
-#define word_out(x,c,v) (*((uint32_t*)(x)+(c)) = (v))
-
-#define s(x,c) x[c]
-#define si(y,x,c) (s(y,c) = word_in(x, c))
-#define so(y,x,c) word_out(y, c, s(x,c))
-#define state_in(y,x) si(y,x,0); si(y,x,1); si(y,x,2); si(y,x,3)
-#define state_out(y,x)  so(y,x,0); so(y,x,1); so(y,x,2); so(y,x,3)
-#define round(y,x,k) \
-y[0] = (k)[0]  ^ (t_fn[0][x[0] & 0xff] ^ t_fn[1][(x[1] >> 8) & 0xff] ^ t_fn[2][(x[2] >> 16) & 0xff] ^ t_fn[3][x[3] >> 24]); \
-y[1] = (k)[1]  ^ (t_fn[0][x[1] & 0xff] ^ t_fn[1][(x[2] >> 8) & 0xff] ^ t_fn[2][(x[3] >> 16) & 0xff] ^ t_fn[3][x[0] >> 24]); \
-y[2] = (k)[2]  ^ (t_fn[0][x[2] & 0xff] ^ t_fn[1][(x[3] >> 8) & 0xff] ^ t_fn[2][(x[0] >> 16) & 0xff] ^ t_fn[3][x[1] >> 24]); \
-y[3] = (k)[3]  ^ (t_fn[0][x[3] & 0xff] ^ t_fn[1][(x[0] >> 8) & 0xff] ^ t_fn[2][(x[1] >> 16) & 0xff] ^ t_fn[3][x[2] >> 24]);
-#define to_byte(x) ((x) & 0xff)
-#define bval(x,n) to_byte((x) >> (8 * (n)))
-
-#define fwd_var(x,r,c)\
- ( r == 0 ? ( c == 0 ? s(x,0) : c == 1 ? s(x,1) : c == 2 ? s(x,2) : s(x,3))\
- : r == 1 ? ( c == 0 ? s(x,1) : c == 1 ? s(x,2) : c == 2 ? s(x,3) : s(x,0))\
- : r == 2 ? ( c == 0 ? s(x,2) : c == 1 ? s(x,3) : c == 2 ? s(x,0) : s(x,1))\
- :          ( c == 0 ? s(x,3) : c == 1 ? s(x,0) : c == 2 ? s(x,1) : s(x,2)))
-
-#define fwd_rnd(y,x,k,c)  (s(y,c) = (k)[c] ^ four_tables(x,t_use(f,n),fwd_var,rf1,c))
-
-#define sb_data(w) {\
-    w(0x63), w(0x7c), w(0x77), w(0x7b), w(0xf2), w(0x6b), w(0x6f), w(0xc5),\
-    w(0x30), w(0x01), w(0x67), w(0x2b), w(0xfe), w(0xd7), w(0xab), w(0x76),\
-    w(0xca), w(0x82), w(0xc9), w(0x7d), w(0xfa), w(0x59), w(0x47), w(0xf0),\
-    w(0xad), w(0xd4), w(0xa2), w(0xaf), w(0x9c), w(0xa4), w(0x72), w(0xc0),\
-    w(0xb7), w(0xfd), w(0x93), w(0x26), w(0x36), w(0x3f), w(0xf7), w(0xcc),\
-    w(0x34), w(0xa5), w(0xe5), w(0xf1), w(0x71), w(0xd8), w(0x31), w(0x15),\
-    w(0x04), w(0xc7), w(0x23), w(0xc3), w(0x18), w(0x96), w(0x05), w(0x9a),\
-    w(0x07), w(0x12), w(0x80), w(0xe2), w(0xeb), w(0x27), w(0xb2), w(0x75),\
-    w(0x09), w(0x83), w(0x2c), w(0x1a), w(0x1b), w(0x6e), w(0x5a), w(0xa0),\
-    w(0x52), w(0x3b), w(0xd6), w(0xb3), w(0x29), w(0xe3), w(0x2f), w(0x84),\
-    w(0x53), w(0xd1), w(0x00), w(0xed), w(0x20), w(0xfc), w(0xb1), w(0x5b),\
-    w(0x6a), w(0xcb), w(0xbe), w(0x39), w(0x4a), w(0x4c), w(0x58), w(0xcf),\
-    w(0xd0), w(0xef), w(0xaa), w(0xfb), w(0x43), w(0x4d), w(0x33), w(0x85),\
-    w(0x45), w(0xf9), w(0x02), w(0x7f), w(0x50), w(0x3c), w(0x9f), w(0xa8),\
-    w(0x51), w(0xa3), w(0x40), w(0x8f), w(0x92), w(0x9d), w(0x38), w(0xf5),\
-    w(0xbc), w(0xb6), w(0xda), w(0x21), w(0x10), w(0xff), w(0xf3), w(0xd2),\
-    w(0xcd), w(0x0c), w(0x13), w(0xec), w(0x5f), w(0x97), w(0x44), w(0x17),\
-    w(0xc4), w(0xa7), w(0x7e), w(0x3d), w(0x64), w(0x5d), w(0x19), w(0x73),\
-    w(0x60), w(0x81), w(0x4f), w(0xdc), w(0x22), w(0x2a), w(0x90), w(0x88),\
-    w(0x46), w(0xee), w(0xb8), w(0x14), w(0xde), w(0x5e), w(0x0b), w(0xdb),\
-    w(0xe0), w(0x32), w(0x3a), w(0x0a), w(0x49), w(0x06), w(0x24), w(0x5c),\
-    w(0xc2), w(0xd3), w(0xac), w(0x62), w(0x91), w(0x95), w(0xe4), w(0x79),\
-    w(0xe7), w(0xc8), w(0x37), w(0x6d), w(0x8d), w(0xd5), w(0x4e), w(0xa9),\
-    w(0x6c), w(0x56), w(0xf4), w(0xea), w(0x65), w(0x7a), w(0xae), w(0x08),\
-    w(0xba), w(0x78), w(0x25), w(0x2e), w(0x1c), w(0xa6), w(0xb4), w(0xc6),\
-    w(0xe8), w(0xdd), w(0x74), w(0x1f), w(0x4b), w(0xbd), w(0x8b), w(0x8a),\
-    w(0x70), w(0x3e), w(0xb5), w(0x66), w(0x48), w(0x03), w(0xf6), w(0x0e),\
-    w(0x61), w(0x35), w(0x57), w(0xb9), w(0x86), w(0xc1), w(0x1d), w(0x9e),\
-    w(0xe1), w(0xf8), w(0x98), w(0x11), w(0x69), w(0xd9), w(0x8e), w(0x94),\
-    w(0x9b), w(0x1e), w(0x87), w(0xe9), w(0xce), w(0x55), w(0x28), w(0xdf),\
-    w(0x8c), w(0xa1), w(0x89), w(0x0d), w(0xbf), w(0xe6), w(0x42), w(0x68),\
-    w(0x41), w(0x99), w(0x2d), w(0x0f), w(0xb0), w(0x54), w(0xbb), w(0x16) }
-
-#define rc_data(w) {\
-    w(0x01), w(0x02), w(0x04), w(0x08), w(0x10),w(0x20), w(0x40), w(0x80),\
-    w(0x1b), w(0x36) }
-
-#define bytes2word(b0, b1, b2, b3) (((uint32_t)(b3) << 24) | \
-    ((uint32_t)(b2) << 16) | ((uint32_t)(b1) << 8) | (b0))
-
-#define h0(x)   (x)
-#define w0(p)   bytes2word(p, 0, 0, 0)
-#define w1(p)   bytes2word(0, p, 0, 0)
-#define w2(p)   bytes2word(0, 0, p, 0)
-#define w3(p)   bytes2word(0, 0, 0, p)
-
-#define u0(p)   bytes2word(f2(p), p, p, f3(p))
-#define u1(p)   bytes2word(f3(p), f2(p), p, p)
-#define u2(p)   bytes2word(p, f3(p), f2(p), p)
-#define u3(p)   bytes2word(p, p, f3(p), f2(p))
-
-#define v0(p)   bytes2word(fe(p), f9(p), fd(p), fb(p))
-#define v1(p)   bytes2word(fb(p), fe(p), f9(p), fd(p))
-#define v2(p)   bytes2word(fd(p), fb(p), fe(p), f9(p))
-#define v3(p)   bytes2word(f9(p), fd(p), fb(p), fe(p))
-
-#define f2(x)   ((x<<1) ^ (((x>>7) & 1) * WPOLY))
-#define f4(x)   ((x<<2) ^ (((x>>6) & 1) * WPOLY) ^ (((x>>6) & 2) * WPOLY))
-#define f8(x)   ((x<<3) ^ (((x>>5) & 1) * WPOLY) ^ (((x>>5) & 2) * WPOLY) ^ (((x>>5) & 4) * WPOLY))
-#define f3(x)   (f2(x) ^ x)
-#define f9(x)   (f8(x) ^ x)
-#define fb(x)   (f8(x) ^ f2(x) ^ x)
-#define fd(x)   (f8(x) ^ f4(x) ^ x)
-#define fe(x)   (f8(x) ^ f4(x) ^ f2(x))
-
-#define t_dec(m,n) t_##m##n
-#define t_set(m,n) t_##m##n
-#define t_use(m,n) t_##m##n
-
-#define d_4(t,n,b,e,f,g,h) ALIGN const t n[4][256] = { b(e), b(f), b(g), b(h) }
-
-#define four_tables(x,tab,vf,rf,c) \
-    (tab[0][bval(vf(x,0,c),rf(0,c))] \
-    ^ tab[1][bval(vf(x,1,c),rf(1,c))] \
-    ^ tab[2][bval(vf(x,2,c),rf(2,c))] \
-    ^ tab[3][bval(vf(x,3,c),rf(3,c))])
-
-d_4(uint32_t, t_dec(f,n), sb_data, u0, u1, u2, u3);
-
-__m128i soft_aesenc(__m128i in, __m128i key)
-{
-    uint32_t x0, x1, x2, x3;
-    x0 = _mm_cvtsi128_si32(in);
-    x1 = _mm_cvtsi128_si32(_mm_shuffle_epi32(in, 0x55));
-    x2 = _mm_cvtsi128_si32(_mm_shuffle_epi32(in, 0xAA));
-    x3 = _mm_cvtsi128_si32(_mm_shuffle_epi32(in, 0xFF));
-
-    __m128i out = _mm_set_epi32(
-        (t_fn[0][x3 & 0xff] ^ t_fn[1][(x0 >> 8) & 0xff] ^ t_fn[2][(x1 >> 16) & 0xff] ^ t_fn[3][x2 >> 24]),
-        (t_fn[0][x2 & 0xff] ^ t_fn[1][(x3 >> 8) & 0xff] ^ t_fn[2][(x0 >> 16) & 0xff] ^ t_fn[3][x1 >> 24]),
-        (t_fn[0][x1 & 0xff] ^ t_fn[1][(x2 >> 8) & 0xff] ^ t_fn[2][(x3 >> 16) & 0xff] ^ t_fn[3][x0 >> 24]),
-        (t_fn[0][x0 & 0xff] ^ t_fn[1][(x1 >> 8) & 0xff] ^ t_fn[2][(x2 >> 16) & 0xff] ^ t_fn[3][x3 >> 24]));
-
-    return _mm_xor_si128(out, key);
-}
-
-uint8_t Sbox[256] = {       // forward s-box
-0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5, 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76,
-0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0, 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0,
-0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc, 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15,
-0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a, 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75,
-0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0, 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84,
-0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b, 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf,
-0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85, 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8,
-0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5, 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2,
-0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17, 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73,
-0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88, 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb,
-0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c, 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79,
-0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9, 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08,
-0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6, 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a,
-0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e, 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e,
-0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94, 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf,
-0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68, 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16};
-
-static inline void sub_word(uint8_t* key)
-{
-    key[0] = Sbox[key[0]];
-    key[1] = Sbox[key[1]];
-    key[2] = Sbox[key[2]];
-    key[3] = Sbox[key[3]];
-}
-
-#ifdef __clang__
-uint32_t _rotr(uint32_t value, uint32_t amount)
-{
-    return (value >> amount) | (value << ((32 - amount) & 31));
-}
-#endif
-
-__m128i soft_aeskeygenassist(__m128i key, uint8_t rcon)
-{
-    uint32_t X1 = _mm_cvtsi128_si32(_mm_shuffle_epi32(key, 0x55));
-    uint32_t X3 = _mm_cvtsi128_si32(_mm_shuffle_epi32(key, 0xFF));
-    sub_word((uint8_t*)&X1);
-    sub_word((uint8_t*)&X3);
-    return _mm_set_epi32(_rotr(X3, 8) ^ rcon, X3,_rotr(X1, 8) ^ rcon, X1);
-}
--- a/crypto/soft_aes.h
+++ b/crypto/soft_aes.h
@ -0,0 +1,131 @@
+/*
+  * This program is free software: you can redistribute it and/or modify
+  * it under the terms of the GNU General Public License as published by
+  * the Free Software Foundation, either version 3 of the License, or
+  * any later version.
+  *
+  * This program is distributed in the hope that it will be useful,
+  * but WITHOUT ANY WARRANTY; without even the implied warranty of
+  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+  * GNU General Public License for more details.
+  *
+  * You should have received a copy of the GNU General Public License
+  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+  *
+  * Additional permission under GNU GPL version 3 section 7
+  *
+  * If you modify this Program, or any covered work, by linking or combining
+  * it with OpenSSL (or a modified version of that library), containing parts
+  * covered by the terms of OpenSSL License and SSLeay License, the licensors
+  * of this Program grant you additional permission to convey the resulting work.
+  *
+  */
+
+/*
+ * Parts of this file are originally copyright (c) 2014-2017, The Monero Project
+ */
+#pragma once
+
+
+#if defined(XMRIG_ARM)
+#   include "crypto/SSE2NEON.h"
+#elif defined(__GNUC__)
+#   include <x86intrin.h>
+#else
+#   include <intrin.h>
+#endif
+
+#include <inttypes.h>
+
+
+#define saes_data(w) {\
+    w(0x63), w(0x7c), w(0x77), w(0x7b), w(0xf2), w(0x6b), w(0x6f), w(0xc5),\
+    w(0x30), w(0x01), w(0x67), w(0x2b), w(0xfe), w(0xd7), w(0xab), w(0x76),\
+    w(0xca), w(0x82), w(0xc9), w(0x7d), w(0xfa), w(0x59), w(0x47), w(0xf0),\
+    w(0xad), w(0xd4), w(0xa2), w(0xaf), w(0x9c), w(0xa4), w(0x72), w(0xc0),\
+    w(0xb7), w(0xfd), w(0x93), w(0x26), w(0x36), w(0x3f), w(0xf7), w(0xcc),\
+    w(0x34), w(0xa5), w(0xe5), w(0xf1), w(0x71), w(0xd8), w(0x31), w(0x15),\
+    w(0x04), w(0xc7), w(0x23), w(0xc3), w(0x18), w(0x96), w(0x05), w(0x9a),\
+    w(0x07), w(0x12), w(0x80), w(0xe2), w(0xeb), w(0x27), w(0xb2), w(0x75),\
+    w(0x09), w(0x83), w(0x2c), w(0x1a), w(0x1b), w(0x6e), w(0x5a), w(0xa0),\
+    w(0x52), w(0x3b), w(0xd6), w(0xb3), w(0x29), w(0xe3), w(0x2f), w(0x84),\
+    w(0x53), w(0xd1), w(0x00), w(0xed), w(0x20), w(0xfc), w(0xb1), w(0x5b),\
+    w(0x6a), w(0xcb), w(0xbe), w(0x39), w(0x4a), w(0x4c), w(0x58), w(0xcf),\
+    w(0xd0), w(0xef), w(0xaa), w(0xfb), w(0x43), w(0x4d), w(0x33), w(0x85),\
+    w(0x45), w(0xf9), w(0x02), w(0x7f), w(0x50), w(0x3c), w(0x9f), w(0xa8),\
+    w(0x51), w(0xa3), w(0x40), w(0x8f), w(0x92), w(0x9d), w(0x38), w(0xf5),\
+    w(0xbc), w(0xb6), w(0xda), w(0x21), w(0x10), w(0xff), w(0xf3), w(0xd2),\
+    w(0xcd), w(0x0c), w(0x13), w(0xec), w(0x5f), w(0x97), w(0x44), w(0x17),\
+    w(0xc4), w(0xa7), w(0x7e), w(0x3d), w(0x64), w(0x5d), w(0x19), w(0x73),\
+    w(0x60), w(0x81), w(0x4f), w(0xdc), w(0x22), w(0x2a), w(0x90), w(0x88),\
+    w(0x46), w(0xee), w(0xb8), w(0x14), w(0xde), w(0x5e), w(0x0b), w(0xdb),\
+    w(0xe0), w(0x32), w(0x3a), w(0x0a), w(0x49), w(0x06), w(0x24), w(0x5c),\
+    w(0xc2), w(0xd3), w(0xac), w(0x62), w(0x91), w(0x95), w(0xe4), w(0x79),\
+    w(0xe7), w(0xc8), w(0x37), w(0x6d), w(0x8d), w(0xd5), w(0x4e), w(0xa9),\
+    w(0x6c), w(0x56), w(0xf4), w(0xea), w(0x65), w(0x7a), w(0xae), w(0x08),\
+    w(0xba), w(0x78), w(0x25), w(0x2e), w(0x1c), w(0xa6), w(0xb4), w(0xc6),\
+    w(0xe8), w(0xdd), w(0x74), w(0x1f), w(0x4b), w(0xbd), w(0x8b), w(0x8a),\
+    w(0x70), w(0x3e), w(0xb5), w(0x66), w(0x48), w(0x03), w(0xf6), w(0x0e),\
+    w(0x61), w(0x35), w(0x57), w(0xb9), w(0x86), w(0xc1), w(0x1d), w(0x9e),\
+    w(0xe1), w(0xf8), w(0x98), w(0x11), w(0x69), w(0xd9), w(0x8e), w(0x94),\
+    w(0x9b), w(0x1e), w(0x87), w(0xe9), w(0xce), w(0x55), w(0x28), w(0xdf),\
+    w(0x8c), w(0xa1), w(0x89), w(0x0d), w(0xbf), w(0xe6), w(0x42), w(0x68),\
+    w(0x41), w(0x99), w(0x2d), w(0x0f), w(0xb0), w(0x54), w(0xbb), w(0x16) }
+
+#define SAES_WPOLY           0x011b
+
+#define saes_b2w(b0, b1, b2, b3) (((uint32_t)(b3) << 24) | \
+    ((uint32_t)(b2) << 16) | ((uint32_t)(b1) << 8) | (b0))
+
+#define saes_f2(x)   ((x<<1) ^ (((x>>7) & 1) * SAES_WPOLY))
+#define saes_f3(x)   (saes_f2(x) ^ x)
+#define saes_h0(x)   (x)
+
+#define saes_u0(p)   saes_b2w(saes_f2(p),          p,          p, saes_f3(p))
+#define saes_u1(p)   saes_b2w(saes_f3(p), saes_f2(p),          p,          p)
+#define saes_u2(p)   saes_b2w(         p, saes_f3(p), saes_f2(p),          p)
+#define saes_u3(p)   saes_b2w(         p,          p, saes_f3(p), saes_f2(p))
+
+__attribute__((aligned(16))) const static uint32_t saes_table[4][256] = { saes_data(saes_u0), saes_data(saes_u1), saes_data(saes_u2), saes_data(saes_u3) };
+__attribute__((aligned(16))) const static uint8_t  saes_sbox[256] = saes_data(saes_h0);
+
+
+/*
+ * Table-driven transform of a 128-bit block followed by an XOR with key.
+ * NOTE(review): presumably a software stand-in for the AESENC instruction
+ * (_mm_aesenc_si128) -- confirm against the callers.
+ *
+ * in:  128-bit input, handled as four 32-bit lanes x0 (lowest) .. x3 (highest).
+ * key: 128-bit value XORed into the table result.
+ * Returns the combined 128-bit value.
+ */
+static inline __m128i soft_aesenc(__m128i in, __m128i key)
+{
+    uint32_t x0, x1, x2, x3;
+    /* Pull each 32-bit lane out: the shuffle broadcasts lane 1/2/3 and the
+     * conversion then reads the lowest lane. */
+    x0 = _mm_cvtsi128_si32(in);
+    x1 = _mm_cvtsi128_si32(_mm_shuffle_epi32(in, 0x55));
+    x2 = _mm_cvtsi128_si32(_mm_shuffle_epi32(in, 0xAA));
+    x3 = _mm_cvtsi128_si32(_mm_shuffle_epi32(in, 0xFF));
+
+    /* _mm_set_epi32 takes its arguments from the highest lane to the lowest,
+     * so the first expression builds lane 3 and the last builds lane 0. Each
+     * output lane mixes byte 0, 1, 2 and 3 taken from four different input
+     * lanes, one lookup per saes_table row. */
+    __m128i out = _mm_set_epi32(
+        (saes_table[0][x3 & 0xff] ^ saes_table[1][(x0 >> 8) & 0xff] ^ saes_table[2][(x1 >> 16) & 0xff] ^ saes_table[3][x2 >> 24]),
+        (saes_table[0][x2 & 0xff] ^ saes_table[1][(x3 >> 8) & 0xff] ^ saes_table[2][(x0 >> 16) & 0xff] ^ saes_table[3][x1 >> 24]),
+        (saes_table[0][x1 & 0xff] ^ saes_table[1][(x2 >> 8) & 0xff] ^ saes_table[2][(x3 >> 16) & 0xff] ^ saes_table[3][x0 >> 24]),
+        (saes_table[0][x0 & 0xff] ^ saes_table[1][(x1 >> 8) & 0xff] ^ saes_table[2][(x2 >> 16) & 0xff] ^ saes_table[3][x3 >> 24]));
+
+    return _mm_xor_si128(out, key);
+}
+
+/*
+ * Substitute every byte of key through saes_sbox, keeping each byte in its
+ * original position within the 32-bit word.
+ *
+ * key: 32-bit word to transform.
+ * Returns the word built from the four substituted bytes.
+ */
+static inline uint32_t sub_word(uint32_t key)
+{
+    /* Widen each looked-up byte to uint32_t before shifting: a uint8_t is
+     * promoted to (signed) int, and shifting a value >= 0x80 left by 24 does
+     * not fit in a 32-bit int, which is undefined behaviour in C. */
+    return ((uint32_t) saes_sbox[key >> 24]          << 24) |
+           ((uint32_t) saes_sbox[(key >> 16) & 0xff] << 16) |
+           ((uint32_t) saes_sbox[(key >> 8)  & 0xff] << 8)  |
+            (uint32_t) saes_sbox[key & 0xff];
+}
+
+/*
+ * Fallback 32-bit rotate-right for clang and ARM builds.
+ *
+ * The extra !defined(_rotr) test skips the fallback when the intrinsic headers
+ * included above already supply _rotr as a macro; defining a function with a
+ * macro name would not compile.
+ *
+ * value:  word to rotate.
+ * amount: rotation count in bits; callers in this file pass 8.
+ * Returns value rotated right by amount bits.
+ */
+#if (defined(__clang__) || defined(XMRIG_ARM)) && !defined(_rotr)
+static inline uint32_t _rotr(uint32_t value, uint32_t amount)
+{
+    /* "& 31" keeps the left-shift count in 0..31, so amount == 0 never
+     * produces a shift by 32. */
+    return (value >> amount) | (value << ((32 - amount) & 31));
+}
+#endif
+
+
+/*
+ * Build a 128-bit value from lanes 1 and 3 of key: each lane is passed through
+ * sub_word, and the result is stored both as-is and rotated right by 8 bits
+ * then XORed with rcon.
+ * NOTE(review): presumably a software stand-in for AESKEYGENASSIST -- confirm.
+ *
+ * key:  128-bit input; only 32-bit lanes 1 and 3 are read.
+ * rcon: constant XORed into the two rotated words.
+ * Returns lanes (high to low): rot(sub3)^rcon, sub3, rot(sub1)^rcon, sub1.
+ */
+static inline __m128i soft_aeskeygenassist(__m128i key, uint8_t rcon)
+{
+    const __m128i lane1 = _mm_shuffle_epi32(key, 0x55);
+    const __m128i lane3 = _mm_shuffle_epi32(key, 0xFF);
+
+    const uint32_t sub1 = sub_word(_mm_cvtsi128_si32(lane1));
+    const uint32_t sub3 = sub_word(_mm_cvtsi128_si32(lane3));
+
+    const uint32_t rot1 = _rotr(sub1, 8) ^ rcon;
+    const uint32_t rot3 = _rotr(sub3, 8) ^ rcon;
+
+    return _mm_set_epi32(rot3, sub3, rot1, sub1);
+}
--- a/donate.h
+++ b/donate.h
@ -24,6 +24,6 @@
 #ifndef __DONATE_H__
 #define __DONATE_H__

-#define DONATE_LEVEL 5
+#define DONATE_LEVEL 0

 #endif /* __DONATE_H__ */
--- a/mac/memory_mac.c
+++ b/mac/memory_mac.c
@ -4,8 +4,9 @@
 * Copyright 2014      Lucas Jones <https://github.com/lucasjones>
 * Copyright 2014-2016 Wolf9466    <https://github.com/OhGodAPet>
 * Copyright 2016      Jay D Dee   <jayddee246@gmail.com>
- * Copyright 2016-2017 XMRig       <support@xmrig.com>
- *
+ * Copyright 2017-2018 XMR-Stak    <https://github.com/fireice-uk>, <https://github.com/psychocrypt>
+ * Copyright 2018-2019 SChernykh   <https://github.com/SChernykh>
+ * Copyright 2016-2019 XMRig       <https://github.com/xmrig>, <support@xmrig.com>
 *
 *   This program is free software: you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
@ -74,3 +75,21 @@ void persistent_memory_free() {
        _mm_free(persistent_memory);
    }
 }
+
+
+/*
+ * Map size bytes of private anonymous memory that is readable, writable and
+ * executable at the same time.
+ *
+ * Returns the value of mmap() unchanged; failure is not checked here, so the
+ * caller receives whatever mmap() reports in that case.
+ */
+void *allocate_executable_memory(size_t size)
+{
+    const int prot  = PROT_READ | PROT_WRITE | PROT_EXEC;
+    const int flags = MAP_PRIVATE | MAP_ANON;
+
+    return mmap(0, size, prot, flags, -1, 0);
+}
+
+
+/*
+ * Drop write permission on [p, p + size): the range becomes read + execute.
+ * The result of mprotect() is discarded, so a failure goes unnoticed.
+ */
+void protect_executable_memory(void *p, size_t size)
+{
+    const int read_exec = PROT_READ | PROT_EXEC;
+
+    (void) mprotect(p, size, read_exec);
+}
+
+
+/*
+ * Invoke the compiler builtin __builtin___clear_cache on the byte range
+ * [p, p + size).
+ */
+void flush_instruction_cache(void *p, size_t size)
+{
+    char *const begin = (char*) p;
+    char *const end   = begin + size;
+
+    __builtin___clear_cache(begin, end);
+}
--- a/memory.c
+++ b/memory.c
@ -4,8 +4,9 @@
 * Copyright 2014      Lucas Jones <https://github.com/lucasjones>
 * Copyright 2014-2016 Wolf9466    <https://github.com/OhGodAPet>
 * Copyright 2016      Jay D Dee   <jayddee246@gmail.com>
- * Copyright 2016-2017 XMRig       <support@xmrig.com>
- *
+ * Copyright 2017-2018 XMR-Stak    <https://github.com/fireice-uk>, <https://github.com/psychocrypt>
+ * Copyright 2018-2019 SChernykh   <https://github.com/SChernykh>
+ * Copyright 2016-2019 XMRig       <https://github.com/xmrig>, <support@xmrig.com>
 *
 *   This program is free software: you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
@ -24,33 +25,15 @@
 #include <string.h>

 #include "persistent_memory.h"
-#include "algo/cryptonight/cryptonight.h"
 #include "options.h"

-static size_t offset = 0;
-
-
-#ifndef XMRIG_NO_AEON
-static void * create_persistent_ctx_lite(int thr_id) {
-    struct cryptonight_ctx *ctx = NULL;
-
-    if (!opt_double_hash) {
-        const size_t offset = MEMORY * (thr_id + 1);

-        ctx = (struct cryptonight_ctx *) &persistent_memory[offset + MEMORY_LITE];
-        ctx->memory = (uint8_t*) &persistent_memory[offset];
-        return ctx;
-    }
-
-    ctx = (struct cryptonight_ctx *) &persistent_memory[MEMORY - sizeof(struct cryptonight_ctx) * (thr_id + 1)];
-    ctx->memory = (uint8_t*) &persistent_memory[MEMORY * (thr_id + 1)];
-
-    return ctx;
-}
-#endif
+static size_t offset = 0;


 void * persistent_calloc(size_t num, size_t size) {
+    /* Round the element size up to the next multiple of 16. A plain
+     * "size += size % 16" only lands on a multiple of 16 when size is already
+     * a multiple of 8 (17 would become 18, 20 would become 24); for 8-aligned
+     * sizes both forms give the same result. */
+    size = (size + 15) & ~(size_t) 15;
+
     void *mem = &persistent_memory[offset];
     offset += (num * size);

@ -60,17 +43,29 @@ void * persistent_calloc(size_t num, size_t size) {
 }


-void * create_persistent_ctx(int thr_id) {
-#   ifndef XMRIG_NO_AEON
-    if (opt_algo == ALGO_CRYPTONIGHT_LITE) {
-        return create_persistent_ctx_lite(thr_id);
-    }
-#   endif
+/*
+ * Give ctx its own 0x4000-byte executable buffer and reset its bookkeeping.
+ * The first 0x2000 bytes back generated_code, the second 0x2000 bytes back
+ * generated_code_double. Both "generated for height" fields are set to
+ * (uint64_t)(-1) and height to 0. The allocation result is not checked.
+ */
+void init_cn_r(struct cryptonight_ctx *ctx)
+{
+    const uint64_t no_height = (uint64_t)(-1);
+    uint8_t *code            = allocate_executable_memory(0x4000);
+
+    ctx->generated_code        = (cn_mainloop_fun_ms_abi) code;
+    ctx->generated_code_double = (cn_mainloop_double_fun_ms_abi)(code + 0x2000);
+
+    ctx->generated_code_height = ctx->generated_code_double_height = no_height;
+    ctx->height                = 0;
+}
+

-    struct cryptonight_ctx *ctx = (struct cryptonight_ctx *) &persistent_memory[MEMORY - sizeof(struct cryptonight_ctx) * (thr_id + 1)];
+/*
+ * Fill ctx[0] (and ctx[1] when opt_double_hash is set) with contexts for
+ * thread thr_id.
+ *
+ * ctx:    caller-provided pointer array; slot 1 is written only in double-hash
+ *         mode, so it needs room for two entries in that case.
+ * thr_id: thread index used to pick this thread's slice of persistent_memory.
+ *
+ * Each context struct comes from persistent_calloc; its scratchpad pointer is
+ * an offset into persistent_memory. Slice 0 is skipped (the "+ 1"), and a
+ * thread takes two MEMORY-sized slices only for double-hash ALGO_CRYPTONIGHT.
+ */
+void create_cryptonight_ctx(struct cryptonight_ctx **ctx, int thr_id)
+{
+    const int ratio = (opt_double_hash && opt_algo == ALGO_CRYPTONIGHT) ? 2 : 1;
+    ctx[0]          = persistent_calloc(1, sizeof(struct cryptonight_ctx));
+    ctx[0]->memory  = &persistent_memory[MEMORY * (thr_id * ratio + 1)];

-    const int ratio = opt_double_hash ? 2 : 1;
-    ctx->memory = (uint8_t*) &persistent_memory[MEMORY * (thr_id * ratio + 1)];
+    init_cn_r(ctx[0]);

-    return ctx;
+    if (opt_double_hash) {
+        /* The second scratchpad sits directly after the first one: MEMORY
+         * bytes further for ALGO_CRYPTONIGHT, MEMORY_LITE bytes otherwise. */
+        ctx[1]         = persistent_calloc(1, sizeof(struct cryptonight_ctx));
+        ctx[1]->memory = ctx[0]->memory + (opt_algo == ALGO_CRYPTONIGHT ? MEMORY : MEMORY_LITE);
+
+        init_cn_r(ctx[1]);
+    }
 }
--- a/options.c
+++ b/options.c
@ -4,8 +4,9 @@
 * Copyright 2014      Lucas Jones <https://github.com/lucasjones>
 * Copyright 2014-2016 Wolf9466    <https://github.com/OhGodAPet>
 * Copyright 2016      Jay D Dee   <jayddee246@gmail.com>
- * Copyright 2016-2017 XMRig       <support@xmrig.com>
- *
+ * Copyright 2017-2018 XMR-Stak    <https://github.com/fireice-uk>, <https://github.com/psychocrypt>
+ * Copyright 2018-2019 SChernykh   <https://github.com/SChernykh>
+ * Copyright 2016-2019 XMRig       <https://github.com/xmrig>, <support@xmrig.com>
 *
 *   This program is free software: you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
@ -38,7 +39,6 @@

 int64_t opt_affinity      = -1L;
 int     opt_n_threads     = 0;
-int     opt_algo_variant  = 0;
 int     opt_retries       = 5;
 int     opt_retry_pause   = 5;
 int     opt_donate_level  = DONATE_LEVEL;
@ -55,13 +55,43 @@ char    *opt_userpass     = NULL;
 char    *opt_user         = NULL;
 char    *opt_pass         = NULL;

-enum mining_algo opt_algo = ALGO_CRYPTONIGHT;
+enum Algo opt_algo         = ALGO_CRYPTONIGHT;
+enum Variant opt_variant   = VARIANT_AUTO;
+enum AlgoVariant opt_av    = AV_AUTO;
+enum Assembly opt_assembly = ASM_AUTO;
+
+
+/* One accepted spelling of the --algo argument and what it selects. */
+struct AlgoData
+{
+    const char *name;       /* long spelling, e.g. "cryptonight/2" */
+    const char *shortName;  /* short spelling, e.g. "cn/2" */
+    enum Algo algo;         /* value stored in opt_algo on a match */
+    enum Variant variant;   /* value stored in opt_variant on a match */
+};
+
+
+/*
+ * Lookup table for --algo. The argument is compared case-insensitively with
+ * both name and shortName of every row; the first matching row wins.
+ */
+static struct AlgoData const algorithms[] = {
+    { "cryptonight",           "cn",           ALGO_CRYPTONIGHT,       VARIANT_AUTO },
+    { "cryptonight/0",         "cn/0",         ALGO_CRYPTONIGHT,       VARIANT_0    },
+    { "cryptonight/1",         "cn/1",         ALGO_CRYPTONIGHT,       VARIANT_1    },
+    { "cryptonight/2",         "cn/2",         ALGO_CRYPTONIGHT,       VARIANT_2    },
+    /* "/4" and "/r" are two spellings of the same variant. */
+    { "cryptonight/4",         "cn/4",         ALGO_CRYPTONIGHT,       VARIANT_4    },
+    { "cryptonight/r",         "cn/r",         ALGO_CRYPTONIGHT,       VARIANT_4    },
+
+    /* The lite rows are compiled out when XMRIG_NO_AEON is defined;
+     * "-lite" and "-light" are accepted interchangeably. */
+#   ifndef XMRIG_NO_AEON
+    { "cryptonight-lite",      "cn-lite",      ALGO_CRYPTONIGHT_LITE,  VARIANT_AUTO },
+    { "cryptonight-light",     "cn-light",     ALGO_CRYPTONIGHT_LITE,  VARIANT_AUTO },
+    { "cryptonight-lite/0",    "cn-lite/0",    ALGO_CRYPTONIGHT_LITE,  VARIANT_0    },
+    { "cryptonight-lite/1",    "cn-lite/1",    ALGO_CRYPTONIGHT_LITE,  VARIANT_1    },
+#   endif
+};


 static char const usage[] = "\
 Usage: " APP_ID " [OPTIONS]\n\
 Options:\n\
  -a, --algo=ALGO       cryptonight (default) or cryptonight-lite\n\
+      --variant=N       cryptonight variant: 0-4\n\
  -o, --url=URL         URL of mining server\n\
  -b, --backup-url=URL  URL of backup mining server\n\
  -O, --userpass=U:P    username:password pair for mining server\n\
@ -97,7 +127,7 @@ static struct option const options[] = {
    { "cpu-affinity",  1, NULL, 1020 },
    { "donate-level",  1, NULL, 1003 },
    { "help",          0, NULL, 'h'  },
-    { "keepalive",     0, NULL ,'k'  },
+    { "keepalive",     0, NULL, 'k'  },
    { "max-cpu-usage", 1, NULL, 1004 },
    { "nicehash",      0, NULL, 1006 },
    { "no-color",      0, NULL, 1002 },
@ -110,25 +140,45 @@ static struct option const options[] = {
    { "user",          1, NULL, 'u'  },
    { "userpass",      1, NULL, 'O'  },
    { "version",       0, NULL, 'V'  },
-    { 0, 0, 0, 0 }
+    { "variant",       1, NULL, 1021 },
+    { "asm",           1, NULL, 1022 },
+    { NULL,            0, NULL, 0    }
 };


 static const char *algo_names[] = {
-    [ALGO_CRYPTONIGHT]      = "cryptonight",
+    "cryptonight",
 #   ifndef XMRIG_NO_AEON
-    [ALGO_CRYPTONIGHT_LITE] = "cryptonight-lite"
+    "cryptonight-lite"
 #   endif
 };


+/*
+ * Display names for enum Variant, indexed by (variant + 1) because
+ * VARIANT_AUTO is -1. The order must follow the enum: VARIANT_4 has the
+ * numeric value 3 and therefore maps to the last entry, "4".
+ */
+static const char *variant_names[] = {
+    "auto",
+    "0",
+    "1",
+    "2",
+    "4"
+};
+
+
+/*
+ * Accepted --asm values. The array index of the matching entry is stored
+ * directly in opt_assembly, so the order must mirror enum Assembly
+ * (ASM_NONE, ASM_AUTO, ASM_INTEL, ASM_RYZEN, ASM_BULLDOZER).
+ */
+static const char *asm_names[] = {
+    "none",
+    "auto",
+    "intel",
+    "ryzen",
+    "bulldozer"
+};
+
+
 #ifndef XMRIG_NO_AEON
 static int get_cryptonight_lite_variant(int variant) {
-    if (variant <= AEON_AV0_AUTO || variant >= AEON_AV_MAX) {
-        return (cpu_info.flags & CPU_FLAG_AES) ? AEON_AV2_AESNI_DOUBLE : AEON_AV4_SOFT_AES_DOUBLE;
+    if (variant <= AV_AUTO || variant >= AV_MAX) {
+        return (cpu_info.flags & CPU_FLAG_AES) ? AV_DOUBLE : AV_DOUBLE_SOFT;
    }

-    if (opt_safe && !(cpu_info.flags & CPU_FLAG_AES) && variant <= AEON_AV2_AESNI_DOUBLE) {
+    if (opt_safe && !(cpu_info.flags & CPU_FLAG_AES) && variant <= AV_DOUBLE) {
        return variant + 2;
    }

@ -144,11 +194,11 @@ static int get_algo_variant(int algo, int variant) {
    }
 #   endif

-    if (variant <= XMR_AV0_AUTO || variant >= XMR_AV_MAX) {
-        return (cpu_info.flags & CPU_FLAG_AES) ? XMR_AV1_AESNI : XMR_AV3_SOFT_AES;
+    if (variant <= AV_AUTO || variant >= AV_MAX) {
+        return (cpu_info.flags & CPU_FLAG_AES) ? AV_SINGLE : AV_SINGLE_SOFT;
    }

-    if (opt_safe && !(cpu_info.flags & CPU_FLAG_AES) && variant <= XMR_AV2_AESNI_DOUBLE) {
+    if (opt_safe && !(cpu_info.flags & CPU_FLAG_AES) && variant <= AV_DOUBLE) {
        return variant + 2;
    }

@ -167,18 +217,21 @@ static void parse_arg(int key, char *arg) {

    switch (key)
    {
-    case 'a':
-        for (int i = 0; i < ARRAY_SIZE(algo_names); i++) {
-            if (algo_names[i] && !strcmp(arg, algo_names[i])) {
-                opt_algo = i;
+    case 'a': /* --algo */
+        for (size_t i = 0; i < ARRAY_SIZE(algorithms); i++) {
+            if ((strcasecmp(arg, algorithms[i].name) == 0) || (strcasecmp(arg, algorithms[i].shortName) == 0)) {
+                opt_algo    = algorithms[i].algo;
+                opt_variant = algorithms[i].variant;
                break;
            }
+        }
+        break;

-#           ifndef XMRIG_NO_AEON
-            if (i == ARRAY_SIZE(algo_names) - 1 && !strcmp(arg, "cryptonight-light")) {
-                opt_algo = i = ALGO_CRYPTONIGHT_LITE;
+    case 1022: /* --asm */
+        for (size_t i = 0; i < ARRAY_SIZE(asm_names); i++) {
+            if (strcasecmp(arg, asm_names[i]) == 0) {
+                opt_assembly = i;
            }
-#           endif
        }
        break;

@ -300,11 +353,11 @@ static void parse_arg(int key, char *arg) {

    case 'v': /* --av */
        v = atoi(arg);
-        if (v < 0 || v > 1000) {
+        if (v <= AV_AUTO || v >= AV_MAX) {
            show_usage_and_exit(1);
        }

-        opt_algo_variant = v;
+        opt_av = v;
        break;

    case 1020: /* --cpu-affinity */
@ -322,12 +375,23 @@ static void parse_arg(int key, char *arg) {
        break;

    case 1003: /* --donate-level */
+//        v = atoi(arg);
+//        if (v < 1 || v > 99) {
+//            show_usage_and_exit(1);
+//        }
+
+//        opt_donate_level = v;
+        break;
+
+    case 1021: /* --variant */
        v = atoi(arg);
-        if (v < 1 || v > 99) {
-            show_usage_and_exit(1);
+        if (v == 4 || strcasecmp(arg, "r") == 0) {
+            opt_variant = VARIANT_4;
+        }
+        else if (v > VARIANT_AUTO && v < VARIANT_MAX) {
+            opt_variant = v;
        }

-        opt_donate_level = v;
        break;

    case 1006: /* --nicehash */
@ -342,7 +406,7 @@ static void parse_arg(int key, char *arg) {

 static void parse_config(json_t *config, char *ref)
 {
-    int i;
+    size_t i;
    char buf[16];
    json_t *val;

@ -451,9 +515,9 @@ void parse_cmdline(int argc, char *argv[]) {
        sprintf(opt_userpass, "%s:%s", opt_user, opt_pass);
    }

-    opt_algo_variant = get_algo_variant(opt_algo, opt_algo_variant);
+    opt_av = get_algo_variant(opt_algo, opt_av);

-    if (!cryptonight_init(opt_algo_variant)) {
+    if (!cryptonight_init(opt_av)) {
        applog(LOG_ERR, "Cryptonight hash self-test failed. This might be caused by bad compiler optimizations.");
        proper_exit(1);
    }
@ -511,6 +575,12 @@ void show_version_and_exit(void) {
 }


-const char* get_current_algo_name(void) {
+const char *get_current_algo_name(void) {
    return algo_names[opt_algo];
 }
+
+
+/*
+ * Return the display name of the currently selected variant (opt_variant).
+ * The table is offset by one because VARIANT_AUTO is -1.
+ */
+const char *get_current_variant_name(void)
+{
+    const int index = opt_variant + 1;
+
+    return variant_names[index];
+}
--- a/options.h
+++ b/options.h
@ -4,8 +4,9 @@
 * Copyright 2014      Lucas Jones <https://github.com/lucasjones>
 * Copyright 2014-2016 Wolf9466    <https://github.com/OhGodAPet>
 * Copyright 2016      Jay D Dee   <jayddee246@gmail.com>
- * Copyright 2016-2017 XMRig       <support@xmrig.com>
- *
+ * Copyright 2017-2018 XMR-Stak    <https://github.com/fireice-uk>, <https://github.com/psychocrypt>
+ * Copyright 2018-2019 SChernykh   <https://github.com/SChernykh>
+ * Copyright 2016-2019 XMRig       <https://github.com/xmrig>, <support@xmrig.com>
 *
 *   This program is free software: you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
@ -21,43 +22,52 @@
 *   along with this program. If not, see <http://www.gnu.org/licenses/>.
 */

-#ifndef __OPTIONS_H__
-#define __OPTIONS_H__
+#ifndef XMRIG_OPTIONS_H
+#define XMRIG_OPTIONS_H

 #include <stdbool.h>
 #include <stdint.h>

+
 #ifndef ARRAY_SIZE
 #   define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0]))
 #endif


-enum mining_algo {
+enum Algo {
    ALGO_CRYPTONIGHT,      /* CryptoNight (Monero) */
    ALGO_CRYPTONIGHT_LITE, /* CryptoNight-Lite (AEON) */
 };


-enum xmr_algo_variant {
-    XMR_AV0_AUTO,
-    XMR_AV1_AESNI,
-    XMR_AV2_AESNI_DOUBLE,
-    XMR_AV3_SOFT_AES,
-    XMR_AV4_SOFT_AES_DOUBLE,
-    XMR_AV_MAX
+/*
+ * CryptoNight variant stored in opt_variant.
+ *
+ * The numeric values are consecutive slots, not variant numbers: VARIANT_4
+ * is stored as 3. parse_arg() maps the input 4 (or "r") to VARIANT_4 and
+ * accepts any other number strictly between VARIANT_AUTO and VARIANT_MAX
+ * as-is. get_current_variant_name() indexes variant_names[] with value + 1,
+ * so VARIANT_AUTO (-1) lands on index 0.
+ *
+ * NOTE(review): because VARIANT_4 == 3, a numeric input of 3 also passes the
+ * range check and selects VARIANT_4 - confirm this is intended.
+ */
+enum Variant {
+    VARIANT_AUTO = -1, /* no explicit variant selected */
+    VARIANT_0    = 0,
+    VARIANT_1    = 1,
+    VARIANT_2    = 2,
+    VARIANT_4    = 3,  /* value is 3, not 4 */
+    VARIANT_MAX        /* exclusive upper bound of valid values */
 };


-#ifndef XMRIG_NO_AEON
-enum aeon_algo_variant {
-    AEON_AV0_AUTO,
-    AEON_AV1_AESNI,
-    AEON_AV2_AESNI_DOUBLE,
-    AEON_AV3_SOFT_AES,
-    AEON_AV4_SOFT_AES_DOUBLE,
-    AEON_AV_MAX
+/*
+ * Hashing mode stored in opt_av. The enumerator value equals the number
+ * given to the --av option.
+ */
+enum AlgoVariant {
+    AV_AUTO,        // --av=0  Automatic mode
+    AV_SINGLE,      // --av=1  Single hash mode
+    AV_DOUBLE,      // --av=2  Double hash mode
+    AV_SINGLE_SOFT, // --av=3  Single hash mode (Software AES)
+    AV_DOUBLE_SOFT, // --av=4  Double hash mode (Software AES)
+    AV_MAX          // Number of modes above; not a mode itself
+};
+
+
+/*
+ * ASM implementation choice stored in opt_assembly.
+ * Note that ASM_NONE, not ASM_AUTO, has the value 0.
+ */
+enum Assembly {
+    ASM_NONE,
+    ASM_AUTO,
+    ASM_INTEL,
+    ASM_RYZEN,
+    ASM_BULLDOZER,
+    ASM_MAX        /* number of choices above; not a choice itself */
 };
-#endif


 extern bool opt_colors;
@ -72,20 +82,24 @@ extern char *opt_userpass;
 extern char *opt_user;
 extern char *opt_pass;
 extern int opt_n_threads;
-extern int opt_algo_variant;
 extern int opt_retry_pause;
 extern int opt_retries;
 extern int opt_donate_level;
 extern int opt_max_cpu_usage;
 extern int64_t opt_affinity;
-extern enum mining_algo opt_algo;
+
+extern enum Algo opt_algo;
+extern enum Variant opt_variant;
+extern enum AlgoVariant opt_av;
+extern enum Assembly opt_assembly;

 void parse_cmdline(int argc, char *argv[]);
 void show_usage_and_exit(int status);
 void show_version_and_exit(void);
-const char* get_current_algo_name(void);
+const char *get_current_algo_name(void);
+const char *get_current_variant_name(void);

 extern void proper_exit(int reason);


-#endif /* __OPTIONS_H__ */
+#endif /* XMRIG_OPTIONS_H */
--- a/persistent_memory.h
+++ b/persistent_memory.h
@ -4,8 +4,9 @@
 * Copyright 2014      Lucas Jones <https://github.com/lucasjones>
 * Copyright 2014-2016 Wolf9466    <https://github.com/OhGodAPet>
 * Copyright 2016      Jay D Dee   <jayddee246@gmail.com>
- * Copyright 2016-2017 XMRig       <support@xmrig.com>
- *
+ * Copyright 2017-2018 XMR-Stak    <https://github.com/fireice-uk>, <https://github.com/psychocrypt>
+ * Copyright 2018-2019 SChernykh   <https://github.com/SChernykh>
+ * Copyright 2016-2019 XMRig       <https://github.com/xmrig>, <support@xmrig.com>
 *
 *   This program is free software: you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
@ -21,12 +22,16 @@
 *   along with this program. If not, see <http://www.gnu.org/licenses/>.
 */

-#ifndef __PERSISTENT_MEMORY_H__
-#define __PERSISTENT_MEMORY_H__
+#ifndef XMRIG_PERSISTENT_MEMORY_H
+#define XMRIG_PERSISTENT_MEMORY_H
+

 #include <stddef.h>


+#include "algo/cryptonight/cryptonight.h"
+
+
 enum memory_flags {
    MEMORY_HUGEPAGES_AVAILABLE = 1,
    MEMORY_HUGEPAGES_ENABLED   = 2,
@ -43,8 +48,15 @@ extern int persistent_memory_flags;

 const char * persistent_memory_allocate();
 void persistent_memory_free();
-void * persistent_calloc(size_t num, size_t size);
-void * create_persistent_ctx(int thr_id);
+void *persistent_calloc(size_t num, size_t size);
+void create_cryptonight_ctx(struct cryptonight_ctx **ctx, int thr_id);
+
+
+void *allocate_executable_memory(size_t size);
+void flush_instruction_cache(void *p, size_t size);
+void init_cn_r(struct cryptonight_ctx *ctx);
+void protect_executable_memory(void *p, size_t size);
+


-#endif /* __PERSISTENT_MEMORY_H__ */
+#endif /* XMRIG_PERSISTENT_MEMORY_H */
--- a/stratum.c
+++ b/stratum.c
@ -4,8 +4,9 @@
 * Copyright 2014      Lucas Jones <https://github.com/lucasjones>
 * Copyright 2014-2016 Wolf9466    <https://github.com/OhGodAPet>
 * Copyright 2016      Jay D Dee   <jayddee246@gmail.com>
- * Copyright 2016-2017 XMRig       <support@xmrig.com>
- *
+ * Copyright 2017-2018 XMR-Stak    <https://github.com/fireice-uk>, <https://github.com/psychocrypt>
+ * Copyright 2018-2019 SChernykh   <https://github.com/SChernykh>
+ * Copyright 2016-2019 XMRig       <https://github.com/xmrig>, <support@xmrig.com>
 *
 *   This program is free software: you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
@ -40,11 +41,12 @@
 #   include <netinet/in.h>
 #endif

-#include "stratum.h"
-#include "version.h"
+#include "options.h"
 #include "stats.h"
+#include "stratum.h"
 #include "util.h"
 #include "utils/applog.h"
+#include "version.h"


 #ifdef WIN32
@ -73,6 +75,7 @@ static int sockopt_keepalive_cb(void *userdata, curl_socket_t fd, curlsocktype p
 static curl_socket_t opensocket_grab_cb(void *clientp, curlsocktype purpose, struct curl_sockaddr *addr);
 static int closesocket_cb(void *clientp, curl_socket_t item);
 static bool login_decode(struct stratum_ctx *sctx, const json_t *val);
+static void extensions_decode(const json_t *val);
 static bool job_decode(const json_t *job);
 static bool jobj_binary(const json_t *obj, const char *key, void *buf, size_t buflen);

@ -625,6 +628,11 @@ static bool login_decode(struct stratum_ctx *sctx, const json_t *val) {
    memcpy(&sctx->id, id, strlen(id));

    const char *s = json_string_value(json_object_get(res, "status"));
+    if (!s) {
+        // Workaround for xmrig-proxy bug https://github.com/xmrig/xmrig-proxy/commit/dfa1960fe3eeb13f80717b7dbfcc7c6e9f222d89
+        s = json_string_value(json_object_get(val, "status"));
+    }
+
    if (!s) {
        applog(LOG_ERR, "JSON invalid status");
        return false;
@ -635,10 +643,31 @@ static bool login_decode(struct stratum_ctx *sctx, const json_t *val) {
        return false;
    }

+    extensions_decode(res);
+
    return true;
 }


+/**
+ * @brief extensions_decode
+ *
+ * Scans the "extensions" array of a login result and sets opt_nicehash when
+ * the string "nicehash" is listed. A missing or empty array, and entries
+ * that are not strings, are ignored. opt_nicehash is never cleared here.
+ *
+ * @param res JSON object that may contain the "extensions" array
+ */
+static void extensions_decode(const json_t *res)
+{
+    json_t *extensions = json_object_get(res, "extensions");
+    if (!extensions || json_array_size(extensions) == 0) {
+        return;
+    }
+
+    size_t index;
+    json_t *value;
+
+    json_array_foreach(extensions, index, value) {
+        const char *s = json_string_value(value);
+
+        /* strcmp() returns 0 on a match; testing its raw result would
+         * enable nicehash for every extension except "nicehash". */
+        if (s && strcmp(s, "nicehash") == 0) {
+            opt_nicehash = true;
+        }
+    }
+}
+
+
 /**
 * @brief job_decode
 * @param sctx
@ -681,6 +710,8 @@ static bool job_decode(const json_t *job) {
    memset(work.job_id, 0, sizeof(work.job_id));
    memcpy(work.job_id, job_id, strlen(job_id));

+    work.height = (uint64_t) json_integer_value(json_object_get(job, "height"));
+
    return true;
 }

--- a/stratum.h
+++ b/stratum.h
@ -4,8 +4,9 @@
 * Copyright 2014      Lucas Jones <https://github.com/lucasjones>
 * Copyright 2014-2016 Wolf9466    <https://github.com/OhGodAPet>
 * Copyright 2016      Jay D Dee   <jayddee246@gmail.com>
- * Copyright 2016-2017 XMRig       <support@xmrig.com>
- *
+ * Copyright 2017-2018 XMR-Stak    <https://github.com/fireice-uk>, <https://github.com/psychocrypt>
+ * Copyright 2018-2019 SChernykh   <https://github.com/SChernykh>
+ * Copyright 2016-2019 XMRig       <https://github.com/xmrig>, <support@xmrig.com>
 *
 *   This program is free software: you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
@ -21,8 +22,9 @@
 *   along with this program. If not, see <http://www.gnu.org/licenses/>.
 */

-#ifndef __STRATUM_H__
-#define __STRATUM_H__
+#ifndef XMRIG_STRATUM_H
+#define XMRIG_STRATUM_H
+

 #include <stdbool.h>
 #include <inttypes.h>
@ -41,6 +43,7 @@ struct work {
    uint32_t target   __attribute__((aligned(16)));
    uint32_t hash[8]  __attribute__((aligned(16)));
    char job_id[64]   __attribute__((aligned(16)));
+    uint64_t height;
 };


@ -75,4 +78,4 @@ bool stratum_handle_method(struct stratum_ctx *sctx, const char *s);
 bool stratum_handle_response(char *buf);
 bool stratum_keepalived(struct stratum_ctx *sctx);

-#endif /* __STRATUM_H__ */
+#endif /* XMRIG_STRATUM_H */
--- a/unix/memory_unix.c
+++ b/unix/memory_unix.c
@ -74,3 +74,23 @@ void persistent_memory_free() {
        _mm_free(persistent_memory);
    }
 }
+
+
+/**
+ * @brief allocate_executable_memory
+ *
+ * Maps a private anonymous region that is readable, writable and executable.
+ *
+ * @param size number of bytes to map
+ * @return pointer to the mapping, or NULL if mmap() failed
+ */
+void *allocate_executable_memory(size_t size)
+{
+    void *p = mmap(0, size, PROT_READ | PROT_WRITE | PROT_EXEC, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+
+    /* mmap() signals failure with MAP_FAILED, not NULL; normalize it so
+     * callers can use the same NULL check as with the Windows version. */
+    return p == MAP_FAILED ? NULL : p;
+}
+
+
+/**
+ * @brief protect_executable_memory
+ *
+ * Changes the protection of a region to read + execute (drops write access).
+ * The result of mprotect() is not checked.
+ *
+ * @param p    start of the region
+ * @param size length of the region in bytes
+ */
+void protect_executable_memory(void *p, size_t size)
+{
+    const int prot = PROT_READ | PROT_EXEC;
+
+    mprotect(p, size, prot);
+}
+
+
+/**
+ * @brief flush_instruction_cache
+ *
+ * Clears the instruction cache for the byte range [p, p + size).
+ * Compiles to a no-op when __FreeBSD__ is defined.
+ *
+ * @param p    start of the range
+ * @param size length of the range in bytes
+ */
+void flush_instruction_cache(void *p, size_t size)
+{
+#   ifndef __FreeBSD__
+    char *begin = (char *) p;
+    char *end   = begin + size;
+
+    __builtin___clear_cache(begin, end);
+#   endif
+}
--- a/utils/summary.c
+++ b/utils/summary.c
@ -77,10 +77,10 @@ static void print_threads() {
    }

    if (opt_colors) {
-        applog_notime(LOG_INFO, CL_LGR " * " CL_WHT "THREADS:      " CL_WHT "%d" CL_WHT ", av=%d, %s, donate=%d%%%s", opt_n_threads, opt_algo_variant, get_current_algo_name(), opt_donate_level, extra);
+        applog_notime(LOG_INFO, CL_LGR " * " CL_WHT "THREADS:      " CL_WHT "%d" CL_WHT ", av=%d, %s/%s, donate=%d%%%s", opt_n_threads, opt_av, get_current_algo_name(), get_current_variant_name(), opt_donate_level, extra);
    }
    else {
-        applog_notime(LOG_INFO, " * THREADS:      %d, av=%d, %s, donate=%d%%%s", opt_n_threads, opt_algo_variant, get_current_algo_name(), opt_donate_level, extra);
+        applog_notime(LOG_INFO, " * THREADS:      %d, av=%d, %s/%s, donate=%d%%%s", opt_n_threads, opt_av, get_current_algo_name(), get_current_variant_name(), opt_donate_level, extra);
    }
 }

--- a/version.h
+++ b/version.h
@ -4,8 +4,9 @@
 * Copyright 2014      Lucas Jones <https://github.com/lucasjones>
 * Copyright 2014-2016 Wolf9466    <https://github.com/OhGodAPet>
 * Copyright 2016      Jay D Dee   <jayddee246@gmail.com>
- * Copyright 2016-2017 XMRig       <support@xmrig.com>
- *
+ * Copyright 2017-2018 XMR-Stak    <https://github.com/fireice-uk>, <https://github.com/psychocrypt>
+ * Copyright 2018-2019 SChernykh   <https://github.com/SChernykh>
+ * Copyright 2016-2019 XMRig       <https://github.com/xmrig>, <support@xmrig.com>
 *
 *   This program is free software: you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
@ -21,20 +22,20 @@
 *   along with this program. If not, see <http://www.gnu.org/licenses/>.
 */

-#ifndef __VERSION_H__
-#define __VERSION_H__
+#ifndef XMRIG_VERSION_H
+#define XMRIG_VERSION_H

 #define APP_ID        "xmrig"
 #define APP_NAME      "XMRig"
 #define APP_DESC      "Monero (XMR) CPU miner"
-#define APP_VERSION   "0.8.3-dev"
+#define APP_VERSION   "0.10.0-dev"
 #define APP_DOMAIN    "xmrig.com"
 #define APP_SITE      "www.xmrig.com"
-#define APP_COPYRIGHT "Copyright (C) 2016-2017 xmrig.com"
+#define APP_COPYRIGHT "Copyright (C) 2016-2019 xmrig.com"

 #define APP_VER_MAJOR  0
-#define APP_VER_MINOR  8
-#define APP_VER_BUILD  3
+#define APP_VER_MINOR  10
+#define APP_VER_BUILD  0
 #define APP_VER_REV    0

-#endif /* __VERSION_H__ */
+#endif /* XMRIG_VERSION_H */
--- a/win/memory_win.c
+++ b/win/memory_win.c
@ -4,8 +4,9 @@
 * Copyright 2014      Lucas Jones <https://github.com/lucasjones>
 * Copyright 2014-2016 Wolf9466    <https://github.com/OhGodAPet>
 * Copyright 2016      Jay D Dee   <jayddee246@gmail.com>
- * Copyright 2016-2017 XMRig       <support@xmrig.com>
- *
+ * Copyright 2017-2018 XMR-Stak    <https://github.com/fireice-uk>, <https://github.com/psychocrypt>
+ * Copyright 2018-2019 SChernykh   <https://github.com/SChernykh>
+ * Copyright 2016-2019 XMRig       <https://github.com/xmrig>, <support@xmrig.com>
 *
 *   This program is free software: you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
@ -21,9 +22,6 @@
 *   along with this program. If not, see <http://www.gnu.org/licenses/>.
 */

-#ifndef __MEMORY_H__
-#define __MEMORY_H__
-
 #include <windows.h>
 #include <ntsecapi.h>
 #include <tchar.h>
@ -172,4 +170,21 @@ void persistent_memory_free() {
    }
 }

-#endif /* __MEMORY_H__ */
+
+/**
+ * @brief allocate_executable_memory
+ *
+ * Reserves and commits a region with PAGE_EXECUTE_READWRITE protection.
+ *
+ * @param size number of bytes to allocate
+ * @return whatever VirtualAlloc() returns (NULL on failure)
+ */
+void *allocate_executable_memory(size_t size)
+{
+    const DWORD type = MEM_COMMIT | MEM_RESERVE;
+
+    return VirtualAlloc(NULL, size, type, PAGE_EXECUTE_READWRITE);
+}
+
+
+/**
+ * @brief protect_executable_memory
+ *
+ * Switches a region to PAGE_EXECUTE_READ (drops write access).
+ * The result of VirtualProtect() is not checked.
+ *
+ * @param p    start of the region
+ * @param size length of the region in bytes
+ */
+void protect_executable_memory(void *p, size_t size)
+{
+    /* VirtualProtect() needs somewhere to store the previous protection;
+     * the value itself is not used. */
+    DWORD previous;
+
+    VirtualProtect(p, size, PAGE_EXECUTE_READ, &previous);
+}
+
+
+/**
+ * @brief flush_instruction_cache
+ *
+ * Flushes the current process's instruction cache for the given range.
+ *
+ * @param p    start of the range
+ * @param size length of the range in bytes
+ */
+void flush_instruction_cache(void *p, size_t size)
+{
+    HANDLE self = GetCurrentProcess();
+
+    FlushInstructionCache(self, p, size);
+}
--- a/xmrig.c
+++ b/xmrig.c
@ -4,8 +4,9 @@
 * Copyright 2014      Lucas Jones <https://github.com/lucasjones>
 * Copyright 2014-2016 Wolf9466    <https://github.com/OhGodAPet>
 * Copyright 2016      Jay D Dee   <jayddee246@gmail.com>
- * Copyright 2016-2017 XMRig       <support@xmrig.com>
- *
+ * Copyright 2017-2018 XMR-Stak    <https://github.com/fireice-uk>, <https://github.com/psychocrypt>
+ * Copyright 2018-2019 SChernykh   <https://github.com/SChernykh>
+ * Copyright 2016-2019 XMRig       <https://github.com/xmrig>, <support@xmrig.com>
 *
 *   This program is free software: you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
@ -260,7 +261,8 @@ static void *miner_thread(void *userdata) {
    uint32_t max_nonce;
    uint32_t end_nonce = 0xffffffffU / opt_n_threads * (thr_id + 1) - 0x20;

-    struct cryptonight_ctx *persistentctx = (struct cryptonight_ctx *) create_persistent_ctx(thr_id);
+    struct cryptonight_ctx *persistentctx[1];
+    create_cryptonight_ctx(persistentctx, thr_id);

    if (cpu_info.total_logical_cpus > 1 && opt_affinity != -1L) {
        affine_to_cpu_mask(thr_id, (unsigned long) opt_affinity);
@ -305,8 +307,10 @@ static void *miner_thread(void *userdata) {
        struct timeval tv_start;
        gettimeofday(&tv_start, NULL);

+        persistentctx[0]->height = work.height;
+
        /* scan nonces for a proof-of-work hash */
-        const int rc = scanhash_cryptonight(thr_id, hash, work.blob, work.blob_size, work.target, max_nonce, &hashes_done, persistentctx);
+        const int rc = scanhash_cryptonight(thr_id, hash, (uint8_t *) work.blob, work.blob_size, work.target, max_nonce, &hashes_done, persistentctx);
        stats_add_hashes(thr_id, &tv_start, hashes_done);

        if (!rc) {
@ -335,7 +339,8 @@ static void *miner_thread_double(void *userdata) {
    uint32_t max_nonce;
    uint32_t end_nonce = 0xffffffffU / opt_n_threads * (thr_id + 1) - 0x20;

-    struct cryptonight_ctx *persistentctx = (struct cryptonight_ctx *) create_persistent_ctx(thr_id);
+    struct cryptonight_ctx *persistentctx[2];
+    create_cryptonight_ctx(persistentctx, thr_id);

    if (cpu_info.total_logical_cpus > 1 && opt_affinity != -1L) {
        affine_to_cpu_mask(thr_id, (unsigned long) opt_affinity);
@ -389,6 +394,9 @@ static void *miner_thread_double(void *userdata) {
        struct timeval tv_start;
        gettimeofday(&tv_start, NULL);

+        persistentctx[0]->height = work.height;
+        persistentctx[1]->height = work.height;
+
        /* scan nonces for a proof-of-work hash */
        const int rc = scanhash_cryptonight_double(thr_id, (uint32_t *) double_hash, double_blob, work.blob_size, work.target, max_nonce, &hashes_done, persistentctx);
        stats_add_hashes(thr_id, &tv_start, hashes_done);
Author	SHA1	Message	Date
XMRig	d92c1a54de	Fixed macOS build.	5 years ago
XMRig	aa474fa51b	Fix compile warnings.	5 years ago
XMRig	7976059367	Add renaming ASM codes & update from upstream.	5 years ago
XMRig	c5cbd9d8fe	cn/r ASM support for --av 1.	5 years ago
XMRig	ef2e8bed6e	Use new style method to call ASM functions for cn/2 & added bulldozer ASM code.	5 years ago
XMRig	7574bfab60	Added self test for cn/r.	5 years ago
XMRig	27980f24f8	Plain C "cn/r" implementation.	5 years ago
XMRig	5e6a69e16f	Prepare for cn/r.	5 years ago
XMRig	69513e7049	Merge branch 'classic' into classic-dev	5 years ago
XMRig	b834c50aba	Merge branch 'classic-dev' into classic	6 years ago
xmrig	302ebe5a5b	Update CHANGELOG.md	6 years ago
XMRig	b9096f2392	Disable donation.	6 years ago
XMRig	b02f4ff163	Autodetect ASM without libcpuid.	6 years ago
XMRig	11748fad78	Add ASM code.	6 years ago
XMRig	e0dc51edf9	Fixed build without cn-lite.	6 years ago
XMRig	779238fc85	Add support for new style algorithm names.	6 years ago
XMRig	a06a224c0a	Implement --variant option.	6 years ago
XMRig	bf2eb1a685	Fix misaligned access.	6 years ago
XMRig	0bba8849f0	Fix Linux build.	6 years ago
XMRig	1e22a984af	Add double hash cn/2.	6 years ago
XMRig	61b49137c7	Add single hash cn/2.	6 years ago
XMRig	93d072ff6e	Massive refactoring, preparing for cn/2.	6 years ago
XMRig	f0b293f650	Add support for "nicehash" protocol extension.	6 years ago
XMRig	b93e7d9daa	Workaround for xmrig-proxy bug.	6 years ago
XMRig	0b4b07fcd6	v0.9.0-dev	6 years ago
XMRig	af62621169	Fix CURL detection.	6 years ago
XMRig	ed7260449a	v0.8.3	6 years ago
XMRig	33944595a2	Add Monero v7 support.	6 years ago