Browse Source

Merge branch 'classic-dev' into classic

pull/1618/head
XMRig 6 years ago
parent
commit
b834c50aba
  1. 9
      CHANGELOG.md
  2. 33
      CMakeLists.txt
  3. 2
      README.md
  4. 24
      algo/cryptonight-lite/cryptonight_lite_aesni.h
  5. 72
      algo/cryptonight-lite/cryptonight_lite_av1.c
  6. 108
      algo/cryptonight-lite/cryptonight_lite_av2.c
  7. 134
      algo/cryptonight-lite/cryptonight_lite_av3.c
  8. 78
      algo/cryptonight-lite/cryptonight_lite_av3_softaes.c
  9. 108
      algo/cryptonight-lite/cryptonight_lite_av4.c
  10. 19
      algo/cryptonight-lite/cryptonight_lite_softaes.h
  11. 264
      algo/cryptonight/cryptonight.c
  12. 28
      algo/cryptonight/cryptonight.h
  13. 24
      algo/cryptonight/cryptonight_aesni.h
  14. 247
      algo/cryptonight/cryptonight_av1.c
  15. 84
      algo/cryptonight/cryptonight_av1_aesni.c
  16. 304
      algo/cryptonight/cryptonight_av2.c
  17. 123
      algo/cryptonight/cryptonight_av2_aesni_double.c
  18. 193
      algo/cryptonight/cryptonight_av3.c
  19. 84
      algo/cryptonight/cryptonight_av3_softaes.c
  20. 304
      algo/cryptonight/cryptonight_av4.c
  21. 123
      algo/cryptonight/cryptonight_av4_softaes_double.c
  22. 89
      algo/cryptonight/cryptonight_monero.h
  23. 25
      algo/cryptonight/cryptonight_softaes.h
  24. 66
      algo/cryptonight/cryptonight_test.h
  25. 33
      cmake/asm.cmake
  26. 10
      cpu.c
  27. 7
      cpu.h
  28. 28
      cpu_stub.c
  29. 410
      crypto/asm/cnv2_double_main_loop_sandybridge.inc
  30. 37
      crypto/asm/cnv2_main_loop.S
  31. 25
      crypto/asm/cnv2_main_loop.asm
  32. 186
      crypto/asm/cnv2_main_loop_ivybridge.inc
  33. 179
      crypto/asm/cnv2_main_loop_ryzen.inc
  34. 21
      crypto/asm/win64/cnv2_main_loop.S
  35. 2
      donate.h
  36. 48
      memory.c
  37. 117
      options.c
  38. 57
      options.h
  39. 12
      persistent_memory.h
  40. 32
      stratum.c
  41. 7
      stratum.h
  42. 4
      utils/summary.c
  43. 12
      version.h
  44. 8
      xmrig.c

9
CHANGELOG.md

@ -1,3 +1,12 @@
# v0.9.0
- **[#753](https://github.com/xmrig/xmrig/issues/753) Added new algorithm [CryptoNight variant 2](https://github.com/xmrig/xmrig/issues/753) for Monero fork, thanks [@SChernykh](https://github.com/SChernykh).**
- Added option `--asm`, possible values `--asm auto`, `--asm none`, `--asm intel` and `--asm ryzen`.
- Added support for new style long and short algorithm names, possible values: `cryptonight`, `cryptonight/0`, `cryptonight/1`, `cryptonight/2`, `cryptonight-lite`, `cryptonight-lite/0`, `cryptonight-lite/1` and short equvalents `cn/2` etc.
- Added `--variant`, example `--algo cn --variant 2`, by default miner automaticaly detect proper variant for Monero by block version.
- Added CryptoNight-Lite variant 1.
- Added xmrig-proxy autodetection, nicehash will be enabled automaticaly.
- Added workaround for xmrig-proxy [bug](https://github.com/xmrig/xmrig-proxy/commit/dfa1960fe3eeb13f80717b7dbfcc7c6e9f222d89).
# v0.8.2
- Fixed L2 cache size detection for AMD CPUs (Bulldozer/Piledriver/Steamroller/Excavator architecture).
- Fixed gcc 7.1 support.

33
CMakeLists.txt

@ -3,6 +3,7 @@ project(xmrig C)
option(WITH_LIBCPUID "Use Libcpuid" ON)
option(WITH_AEON "CryptoNight-Lite support" ON)
option(WITH_ASM "Enable ASM PoW implementations" ON)
set(HEADERS
algo/cryptonight/cryptonight.h
@ -43,10 +44,10 @@ set(HEADERS_UTILS
set(SOURCES
xmrig.c
algo/cryptonight/cryptonight.c
algo/cryptonight/cryptonight_av1_aesni.c
algo/cryptonight/cryptonight_av2_aesni_double.c
algo/cryptonight/cryptonight_av3_softaes.c
algo/cryptonight/cryptonight_av4_softaes_double.c
algo/cryptonight/cryptonight_av1.c
algo/cryptonight/cryptonight_av2.c
algo/cryptonight/cryptonight_av3.c
algo/cryptonight/cryptonight_av4.c
util.c
options.c
stratum.c
@ -76,7 +77,7 @@ elseif (APPLE)
set(SOURCES_OS mac/cpu_mac.c mac/memory_mac.c mac/xmrig_mac.c)
else()
set(SOURCES_OS unix/cpu_unix.c unix/memory_unix.c unix/xmrig_unix.c)
set(EXTRA_LIBS pthread)
set(EXTRA_LIBS pthread rt m)
endif()
include_directories(.)
@ -91,9 +92,9 @@ endif()
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -maes -Wno-pointer-to-int-cast")
if (CMAKE_C_COMPILER_ID MATCHES "Clang")
set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} -Ofast -funroll-loops -fvariable-expansion-in-unroller -fmerge-all-constants")
set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} -Ofast -s -funroll-loops -fvariable-expansion-in-unroller -fmerge-all-constants")
else()
set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} -Ofast -funroll-loops -fvariable-expansion-in-unroller -ftree-loop-if-convert-stores -fmerge-all-constants -fbranch-target-load-optimize2")
set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} -Ofast -s -funroll-loops -fvariable-expansion-in-unroller -ftree-loop-if-convert-stores -fmerge-all-constants -fbranch-target-load-optimize2")
endif()
#set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} -gdwarf-2")
@ -125,12 +126,14 @@ else()
set(SOURCES_CPUID cpu_stub.c)
endif()
include(cmake/asm.cmake)
if (WITH_AEON)
set(SOURCES_AEON
algo/cryptonight-lite/cryptonight_lite_av1_aesni.c
algo/cryptonight-lite/cryptonight_lite_av2_aesni_double.c
algo/cryptonight-lite/cryptonight_lite_av3_softaes.c
algo/cryptonight-lite/cryptonight_lite_av4_softaes_double.c
algo/cryptonight-lite/cryptonight_lite_av1.c
algo/cryptonight-lite/cryptonight_lite_av2.c
algo/cryptonight-lite/cryptonight_lite_av3.c
algo/cryptonight-lite/cryptonight_lite_av4.c
algo/cryptonight-lite/cryptonight_lite_aesni.h
algo/cryptonight-lite/cryptonight_lite_softaes.h
)
@ -139,10 +142,10 @@ else()
endif()
if (CMAKE_SIZEOF_VOID_P EQUAL 8)
add_executable(xmrig ${HEADERS} ${HEADERS_CRYPTO} ${SOURCES} ${SOURCES_CRYPTO} ${HEADERS_UTILS} ${SOURCES_UTILS} ${HEADERS_COMPAT} ${SOURCES_COMPAT} ${SOURCES_OS} ${SOURCES_CPUID} ${SOURCES_AEON})
target_link_libraries(xmrig jansson curl ${CPUID_LIB} ${EXTRA_LIBS})
add_executable(xmrig ${HEADERS} ${HEADERS_CRYPTO} ${SOURCES} ${SOURCES_CRYPTO} ${HEADERS_UTILS} ${SOURCES_UTILS} ${HEADERS_COMPAT} ${SOURCES_COMPAT} ${SOURCES_OS} ${SOURCES_CPUID} ${SOURCES_AEON} ${XMRIG_ASM_SOURCES})
target_link_libraries(xmrig ${XMRIG_ASM_LIBRARY} jansson ${CURL_LIBRARY} ${CPUID_LIB} ${EXTRA_LIBS})
else()
add_executable(xmrig32 ${HEADERS} ${HEADERS_CRYPTO} ${SOURCES} ${SOURCES_CRYPTO} ${HEADERS_UTILS} ${SOURCES_UTILS} ${HEADERS_COMPAT} ${SOURCES_COMPAT} ${SOURCES_OS} ${SOURCES_CPUID} ${SOURCES_AEON})
target_link_libraries(xmrig32 jansson curl ${CPUID_LIB} ${EXTRA_LIBS})
add_executable(xmrig32 ${HEADERS} ${HEADERS_CRYPTO} ${SOURCES} ${SOURCES_CRYPTO} ${HEADERS_UTILS} ${SOURCES_UTILS} ${HEADERS_COMPAT} ${SOURCES_COMPAT} ${SOURCES_OS} ${SOURCES_CPUID} ${SOURCES_AEON} ${XMRIG_ASM_SOURCES})
target_link_libraries(xmrig32 ${XMRIG_ASM_LIBRARY} jansson ${CURL_LIBRARY} ${CPUID_LIB} ${EXTRA_LIBS})
endif()

2
README.md

@ -99,7 +99,7 @@ Configure options for libcurl:
```
CMake options:
```
cmake .. -G "Unix Makefiles" -DCMAKE_BUILD_TYPE=Release -DCURL_INCLUDE_DIR="c:\<path>\curl-7.53.1\include" -DCURL_LIBRARY="c:\<path>\curl-7.53.1\lib\.libs"
cmake .. -G "Unix Makefiles" -DCMAKE_BUILD_TYPE=Release -DCURL_INCLUDE_DIR="c:\xmrig-deps\gcc\x64\include" -DCURL_LIBRARY="c:\xmrig-deps\gcc\x64\lib\libcurl.a"
```
### Optional features

24
algo/cryptonight-lite/cryptonight_lite_aesni.h

@ -22,10 +22,12 @@
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef __CRYPTONIGHT_LITE_AESNI_H__
#define __CRYPTONIGHT_LITE_AESNI_H__
#ifndef XMRIG_CRYPTONIGHT_LITE_AESNI_H
#define XMRIG_CRYPTONIGHT_LITE_AESNI_H
#include <x86intrin.h>
#include <stdint.h>
#define aes_genkey_sub(imm8) \
@ -253,4 +255,20 @@ static inline uint64_t _umul128(uint64_t multiplier, uint64_t multiplicand, uint
#endif
#endif /* __CRYPTONIGHT_LITE_AESNI_H__ */
static inline void cryptonight_monero_tweak(uint64_t* mem_out, __m128i tmp)
{
mem_out[0] = EXTRACT64(tmp);
tmp = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
uint64_t vh = EXTRACT64(tmp);
uint8_t x = vh >> 24;
static const uint16_t table = 0x7531;
const uint8_t index = (((x >> 3) & 6) | (x & 1)) << 1;
vh ^= ((table >> index) & 0x3) << 28;
mem_out[1] = vh;
}
#endif /* XMRIG_CRYPTONIGHT_LITE_AESNI_H */

72
algo/cryptonight-lite/cryptonight_lite_av1_aesni.c → algo/cryptonight-lite/cryptonight_lite_av1.c

@ -27,18 +27,19 @@
#include <string.h>
#include "algo/cryptonight/cryptonight.h"
#include "cryptonight_lite_aesni.h"
#include "algo/cryptonight/cryptonight_monero.h"
#include "crypto/c_keccak.h"
#include "cryptonight_lite_aesni.h"
void cryptonight_lite_av1_aesni(const void *restrict input, size_t size, void *restrict output, struct cryptonight_ctx *restrict ctx, uint8_t version)
void cryptonight_lite_av1_v0(const uint8_t *restrict input, size_t size, uint8_t *restrict output, struct cryptonight_ctx **restrict ctx)
{
keccak((const uint8_t *) input, size, ctx->state0, 200);
keccak(input, size, ctx[0]->state, 200);
cn_explode_scratchpad((__m128i*) ctx->state0, (__m128i*) ctx->memory);
cn_explode_scratchpad((__m128i*) ctx[0]->state, (__m128i*) ctx[0]->memory);
const uint8_t* l0 = ctx->memory;
uint64_t* h0 = (uint64_t*) ctx->state0;
const uint8_t* l0 = ctx[0]->memory;
uint64_t* h0 = (uint64_t*) ctx[0]->state;
uint64_t al0 = h0[0] ^ h0[4];
uint64_t ah0 = h0[1] ^ h0[5];
@ -71,8 +72,63 @@ void cryptonight_lite_av1_aesni(const void *restrict input, size_t size, void *r
idx0 = al0;
}
cn_implode_scratchpad((__m128i*) ctx->memory, (__m128i*) ctx->state0);
cn_implode_scratchpad((__m128i*) ctx[0]->memory, (__m128i*) ctx[0]->state);
keccakf(h0, 24);
extra_hashes[ctx[0]->state[0] & 3](ctx[0]->state, 200, output);
}
void cryptonight_lite_av1_v1(const uint8_t *restrict input, size_t size, uint8_t *restrict output, struct cryptonight_ctx **restrict ctx)
{
if (size < 43) {
memset(output, 0, 32);
return;
}
keccak(input, size, ctx[0]->state, 200);
VARIANT1_INIT(0);
cn_explode_scratchpad((__m128i*) ctx[0]->state, (__m128i*) ctx[0]->memory);
const uint8_t* l0 = ctx[0]->memory;
uint64_t* h0 = (uint64_t*) ctx[0]->state;
uint64_t al0 = h0[0] ^ h0[4];
uint64_t ah0 = h0[1] ^ h0[5];
__m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]);
uint64_t idx0 = h0[0] ^ h0[4];
for (size_t i = 0; __builtin_expect(i < 0x40000, 1); i++) {
__m128i cx;
cx = _mm_load_si128((__m128i *) &l0[idx0 & 0xFFFF0]);
cx = _mm_aesenc_si128(cx, _mm_set_epi64x(ah0, al0));
cryptonight_monero_tweak((uint64_t*)&l0[idx0 & 0xFFFF0], _mm_xor_si128(bx0, cx));
idx0 = EXTRACT64(cx);
bx0 = cx;
uint64_t hi, lo, cl, ch;
cl = ((uint64_t*) &l0[idx0 & 0xFFFF0])[0];
ch = ((uint64_t*) &l0[idx0 & 0xFFFF0])[1];
lo = _umul128(idx0, cl, &hi);
al0 += hi;
ah0 += lo;
((uint64_t*)&l0[idx0 & 0xFFFF0])[0] = al0;
((uint64_t*)&l0[idx0 & 0xFFFF0])[1] = ah0 ^ tweak1_2_0;
ah0 ^= ch;
al0 ^= cl;
idx0 = al0;
}
cn_implode_scratchpad((__m128i*) ctx[0]->memory, (__m128i*) ctx[0]->state);
keccakf(h0, 24);
extra_hashes[ctx->state0[0] & 3](ctx->state0, 200, output);
extra_hashes[ctx[0]->state[0] & 3](ctx[0]->state, 200, output);
}

108
algo/cryptonight-lite/cryptonight_lite_av2_aesni_double.c → algo/cryptonight-lite/cryptonight_lite_av2.c

@ -27,19 +27,20 @@
#include <string.h>
#include "algo/cryptonight/cryptonight.h"
#include "algo/cryptonight/cryptonight_monero.h"
#include "cryptonight_lite_aesni.h"
#include "crypto/c_keccak.h"
void cryptonight_lite_av2_aesni_double(const void *restrict input, size_t size, void *restrict output, struct cryptonight_ctx *restrict ctx, uint8_t version)
void cryptonight_lite_av2_v0(const uint8_t *restrict input, size_t size, uint8_t *restrict output, struct cryptonight_ctx **restrict ctx)
{
keccak((const uint8_t *) input, size, ctx->state0, 200);
keccak((const uint8_t *) input + size, size, ctx->state1, 200);
keccak(input, size, ctx[0]->state, 200);
keccak(input + size, size, ctx[1]->state, 200);
const uint8_t* l0 = ctx->memory;
const uint8_t* l1 = ctx->memory + MEMORY_LITE;
uint64_t* h0 = (uint64_t*) ctx->state0;
uint64_t* h1 = (uint64_t*) ctx->state1;
const uint8_t* l0 = ctx[0]->memory;
const uint8_t* l1 = ctx[1]->memory;
uint64_t* h0 = (uint64_t*) ctx[0]->state;
uint64_t* h1 = (uint64_t*) ctx[1]->state;
cn_explode_scratchpad((__m128i*) h0, (__m128i*) l0);
cn_explode_scratchpad((__m128i*) h1, (__m128i*) l1);
@ -107,6 +108,95 @@ void cryptonight_lite_av2_aesni_double(const void *restrict input, size_t size,
keccakf(h0, 24);
keccakf(h1, 24);
extra_hashes[ctx->state0[0] & 3](ctx->state0, 200, output);
extra_hashes[ctx->state1[0] & 3](ctx->state1, 200, (char*) output + 32);
extra_hashes[ctx[0]->state[0] & 3](ctx[0]->state, 200, output);
extra_hashes[ctx[1]->state[0] & 3](ctx[1]->state, 200, (char*) output + 32);
}
void cryptonight_lite_av2_v1(const uint8_t *restrict input, size_t size, uint8_t *restrict output, struct cryptonight_ctx **restrict ctx)
{
if (size < 43) {
memset(output, 0, 64);
return;
}
keccak(input, size, ctx[0]->state, 200);
keccak(input + size, size, ctx[1]->state, 200);
VARIANT1_INIT(0);
VARIANT1_INIT(1);
const uint8_t* l0 = ctx[0]->memory;
const uint8_t* l1 = ctx[1]->memory;
uint64_t* h0 = (uint64_t*) ctx[0]->state;
uint64_t* h1 = (uint64_t*) ctx[1]->state;
cn_explode_scratchpad((__m128i*) h0, (__m128i*) l0);
cn_explode_scratchpad((__m128i*) h1, (__m128i*) l1);
uint64_t al0 = h0[0] ^ h0[4];
uint64_t al1 = h1[0] ^ h1[4];
uint64_t ah0 = h0[1] ^ h0[5];
uint64_t ah1 = h1[1] ^ h1[5];
__m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]);
__m128i bx1 = _mm_set_epi64x(h1[3] ^ h1[7], h1[2] ^ h1[6]);
uint64_t idx0 = h0[0] ^ h0[4];
uint64_t idx1 = h1[0] ^ h1[4];
for (size_t i = 0; __builtin_expect(i < 0x40000, 1); i++) {
__m128i cx0 = _mm_load_si128((__m128i *) &l0[idx0 & 0xFFFF0]);
__m128i cx1 = _mm_load_si128((__m128i *) &l1[idx1 & 0xFFFF0]);
cx0 = _mm_aesenc_si128(cx0, _mm_set_epi64x(ah0, al0));
cx1 = _mm_aesenc_si128(cx1, _mm_set_epi64x(ah1, al1));
cryptonight_monero_tweak((uint64_t*)&l0[idx0 & 0xFFFF0], _mm_xor_si128(bx0, cx0));
cryptonight_monero_tweak((uint64_t*)&l1[idx1 & 0xFFFF0], _mm_xor_si128(bx1, cx1));
idx0 = EXTRACT64(cx0);
idx1 = EXTRACT64(cx1);
bx0 = cx0;
bx1 = cx1;
uint64_t hi, lo, cl, ch;
cl = ((uint64_t*) &l0[idx0 & 0xFFFF0])[0];
ch = ((uint64_t*) &l0[idx0 & 0xFFFF0])[1];
lo = _umul128(idx0, cl, &hi);
al0 += hi;
ah0 += lo;
((uint64_t*) &l0[idx0 & 0xFFFF0])[0] = al0;
((uint64_t*) &l0[idx0 & 0xFFFF0])[1] = ah0 ^ tweak1_2_0;
ah0 ^= ch;
al0 ^= cl;
idx0 = al0;
cl = ((uint64_t*) &l1[idx1 & 0xFFFF0])[0];
ch = ((uint64_t*) &l1[idx1 & 0xFFFF0])[1];
lo = _umul128(idx1, cl, &hi);
al1 += hi;
ah1 += lo;
((uint64_t*) &l1[idx1 & 0xFFFF0])[0] = al1;
((uint64_t*) &l1[idx1 & 0xFFFF0])[1] = ah1 ^ tweak1_2_1;
ah1 ^= ch;
al1 ^= cl;
idx1 = al1;
}
cn_implode_scratchpad((__m128i*) l0, (__m128i*) h0);
cn_implode_scratchpad((__m128i*) l1, (__m128i*) h1);
keccakf(h0, 24);
keccakf(h1, 24);
extra_hashes[ctx[0]->state[0] & 3](ctx[0]->state, 200, output);
extra_hashes[ctx[1]->state[0] & 3](ctx[1]->state, 200, (char*) output + 32);
}

134
algo/cryptonight-lite/cryptonight_lite_av3.c

@ -0,0 +1,134 @@
/* XMRig
* Copyright 2010 Jeff Garzik <jgarzik@pobox.com>
* Copyright 2012-2014 pooler <pooler@litecoinpool.org>
* Copyright 2014 Lucas Jones <https://github.com/lucasjones>
* Copyright 2014-2016 Wolf9466 <https://github.com/OhGodAPet>
* Copyright 2016 Jay D Dee <jayddee246@gmail.com>
* Copyright 2017 fireice-uk <https://github.com/fireice-uk>
* Copyright 2017-2018 XMR-Stak <https://github.com/fireice-uk>, <https://github.com/psychocrypt>
* Copyright 2018 Lee Clagett <https://github.com/vtnerd>
* Copyright 2016-2018 XMRig <https://github.com/xmrig>, <support@xmrig.com>
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#include <x86intrin.h>
#include <string.h>
#include "algo/cryptonight/cryptonight.h"
#include "algo/cryptonight/cryptonight_monero.h"
#include "cryptonight_lite_softaes.h"
#include "crypto/c_keccak.h"
void cryptonight_lite_av3_v0(const uint8_t *restrict input, size_t size, uint8_t *restrict output, struct cryptonight_ctx **restrict ctx)
{
keccak(input, size, ctx[0]->state, 200);
cn_explode_scratchpad((__m128i*) ctx[0]->state, (__m128i*) ctx[0]->memory);
const uint8_t* l0 = ctx[0]->memory;
uint64_t* h0 = (uint64_t*) ctx[0]->state;
uint64_t al0 = h0[0] ^ h0[4];
uint64_t ah0 = h0[1] ^ h0[5];
__m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]);
uint64_t idx0 = h0[0] ^ h0[4];
for (size_t i = 0; __builtin_expect(i < 0x40000, 1); i++) {
__m128i cx;
cx = _mm_load_si128((__m128i *) &l0[idx0 & 0xFFFF0]);
cx = soft_aesenc(cx, _mm_set_epi64x(ah0, al0));
_mm_store_si128((__m128i *) &l0[idx0 & 0xFFFF0], _mm_xor_si128(bx0, cx));
idx0 = EXTRACT64(cx);
bx0 = cx;
uint64_t hi, lo, cl, ch;
cl = ((uint64_t*) &l0[idx0 & 0xFFFF0])[0];
ch = ((uint64_t*) &l0[idx0 & 0xFFFF0])[1];
lo = _umul128(idx0, cl, &hi);
al0 += hi;
ah0 += lo;
((uint64_t*)&l0[idx0 & 0xFFFF0])[0] = al0;
((uint64_t*)&l0[idx0 & 0xFFFF0])[1] = ah0;
ah0 ^= ch;
al0 ^= cl;
idx0 = al0;
}
cn_implode_scratchpad((__m128i*) ctx[0]->memory, (__m128i*) ctx[0]->state);
keccakf(h0, 24);
extra_hashes[ctx[0]->state[0] & 3](ctx[0]->state, 200, output);
}
void cryptonight_lite_av3_v1(const uint8_t *restrict input, size_t size, uint8_t *restrict output, struct cryptonight_ctx **restrict ctx)
{
if (size < 43) {
memset(output, 0, 32);
return;
}
keccak(input, size, ctx[0]->state, 200);
VARIANT1_INIT(0);
cn_explode_scratchpad((__m128i*) ctx[0]->state, (__m128i*) ctx[0]->memory);
const uint8_t* l0 = ctx[0]->memory;
uint64_t* h0 = (uint64_t*) ctx[0]->state;
uint64_t al0 = h0[0] ^ h0[4];
uint64_t ah0 = h0[1] ^ h0[5];
__m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]);
uint64_t idx0 = h0[0] ^ h0[4];
for (size_t i = 0; __builtin_expect(i < 0x40000, 1); i++) {
__m128i cx;
cx = _mm_load_si128((__m128i *) &l0[idx0 & 0xFFFF0]);
cx = soft_aesenc(cx, _mm_set_epi64x(ah0, al0));
cryptonight_monero_tweak((uint64_t*)&l0[idx0 & 0xFFFF0], _mm_xor_si128(bx0, cx));
idx0 = EXTRACT64(cx);
bx0 = cx;
uint64_t hi, lo, cl, ch;
cl = ((uint64_t*) &l0[idx0 & 0xFFFF0])[0];
ch = ((uint64_t*) &l0[idx0 & 0xFFFF0])[1];
lo = _umul128(idx0, cl, &hi);
al0 += hi;
ah0 += lo;
((uint64_t*)&l0[idx0 & 0xFFFF0])[0] = al0;
((uint64_t*)&l0[idx0 & 0xFFFF0])[1] = ah0 ^ tweak1_2_0;
ah0 ^= ch;
al0 ^= cl;
idx0 = al0;
}
cn_implode_scratchpad((__m128i*) ctx[0]->memory, (__m128i*) ctx[0]->state);
keccakf(h0, 24);
extra_hashes[ctx[0]->state[0] & 3](ctx[0]->state, 200, output);
}

78
algo/cryptonight-lite/cryptonight_lite_av3_softaes.c

@ -1,78 +0,0 @@
/* XMRig
* Copyright 2010 Jeff Garzik <jgarzik@pobox.com>
* Copyright 2012-2014 pooler <pooler@litecoinpool.org>
* Copyright 2014 Lucas Jones <https://github.com/lucasjones>
* Copyright 2014-2016 Wolf9466 <https://github.com/OhGodAPet>
* Copyright 2016 Jay D Dee <jayddee246@gmail.com>
* Copyright 2017 fireice-uk <https://github.com/fireice-uk>
* Copyright 2017-2018 XMR-Stak <https://github.com/fireice-uk>, <https://github.com/psychocrypt>
* Copyright 2018 Lee Clagett <https://github.com/vtnerd>
* Copyright 2016-2018 XMRig <https://github.com/xmrig>, <support@xmrig.com>
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#include <x86intrin.h>
#include <string.h>
#include "algo/cryptonight/cryptonight.h"
#include "cryptonight_lite_softaes.h"
#include "crypto/c_keccak.h"
void cryptonight_lite_av3_softaes(const void *restrict input, size_t size, void *restrict output, struct cryptonight_ctx *restrict ctx, uint8_t version)
{
keccak((const uint8_t *) input, size, ctx->state0, 200);
cn_explode_scratchpad((__m128i*) ctx->state0, (__m128i*) ctx->memory);
const uint8_t* l0 = ctx->memory;
uint64_t* h0 = (uint64_t*) ctx->state0;
uint64_t al0 = h0[0] ^ h0[4];
uint64_t ah0 = h0[1] ^ h0[5];
__m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]);
uint64_t idx0 = h0[0] ^ h0[4];
for (size_t i = 0; __builtin_expect(i < 0x40000, 1); i++) {
__m128i cx;
cx = _mm_load_si128((__m128i *)&l0[idx0 & 0xFFFF0]);
cx = soft_aesenc(cx, _mm_set_epi64x(ah0, al0));
_mm_store_si128((__m128i *)&l0[idx0 & 0xFFFF0], _mm_xor_si128(bx0, cx));
idx0 = EXTRACT64(cx);
bx0 = cx;
uint64_t hi, lo, cl, ch;
cl = ((uint64_t*)&l0[idx0 & 0xFFFF0])[0];
ch = ((uint64_t*)&l0[idx0 & 0xFFFF0])[1];
lo = _umul128(idx0, cl, &hi);
al0 += hi;
ah0 += lo;
((uint64_t*)&l0[idx0 & 0xFFFF0])[0] = al0;
((uint64_t*)&l0[idx0 & 0xFFFF0])[1] = ah0;
ah0 ^= ch;
al0 ^= cl;
idx0 = al0;
}
cn_implode_scratchpad((__m128i*) ctx->memory, (__m128i*) ctx->state0);
keccakf(h0, 24);
extra_hashes[ctx->state0[0] & 3](ctx->state0, 200, output);
}

108
algo/cryptonight-lite/cryptonight_lite_av4_softaes_double.c → algo/cryptonight-lite/cryptonight_lite_av4.c

@ -27,19 +27,20 @@
#include <string.h>
#include "algo/cryptonight/cryptonight.h"
#include "algo/cryptonight/cryptonight_monero.h"
#include "cryptonight_lite_softaes.h"
#include "crypto/c_keccak.h"
void cryptonight_lite_av4_softaes_double(const void *restrict input, size_t size, void *restrict output, struct cryptonight_ctx *restrict ctx, uint8_t version)
void cryptonight_lite_av4_v0(const void *restrict input, size_t size, void *restrict output, struct cryptonight_ctx **restrict ctx)
{
keccak((const uint8_t *) input, size, ctx->state0, 200);
keccak((const uint8_t *) input + size, size, ctx->state1, 200);
keccak(input, size, ctx[0]->state, 200);
keccak(input + size, size, ctx[1]->state, 200);
const uint8_t* l0 = ctx->memory;
const uint8_t* l1 = ctx->memory + MEMORY_LITE;
uint64_t* h0 = (uint64_t*) ctx->state0;
uint64_t* h1 = (uint64_t*) ctx->state1;
const uint8_t* l0 = ctx[0]->memory;
const uint8_t* l1 = ctx[1]->memory;
uint64_t* h0 = (uint64_t*) ctx[0]->state;
uint64_t* h1 = (uint64_t*) ctx[1]->state;
cn_explode_scratchpad((__m128i*) h0, (__m128i*) l0);
cn_explode_scratchpad((__m128i*) h1, (__m128i*) l1);
@ -107,6 +108,95 @@ void cryptonight_lite_av4_softaes_double(const void *restrict input, size_t size
keccakf(h0, 24);
keccakf(h1, 24);
extra_hashes[ctx->state0[0] & 3](ctx->state0, 200, output);
extra_hashes[ctx->state1[0] & 3](ctx->state1, 200, (char*) output + 32);
extra_hashes[ctx[0]->state[0] & 3](ctx[0]->state, 200, output);
extra_hashes[ctx[1]->state[0] & 3](ctx[1]->state, 200, output + 32);
}
void cryptonight_lite_av4_v1(const void *restrict input, size_t size, void *restrict output, struct cryptonight_ctx **restrict ctx)
{
if (size < 43) {
memset(output, 0, 64);
return;
}
keccak(input, size, ctx[0]->state, 200);
keccak(input + size, size, ctx[1]->state, 200);
VARIANT1_INIT(0);
VARIANT1_INIT(1);
const uint8_t* l0 = ctx[0]->memory;
const uint8_t* l1 = ctx[1]->memory;
uint64_t* h0 = (uint64_t*) ctx[0]->state;
uint64_t* h1 = (uint64_t*) ctx[1]->state;
cn_explode_scratchpad((__m128i*) h0, (__m128i*) l0);
cn_explode_scratchpad((__m128i*) h1, (__m128i*) l1);
uint64_t al0 = h0[0] ^ h0[4];
uint64_t al1 = h1[0] ^ h1[4];
uint64_t ah0 = h0[1] ^ h0[5];
uint64_t ah1 = h1[1] ^ h1[5];
__m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]);
__m128i bx1 = _mm_set_epi64x(h1[3] ^ h1[7], h1[2] ^ h1[6]);
uint64_t idx0 = h0[0] ^ h0[4];
uint64_t idx1 = h1[0] ^ h1[4];
for (size_t i = 0; __builtin_expect(i < 0x40000, 1); i++) {
__m128i cx0 = _mm_load_si128((__m128i *) &l0[idx0 & 0xFFFF0]);
__m128i cx1 = _mm_load_si128((__m128i *) &l1[idx1 & 0xFFFF0]);
cx0 = soft_aesenc(cx0, _mm_set_epi64x(ah0, al0));
cx1 = soft_aesenc(cx1, _mm_set_epi64x(ah1, al1));
cryptonight_monero_tweak((uint64_t*)&l0[idx0 & 0xFFFF0], _mm_xor_si128(bx0, cx0));
cryptonight_monero_tweak((uint64_t*)&l1[idx1 & 0xFFFF0], _mm_xor_si128(bx1, cx1));
idx0 = EXTRACT64(cx0);
idx1 = EXTRACT64(cx1);
bx0 = cx0;
bx1 = cx1;
uint64_t hi, lo, cl, ch;
cl = ((uint64_t*) &l0[idx0 & 0xFFFF0])[0];
ch = ((uint64_t*) &l0[idx0 & 0xFFFF0])[1];
lo = _umul128(idx0, cl, &hi);
al0 += hi;
ah0 += lo;
((uint64_t*) &l0[idx0 & 0xFFFF0])[0] = al0;
((uint64_t*) &l0[idx0 & 0xFFFF0])[1] = ah0 ^ tweak1_2_0;
ah0 ^= ch;
al0 ^= cl;
idx0 = al0;
cl = ((uint64_t*) &l1[idx1 & 0xFFFF0])[0];
ch = ((uint64_t*) &l1[idx1 & 0xFFFF0])[1];
lo = _umul128(idx1, cl, &hi);
al1 += hi;
ah1 += lo;
((uint64_t*) &l1[idx1 & 0xFFFF0])[0] = al1;
((uint64_t*) &l1[idx1 & 0xFFFF0])[1] = ah1 ^ tweak1_2_1;
ah1 ^= ch;
al1 ^= cl;
idx1 = al1;
}
cn_implode_scratchpad((__m128i*) l0, (__m128i*) h0);
cn_implode_scratchpad((__m128i*) l1, (__m128i*) h1);
keccakf(h0, 24);
keccakf(h1, 24);
extra_hashes[ctx[0]->state[0] & 3](ctx[0]->state, 200, output);
extra_hashes[ctx[1]->state[0] & 3](ctx[1]->state, 200, (char*) output + 32);
}

19
algo/cryptonight-lite/cryptonight_lite_softaes.h

@ -25,7 +25,10 @@
#ifndef __CRYPTONIGHT_LITE_SOFTAES_H__
#define __CRYPTONIGHT_LITE_SOFTAES_H__
#include <x86intrin.h>
#include <stdint.h>
extern __m128i soft_aesenc(__m128i in, __m128i key);
extern __m128i soft_aeskeygenassist(__m128i key, uint8_t rcon);
@ -234,4 +237,20 @@ static inline uint64_t _umul128(uint64_t multiplier, uint64_t multiplicand, uint
#endif
static inline void cryptonight_monero_tweak(uint64_t* mem_out, __m128i tmp)
{
mem_out[0] = EXTRACT64(tmp);
tmp = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
uint64_t vh = EXTRACT64(tmp);
uint8_t x = vh >> 24;
static const uint16_t table = 0x7531;
const uint8_t index = (((x >> 3) & 6) | (x & 1)) << 1;
vh ^= ((table >> index) & 0x3) << 28;
mem_out[1] = vh;
}
#endif /* __CRYPTONIGHT_LITE_SOFTAES_H__ */

264
algo/cryptonight/cryptonight.c

@ -23,14 +23,17 @@
*/
#include <assert.h>
#include <stdlib.h>
#include <string.h>
#include <mm_malloc.h>
#ifndef BUILD_TEST
# include "xmrig.h"
#endif
#include "cpu.h"
#include "crypto/c_blake256.h"
#include "crypto/c_groestl.h"
#include "crypto/c_jh.h"
@ -40,112 +43,192 @@
#include "options.h"
void cryptonight_av1_aesni(const void* input, size_t size, void* output, struct cryptonight_ctx* ctx, uint8_t version);
void cryptonight_av2_aesni_double(const void* input, size_t size, void* output, struct cryptonight_ctx* ctx, uint8_t version);
void cryptonight_av3_softaes(const void* input, size_t size, void* output, struct cryptonight_ctx* ctx, uint8_t version);
void cryptonight_av4_softaes_double(const void* input, size_t size, void* output, struct cryptonight_ctx* ctx, uint8_t version);
void cryptonight_av1_v0(const uint8_t *input, size_t size, uint8_t *output, struct cryptonight_ctx **ctx);
void cryptonight_av1_v1(const uint8_t *input, size_t size, uint8_t *output, struct cryptonight_ctx **ctx);
void cryptonight_av1_v2(const uint8_t *input, size_t size, uint8_t *output, struct cryptonight_ctx **ctx);
void cryptonight_av2_v0(const uint8_t *input, size_t size, uint8_t *output, struct cryptonight_ctx **ctx);
void cryptonight_av2_v1(const uint8_t *input, size_t size, uint8_t *output, struct cryptonight_ctx **ctx);
void cryptonight_av2_v2(const uint8_t *input, size_t size, uint8_t *output, struct cryptonight_ctx **ctx);
void cryptonight_av3_v0(const uint8_t *input, size_t size, uint8_t *output, struct cryptonight_ctx **ctx);
void cryptonight_av3_v1(const uint8_t *input, size_t size, uint8_t *output, struct cryptonight_ctx **ctx);
void cryptonight_av3_v2(const uint8_t *input, size_t size, uint8_t *output, struct cryptonight_ctx **ctx);
void cryptonight_av4_v0(const uint8_t *input, size_t size, uint8_t *output, struct cryptonight_ctx **ctx);
void cryptonight_av4_v1(const uint8_t *input, size_t size, uint8_t *output, struct cryptonight_ctx **ctx);
void cryptonight_av4_v2(const uint8_t *input, size_t size, uint8_t *output, struct cryptonight_ctx **ctx);
#ifndef XMRIG_NO_AEON
void cryptonight_lite_av1_aesni(const void* input, size_t size, void* output, struct cryptonight_ctx* ctx, uint8_t);
void cryptonight_lite_av2_aesni_double(const void* input, size_t size, void* output, struct cryptonight_ctx* ctx, uint8_t);
void cryptonight_lite_av3_softaes(const void* input, size_t size, void* output, struct cryptonight_ctx* ctx, uint8_t);
void cryptonight_lite_av4_softaes_double(const void* input, size_t size, void* output, struct cryptonight_ctx* ctx, uint8_t);
void cryptonight_lite_av1_v0(const uint8_t *input, size_t size, uint8_t *output, struct cryptonight_ctx **ctx);
void cryptonight_lite_av1_v1(const uint8_t *input, size_t size, uint8_t *output, struct cryptonight_ctx **ctx);
void cryptonight_lite_av2_v0(const uint8_t *input, size_t size, uint8_t *output, struct cryptonight_ctx **ctx);
void cryptonight_lite_av2_v1(const uint8_t *input, size_t size, uint8_t *output, struct cryptonight_ctx **ctx);
void cryptonight_lite_av3_v0(const uint8_t *input, size_t size, uint8_t *output, struct cryptonight_ctx **ctx);
void cryptonight_lite_av3_v1(const uint8_t *input, size_t size, uint8_t *output, struct cryptonight_ctx **ctx);
void cryptonight_lite_av4_v0(const uint8_t *input, size_t size, uint8_t *output, struct cryptonight_ctx **ctx);
void cryptonight_lite_av4_v1(const uint8_t *input, size_t size, uint8_t *output, struct cryptonight_ctx **ctx);
#endif
void (*cryptonight_hash_ctx)(const void* input, size_t size, void* output, struct cryptonight_ctx* ctx, uint8_t version) = NULL;
#ifndef XMRIG_NO_ASM
void cryptonight_single_hash_asm_intel(const uint8_t *input, size_t size, uint8_t *output, struct cryptonight_ctx **ctx);
void cryptonight_single_hash_asm_ryzen(const uint8_t *input, size_t size, uint8_t *output, struct cryptonight_ctx **ctx);
void cryptonight_double_hash_asm(const uint8_t *input, size_t size, uint8_t *output, struct cryptonight_ctx **ctx);
#endif
static bool self_test() {
if (cryptonight_hash_ctx == NULL) {
static inline bool verify(enum Variant variant, uint8_t *output, struct cryptonight_ctx **ctx, const uint8_t *referenceValue)
{
cn_hash_fun func = cryptonight_hash_fn(opt_algo, opt_av, variant);
if (func == NULL) {
return false;
}
char output[64];
func(test_input, 76, output, ctx);
struct cryptonight_ctx *ctx = (struct cryptonight_ctx*) _mm_malloc(sizeof(struct cryptonight_ctx), 16);
ctx->memory = (uint8_t *) _mm_malloc(MEMORY * 2, 16);
return memcmp(output, referenceValue, opt_double_hash ? 64 : 32) == 0;
}
cryptonight_hash_ctx(test_input, 76, output, ctx, 0);
# ifndef XMRIG_NO_AEON
bool rc = memcmp(output, opt_algo == ALGO_CRYPTONIGHT_LITE ? test_output1 : test_output0, (opt_double_hash ? 64 : 32)) == 0;
# else
bool rc = memcmp(output, test_output0, opt_double_hash ? 64 : 32)) == 0;
# endif
static bool self_test() {
struct cryptonight_ctx *ctx[2];
uint8_t output[64];
if (rc && opt_algo == ALGO_CRYPTONIGHT) {
cryptonight_hash_ctx(test_input, 76, output, ctx, 7);
const size_t count = opt_double_hash ? 2 : 1;
const size_t size = opt_algo == ALGO_CRYPTONIGHT ? MEMORY : MEMORY_LITE;
bool result = false;
rc = memcmp(output, test_output2, (opt_double_hash ? 64 : 32)) == 0;
for (int i = 0; i < count; ++i) {
ctx[i] = _mm_malloc(sizeof(struct cryptonight_ctx), 16);
ctx[i]->memory = _mm_malloc(size, 16);
}
_mm_free(ctx->memory);
_mm_free(ctx);
if (opt_algo == ALGO_CRYPTONIGHT) {
result = verify(VARIANT_0, output, ctx, test_output_v0) &&
verify(VARIANT_1, output, ctx, test_output_v1) &&
verify(VARIANT_2, output, ctx, test_output_v2);
}
# ifndef XMRIG_NO_AEON
else {
result = verify(VARIANT_0, output, ctx, test_output_v0_lite) &&
verify(VARIANT_1, output, ctx, test_output_v1_lite);
}
# endif
return rc;
}
for (int i = 0; i < count; ++i) {
_mm_free(ctx[i]->memory);
_mm_free(ctx[i]);
}
#ifndef XMRIG_NO_AEON
bool cryptonight_lite_init(int variant) {
switch (variant) {
case AEON_AV1_AESNI:
cryptonight_hash_ctx = cryptonight_lite_av1_aesni;
break;
return result;
}
case AEON_AV2_AESNI_DOUBLE:
opt_double_hash = true;
cryptonight_hash_ctx = cryptonight_lite_av2_aesni_double;
break;
case AEON_AV3_SOFT_AES:
cryptonight_hash_ctx = cryptonight_lite_av3_softaes;
break;
size_t fn_index(enum Algo algorithm, enum AlgoVariant av, enum Variant variant, enum Assembly assembly)
{
const size_t index = VARIANT_MAX * 4 * algorithm + 4 * variant + av - 1;
case AEON_AV4_SOFT_AES_DOUBLE:
opt_double_hash = true;
cryptonight_hash_ctx = cryptonight_lite_av4_softaes_double;
break;
# ifndef XMRIG_NO_ASM
if (assembly == ASM_AUTO) {
assembly = cpu_info.assembly;
}
default:
break;
if (assembly == ASM_NONE) {
return index;
}
return self_test();
}
#endif
const size_t offset = VARIANT_MAX * 4 * 2;
if (algorithm == ALGO_CRYPTONIGHT && variant == VARIANT_2) {
if (av == AV_SINGLE) {
return offset + assembly - 2;
}
bool cryptonight_init(int variant)
{
# ifndef XMRIG_NO_AEON
if (opt_algo == ALGO_CRYPTONIGHT_LITE) {
return cryptonight_lite_init(variant);
if (av == AV_DOUBLE) {
return offset + 2;
}
}
# endif
switch (variant) {
case XMR_AV1_AESNI:
cryptonight_hash_ctx = cryptonight_av1_aesni;
break;
return index;
}
case XMR_AV2_AESNI_DOUBLE:
opt_double_hash = true;
cryptonight_hash_ctx = cryptonight_av2_aesni_double;
break;
case XMR_AV3_SOFT_AES:
cryptonight_hash_ctx = cryptonight_av3_softaes;
break;
cn_hash_fun cryptonight_hash_fn(enum Algo algorithm, enum AlgoVariant av, enum Variant variant)
{
assert(av > AV_AUTO && av < AV_MAX);
assert(variant > VARIANT_AUTO && variant < VARIANT_MAX);
case XMR_AV4_SOFT_AES_DOUBLE:
opt_double_hash = true;
cryptonight_hash_ctx = cryptonight_av4_softaes_double;
break;
# ifndef XMRIG_NO_ASM
static const cn_hash_fun func_table[VARIANT_MAX * 4 * 2 + 3] = {
# else
static const cn_hash_fun func_table[VARIANT_MAX * 4 * 2] = {
# endif
cryptonight_av1_v0,
cryptonight_av2_v0,
cryptonight_av3_v0,
cryptonight_av4_v0,
cryptonight_av1_v1,
cryptonight_av2_v1,
cryptonight_av3_v1,
cryptonight_av4_v1,
cryptonight_av1_v2,
cryptonight_av2_v2,
cryptonight_av3_v2,
cryptonight_av4_v2,
# ifndef XMRIG_NO_AEON
cryptonight_lite_av1_v0,
cryptonight_lite_av2_v0,
cryptonight_lite_av3_v0,
cryptonight_lite_av4_v0,
cryptonight_lite_av1_v1,
cryptonight_lite_av2_v1,
cryptonight_lite_av3_v1,
cryptonight_lite_av4_v1,
NULL,
NULL,
NULL,
NULL,
# else
NULL,
NULL,
NULL,
NULL,
NULL,
NULL,
NULL,
NULL,
NULL,
NULL,
NULL,
NULL,
# endif
# ifndef XMRIG_NO_ASM
cryptonight_single_hash_asm_intel,
cryptonight_single_hash_asm_ryzen,
cryptonight_double_hash_asm
# endif
};
# ifndef NDEBUG
const size_t index = fn_index(algorithm, av, variant, opt_assembly);
cn_hash_fun func = func_table[index];
assert(index < sizeof(func_table) / sizeof(func_table[0]));
assert(func != NULL);
return func;
# else
return func_table[fn_index(algorithm, av, variant, opt_assembly)];
# endif
}
default:
break;
}
bool cryptonight_init(int av)
{
opt_double_hash = av == AV_DOUBLE || av == AV_DOUBLE_SOFT;
return self_test();
}
@ -174,12 +257,32 @@ static inline void do_skein_hash(const void* input, size_t len, char* output) {
void (* const extra_hashes[4])(const void *, size_t, char *) = {do_blake_hash, do_groestl_hash, do_jh_hash, do_skein_hash};
static inline enum Variant cryptonight_variant(uint8_t version)
{
if (opt_variant != VARIANT_AUTO) {
return opt_variant;
}
if (opt_algo == ALGO_CRYPTONIGHT_LITE) {
return VARIANT_1;
}
if (version >= 8) {
return VARIANT_2;
}
return version == 7 ? VARIANT_1 : VARIANT_0;
}
#ifndef BUILD_TEST
int scanhash_cryptonight(int thr_id, uint32_t *hash, uint32_t *restrict blob, size_t blob_size, uint32_t target, uint32_t max_nonce, unsigned long *restrict hashes_done, struct cryptonight_ctx *restrict ctx) {
uint32_t *nonceptr = (uint32_t*) (((char*) blob) + 39);
int scanhash_cryptonight(int thr_id, uint32_t *hash, const uint8_t *restrict blob, size_t blob_size, uint32_t target, uint32_t max_nonce, unsigned long *restrict hashes_done, struct cryptonight_ctx **restrict ctx) {
uint32_t *nonceptr = (uint32_t*) (((char*) blob) + 39);
enum Variant variant = cryptonight_variant(blob[0]);
do {
cryptonight_hash_ctx(blob, blob_size, hash, ctx, ((uint8_t*) blob)[0]);
cryptonight_hash_fn(opt_algo, opt_av, variant)(blob, blob_size, (uint8_t *) hash, ctx);
(*hashes_done)++;
if (unlikely(hash[7] < target)) {
@ -193,13 +296,14 @@ int scanhash_cryptonight(int thr_id, uint32_t *hash, uint32_t *restrict blob, si
}
int scanhash_cryptonight_double(int thr_id, uint32_t *hash, uint8_t *restrict blob, size_t blob_size, uint32_t target, uint32_t max_nonce, unsigned long *restrict hashes_done, struct cryptonight_ctx *restrict ctx) {
int rc = 0;
uint32_t *nonceptr0 = (uint32_t*) (((char*) blob) + 39);
uint32_t *nonceptr1 = (uint32_t*) (((char*) blob) + 39 + blob_size);
int scanhash_cryptonight_double(int thr_id, uint32_t *hash, const uint8_t *restrict blob, size_t blob_size, uint32_t target, uint32_t max_nonce, unsigned long *restrict hashes_done, struct cryptonight_ctx **restrict ctx) {
int rc = 0;
uint32_t *nonceptr0 = (uint32_t*) (((char*) blob) + 39);
uint32_t *nonceptr1 = (uint32_t*) (((char*) blob) + 39 + blob_size);
enum Variant variant = cryptonight_variant(blob[0]);
do {
cryptonight_hash_ctx(blob, blob_size, hash, ctx, ((uint8_t*) blob)[0]);
cryptonight_hash_fn(opt_algo, opt_av, variant)(blob, blob_size, (uint8_t *) hash, ctx);
(*hashes_done) += 2;
if (unlikely(hash[7] < target)) {

28
algo/cryptonight/cryptonight.h

@ -22,27 +22,37 @@
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef __CRYPTONIGHT_H__
#define __CRYPTONIGHT_H__
#ifndef XMRIG_CRYPTONIGHT_H
#define XMRIG_CRYPTONIGHT_H
#include <stddef.h>
#include <stdint.h>
#include <stdbool.h>
#include "options.h"
#define MEMORY 2097152 /* 2 MiB */
#define MEMORY_LITE 1048576 /* 1 MiB */
struct cryptonight_ctx {
uint8_t state0[200] __attribute__((aligned(16)));
uint8_t state1[200] __attribute__((aligned(16)));
uint8_t* memory __attribute__((aligned(16)));
uint8_t state[224] __attribute__((aligned(16)));
uint8_t* memory __attribute__((aligned(16)));
};
typedef void (*cn_hash_fun)(const uint8_t *input, size_t size, uint8_t *output, struct cryptonight_ctx **ctx);
extern void (* const extra_hashes[4])(const void *, size_t, char *);
bool cryptonight_init(int variant);
int scanhash_cryptonight(int thr_id, uint32_t *hash, uint32_t *restrict blob, size_t blob_size, uint32_t target, uint32_t max_nonce, unsigned long *restrict hashes_done, struct cryptonight_ctx *restrict ctx);
int scanhash_cryptonight_double(int thr_id, uint32_t *hash, uint8_t *restrict blob, size_t blob_size, uint32_t target, uint32_t max_nonce, unsigned long *restrict hashes_done, struct cryptonight_ctx *restrict ctx);
cn_hash_fun cryptonight_hash_fn(enum Algo algorithm, enum AlgoVariant av, enum Variant variant);
bool cryptonight_init(int av);
int scanhash_cryptonight(int thr_id, uint32_t *hash, const uint8_t *restrict blob, size_t blob_size, uint32_t target, uint32_t max_nonce, unsigned long *restrict hashes_done, struct cryptonight_ctx **restrict ctx);
int scanhash_cryptonight_double(int thr_id, uint32_t *hash, const uint8_t *restrict blob, size_t blob_size, uint32_t target, uint32_t max_nonce, unsigned long *restrict hashes_done, struct cryptonight_ctx **restrict ctx);
#endif /* __CRYPTONIGHT_H__ */
#endif /* XMRIG_CRYPTONIGHT_H */

24
algo/cryptonight/cryptonight_aesni.h

@ -22,10 +22,12 @@
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef __CRYPTONIGHT_AESNI_H__
#define __CRYPTONIGHT_AESNI_H__
#ifndef XMRIG_CRYPTONIGHT_AESNI_H
#define XMRIG_CRYPTONIGHT_AESNI_H
#include <x86intrin.h>
#include <stdint.h>
#define aes_genkey_sub(imm8) \
@ -253,4 +255,20 @@ static inline uint64_t _umul128(uint64_t multiplier, uint64_t multiplicand, uint
#endif
#endif /* __CRYPTONIGHT_AESNI_H__ */
static inline void cryptonight_monero_tweak(uint64_t* mem_out, __m128i tmp)
{
mem_out[0] = EXTRACT64(tmp);
tmp = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
uint64_t vh = EXTRACT64(tmp);
uint8_t x = vh >> 24;
static const uint16_t table = 0x7531;
const uint8_t index = (((x >> 3) & 6) | (x & 1)) << 1;
vh ^= ((table >> index) & 0x3) << 28;
mem_out[1] = vh;
}
#endif /* XMRIG_CRYPTONIGHT_AESNI_H */

247
algo/cryptonight/cryptonight_av1.c

@ -0,0 +1,247 @@
/* XMRig
* Copyright 2010 Jeff Garzik <jgarzik@pobox.com>
* Copyright 2012-2014 pooler <pooler@litecoinpool.org>
* Copyright 2014 Lucas Jones <https://github.com/lucasjones>
* Copyright 2014-2016 Wolf9466 <https://github.com/OhGodAPet>
* Copyright 2016 Jay D Dee <jayddee246@gmail.com>
* Copyright 2017 fireice-uk <https://github.com/fireice-uk>
* Copyright 2017-2018 XMR-Stak <https://github.com/fireice-uk>, <https://github.com/psychocrypt>
* Copyright 2018 Lee Clagett <https://github.com/vtnerd>
* Copyright 2016-2018 XMRig <https://github.com/xmrig>, <support@xmrig.com>
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#include <x86intrin.h>
#include <string.h>
#include "crypto/c_keccak.h"
#include "cryptonight.h"
#include "cryptonight_aesni.h"
#include "cryptonight_monero.h"
void cryptonight_av1_v0(const uint8_t *restrict input, size_t size, uint8_t *restrict output, struct cryptonight_ctx **restrict ctx)
{
keccak(input, size, ctx[0]->state, 200);
cn_explode_scratchpad((__m128i*) ctx[0]->state, (__m128i*) ctx[0]->memory);
const uint8_t* l0 = ctx[0]->memory;
uint64_t* h0 = (uint64_t*) ctx[0]->state;
uint64_t al0 = h0[0] ^ h0[4];
uint64_t ah0 = h0[1] ^ h0[5];
__m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]);
uint64_t idx0 = h0[0] ^ h0[4];
for (size_t i = 0; __builtin_expect(i < 0x80000, 1); i++) {
__m128i cx;
cx = _mm_load_si128((__m128i *) &l0[idx0 & 0x1FFFF0]);
cx = _mm_aesenc_si128(cx, _mm_set_epi64x(ah0, al0));
_mm_store_si128((__m128i *) &l0[idx0 & 0x1FFFF0], _mm_xor_si128(bx0, cx));
idx0 = EXTRACT64(cx);
bx0 = cx;
uint64_t hi, lo, cl, ch;
cl = ((uint64_t*) &l0[idx0 & 0x1FFFF0])[0];
ch = ((uint64_t*) &l0[idx0 & 0x1FFFF0])[1];
lo = _umul128(idx0, cl, &hi);
al0 += hi;
ah0 += lo;
((uint64_t*)&l0[idx0 & 0x1FFFF0])[0] = al0;
((uint64_t*)&l0[idx0 & 0x1FFFF0])[1] = ah0;
ah0 ^= ch;
al0 ^= cl;
idx0 = al0;
}
cn_implode_scratchpad((__m128i*) ctx[0]->memory, (__m128i*) ctx[0]->state);
keccakf(h0, 24);
extra_hashes[ctx[0]->state[0] & 3](ctx[0]->state, 200, output);
}
void cryptonight_av1_v1(const uint8_t *restrict input, size_t size, uint8_t *restrict output, struct cryptonight_ctx **restrict ctx)
{
if (size < 43) {
memset(output, 0, 32);
return;
}
keccak(input, size, ctx[0]->state, 200);
VARIANT1_INIT(0);
cn_explode_scratchpad((__m128i*) ctx[0]->state, (__m128i*) ctx[0]->memory);
const uint8_t* l0 = ctx[0]->memory;
uint64_t* h0 = (uint64_t*) ctx[0]->state;
uint64_t al0 = h0[0] ^ h0[4];
uint64_t ah0 = h0[1] ^ h0[5];
__m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]);
uint64_t idx0 = h0[0] ^ h0[4];
for (size_t i = 0; __builtin_expect(i < 0x80000, 1); i++) {
__m128i cx;
cx = _mm_load_si128((__m128i *) &l0[idx0 & 0x1FFFF0]);
cx = _mm_aesenc_si128(cx, _mm_set_epi64x(ah0, al0));
cryptonight_monero_tweak((uint64_t*)&l0[idx0 & 0x1FFFF0], _mm_xor_si128(bx0, cx));
idx0 = EXTRACT64(cx);
bx0 = cx;
uint64_t hi, lo, cl, ch;
cl = ((uint64_t*) &l0[idx0 & 0x1FFFF0])[0];
ch = ((uint64_t*) &l0[idx0 & 0x1FFFF0])[1];
lo = _umul128(idx0, cl, &hi);
al0 += hi;
ah0 += lo;
((uint64_t*)&l0[idx0 & 0x1FFFF0])[0] = al0;
((uint64_t*)&l0[idx0 & 0x1FFFF0])[1] = ah0 ^ tweak1_2_0;
ah0 ^= ch;
al0 ^= cl;
idx0 = al0;
}
cn_implode_scratchpad((__m128i*) ctx[0]->memory, (__m128i*) ctx[0]->state);
keccakf(h0, 24);
extra_hashes[ctx[0]->state[0] & 3](ctx[0]->state, 200, output);
}
void cryptonight_av1_v2(const uint8_t *restrict input, size_t size, uint8_t *restrict output, struct cryptonight_ctx **restrict ctx)
{
keccak(input, size, ctx[0]->state, 200);
cn_explode_scratchpad((__m128i*) ctx[0]->state, (__m128i*) ctx[0]->memory);
const uint8_t* l0 = ctx[0]->memory;
uint64_t* h0 = (uint64_t*) ctx[0]->state;
VARIANT2_INIT(0);
VARIANT2_SET_ROUNDING_MODE();
uint64_t al0 = h0[0] ^ h0[4];
uint64_t ah0 = h0[1] ^ h0[5];
__m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]);
__m128i bx1 = _mm_set_epi64x(h0[9] ^ h0[11], h0[8] ^ h0[10]);
uint64_t idx0 = al0;
for (size_t i = 0; __builtin_expect(i < 0x80000, 1); i++) {
__m128i cx = _mm_load_si128((__m128i *) &l0[idx0 & 0x1FFFF0]);
const __m128i ax0 = _mm_set_epi64x(ah0, al0);
cx = _mm_aesenc_si128(cx, ax0);
VARIANT2_SHUFFLE(l0, idx0 & 0x1FFFF0, ax0, bx0, bx1);
_mm_store_si128((__m128i *) &l0[idx0 & 0x1FFFF0], _mm_xor_si128(bx0, cx));
idx0 = _mm_cvtsi128_si64(cx);
uint64_t hi, lo, cl, ch;
cl = ((uint64_t*) &l0[idx0 & 0x1FFFF0])[0];
ch = ((uint64_t*) &l0[idx0 & 0x1FFFF0])[1];
VARIANT2_INTEGER_MATH(0, cl, cx);
lo = _umul128(idx0, cl, &hi);
VARIANT2_SHUFFLE2(l0, idx0 & 0x1FFFF0, ax0, bx0, bx1, hi, lo);
al0 += hi;
ah0 += lo;
((uint64_t*)&l0[idx0 & 0x1FFFF0])[0] = al0;
((uint64_t*)&l0[idx0 & 0x1FFFF0])[1] = ah0;
al0 ^= cl;
ah0 ^= ch;
idx0 = al0;
bx1 = bx0;
bx0 = cx;
}
cn_implode_scratchpad((__m128i*) ctx[0]->memory, (__m128i*) ctx[0]->state);
keccakf(h0, 24);
extra_hashes[ctx[0]->state[0] & 3](ctx[0]->state, 200, output);
}
#ifndef XMRIG_NO_ASM
extern void cnv2_mainloop_ivybridge_asm(struct cryptonight_ctx *ctx);
extern void cnv2_mainloop_ryzen_asm(struct cryptonight_ctx *ctx);
extern void cnv2_double_mainloop_sandybridge_asm(struct cryptonight_ctx* ctx0, struct cryptonight_ctx* ctx1);
void cryptonight_single_hash_asm_intel(const uint8_t *restrict input, size_t size, uint8_t *restrict output, struct cryptonight_ctx **restrict ctx)
{
keccak(input, size, ctx[0]->state, 200);
cn_explode_scratchpad((__m128i*) ctx[0]->state, (__m128i*) ctx[0]->memory);
cnv2_mainloop_ivybridge_asm(ctx[0]);
cn_implode_scratchpad((__m128i*) ctx[0]->memory, (__m128i*) ctx[0]->state);
keccakf((uint64_t*) ctx[0]->state, 24);
extra_hashes[ctx[0]->state[0] & 3](ctx[0]->state, 200, output);
}
void cryptonight_single_hash_asm_ryzen(const uint8_t *restrict input, size_t size, uint8_t *restrict output, struct cryptonight_ctx **restrict ctx)
{
keccak(input, size, ctx[0]->state, 200);
cn_explode_scratchpad((__m128i*) ctx[0]->state, (__m128i*) ctx[0]->memory);
cnv2_mainloop_ryzen_asm(ctx[0]);
cn_implode_scratchpad((__m128i*) ctx[0]->memory, (__m128i*) ctx[0]->state);
keccakf((uint64_t*) ctx[0]->state, 24);
extra_hashes[ctx[0]->state[0] & 3](ctx[0]->state, 200, output);
}
void cryptonight_double_hash_asm(const uint8_t *restrict input, size_t size, uint8_t *restrict output, struct cryptonight_ctx **restrict ctx)
{
keccak(input, size, ctx[0]->state, 200);
keccak(input + size, size, ctx[1]->state, 200);
cn_explode_scratchpad((__m128i*) ctx[0]->state, (__m128i*) ctx[0]->memory);
cn_explode_scratchpad((__m128i*) ctx[1]->state, (__m128i*) ctx[1]->memory);
cnv2_double_mainloop_sandybridge_asm(ctx[0], ctx[1]);
cn_implode_scratchpad((__m128i*) ctx[0]->memory, (__m128i*) ctx[0]->state);
cn_implode_scratchpad((__m128i*) ctx[1]->memory, (__m128i*) ctx[1]->state);
keccakf((uint64_t*) ctx[0]->state, 24);
keccakf((uint64_t*) ctx[1]->state, 24);
extra_hashes[ctx[0]->state[0] & 3](ctx[0]->state, 200, output);
extra_hashes[ctx[1]->state[0] & 3](ctx[1]->state, 200, output + 32);
}
#endif

84
algo/cryptonight/cryptonight_av1_aesni.c

@ -1,84 +0,0 @@
/* XMRig
* Copyright 2010 Jeff Garzik <jgarzik@pobox.com>
* Copyright 2012-2014 pooler <pooler@litecoinpool.org>
* Copyright 2014 Lucas Jones <https://github.com/lucasjones>
* Copyright 2014-2016 Wolf9466 <https://github.com/OhGodAPet>
* Copyright 2016 Jay D Dee <jayddee246@gmail.com>
* Copyright 2017 fireice-uk <https://github.com/fireice-uk>
* Copyright 2017-2018 XMR-Stak <https://github.com/fireice-uk>, <https://github.com/psychocrypt>
* Copyright 2018 Lee Clagett <https://github.com/vtnerd>
* Copyright 2016-2018 XMRig <https://github.com/xmrig>, <support@xmrig.com>
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#include <x86intrin.h>
#include <string.h>
#include "crypto/c_keccak.h"
#include "cryptonight.h"
#include "cryptonight_aesni.h"
#include "cryptonight_monero.h"
void cryptonight_av1_aesni(const void *restrict input, size_t size, void *restrict output, struct cryptonight_ctx *restrict ctx, uint8_t version)
{
keccak((const uint8_t *) input, size, ctx->state0, 200);
VARIANT1_INIT(0);
cn_explode_scratchpad((__m128i*) ctx->state0, (__m128i*) ctx->memory);
const uint8_t* l0 = ctx->memory;
uint64_t* h0 = (uint64_t*) ctx->state0;
uint64_t al0 = h0[0] ^ h0[4];
uint64_t ah0 = h0[1] ^ h0[5];
__m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]);
uint64_t idx0 = h0[0] ^ h0[4];
for (size_t i = 0; __builtin_expect(i < 0x80000, 1); i++) {
__m128i cx;
cx = _mm_load_si128((__m128i *) &l0[idx0 & 0x1FFFF0]);
cx = _mm_aesenc_si128(cx, _mm_set_epi64x(ah0, al0));
_mm_store_si128((__m128i *) &l0[idx0 & 0x1FFFF0], _mm_xor_si128(bx0, cx));
VARIANT1_1(&l0[idx0 & 0x1FFFF0]);
idx0 = EXTRACT64(cx);
bx0 = cx;
uint64_t hi, lo, cl, ch;
cl = ((uint64_t*) &l0[idx0 & 0x1FFFF0])[0];
ch = ((uint64_t*) &l0[idx0 & 0x1FFFF0])[1];
lo = _umul128(idx0, cl, &hi);
al0 += hi;
ah0 += lo;
VARIANT1_2(ah0, 0);
((uint64_t*)&l0[idx0 & 0x1FFFF0])[0] = al0;
((uint64_t*)&l0[idx0 & 0x1FFFF0])[1] = ah0;
VARIANT1_2(ah0, 0);
ah0 ^= ch;
al0 ^= cl;
idx0 = al0;
}
cn_implode_scratchpad((__m128i*) ctx->memory, (__m128i*) ctx->state0);
keccakf(h0, 24);
extra_hashes[ctx->state0[0] & 3](ctx->state0, 200, output);
}

304
algo/cryptonight/cryptonight_av2.c

@ -0,0 +1,304 @@
/* XMRig
* Copyright 2010 Jeff Garzik <jgarzik@pobox.com>
* Copyright 2012-2014 pooler <pooler@litecoinpool.org>
* Copyright 2014 Lucas Jones <https://github.com/lucasjones>
* Copyright 2014-2016 Wolf9466 <https://github.com/OhGodAPet>
* Copyright 2016 Jay D Dee <jayddee246@gmail.com>
* Copyright 2017 fireice-uk <https://github.com/fireice-uk>
* Copyright 2017-2018 XMR-Stak <https://github.com/fireice-uk>, <https://github.com/psychocrypt>
* Copyright 2018 Lee Clagett <https://github.com/vtnerd>
* Copyright 2016-2018 XMRig <https://github.com/xmrig>, <support@xmrig.com>
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#include <x86intrin.h>
#include <string.h>
#include "crypto/c_keccak.h"
#include "cryptonight.h"
#include "cryptonight_aesni.h"
#include "cryptonight_monero.h"
void cryptonight_av2_v0(const uint8_t *restrict input, size_t size, uint8_t *restrict output, struct cryptonight_ctx **restrict ctx)
{
keccak(input, size, ctx[0]->state, 200);
keccak(input + size, size, ctx[1]->state, 200);
const uint8_t* l0 = ctx[0]->memory;
const uint8_t* l1 = ctx[1]->memory;
uint64_t* h0 = (uint64_t*) ctx[0]->state;
uint64_t* h1 = (uint64_t*) ctx[1]->state;
cn_explode_scratchpad((__m128i*) h0, (__m128i*) l0);
cn_explode_scratchpad((__m128i*) h1, (__m128i*) l1);
uint64_t al0 = h0[0] ^ h0[4];
uint64_t al1 = h1[0] ^ h1[4];
uint64_t ah0 = h0[1] ^ h0[5];
uint64_t ah1 = h1[1] ^ h1[5];
__m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]);
__m128i bx1 = _mm_set_epi64x(h1[3] ^ h1[7], h1[2] ^ h1[6]);
uint64_t idx0 = h0[0] ^ h0[4];
uint64_t idx1 = h1[0] ^ h1[4];
for (size_t i = 0; __builtin_expect(i < 0x80000, 1); i++) {
__m128i cx0 = _mm_load_si128((__m128i *) &l0[idx0 & 0x1FFFF0]);
__m128i cx1 = _mm_load_si128((__m128i *) &l1[idx1 & 0x1FFFF0]);
cx0 = _mm_aesenc_si128(cx0, _mm_set_epi64x(ah0, al0));
cx1 = _mm_aesenc_si128(cx1, _mm_set_epi64x(ah1, al1));
_mm_store_si128((__m128i *) &l0[idx0 & 0x1FFFF0], _mm_xor_si128(bx0, cx0));
_mm_store_si128((__m128i *) &l1[idx1 & 0x1FFFF0], _mm_xor_si128(bx1, cx1));
idx0 = EXTRACT64(cx0);
idx1 = EXTRACT64(cx1);
bx0 = cx0;
bx1 = cx1;
uint64_t hi, lo, cl, ch;
cl = ((uint64_t*) &l0[idx0 & 0x1FFFF0])[0];
ch = ((uint64_t*) &l0[idx0 & 0x1FFFF0])[1];
lo = _umul128(idx0, cl, &hi);
al0 += hi;
ah0 += lo;
((uint64_t*) &l0[idx0 & 0x1FFFF0])[0] = al0;
((uint64_t*) &l0[idx0 & 0x1FFFF0])[1] = ah0;
ah0 ^= ch;
al0 ^= cl;
idx0 = al0;
cl = ((uint64_t*) &l1[idx1 & 0x1FFFF0])[0];
ch = ((uint64_t*) &l1[idx1 & 0x1FFFF0])[1];
lo = _umul128(idx1, cl, &hi);
al1 += hi;
ah1 += lo;
((uint64_t*) &l1[idx1 & 0x1FFFF0])[0] = al1;
((uint64_t*) &l1[idx1 & 0x1FFFF0])[1] = ah1;
ah1 ^= ch;
al1 ^= cl;
idx1 = al1;
}
cn_implode_scratchpad((__m128i*) l0, (__m128i*) h0);
cn_implode_scratchpad((__m128i*) l1, (__m128i*) h1);
keccakf(h0, 24);
keccakf(h1, 24);
extra_hashes[ctx[0]->state[0] & 3](ctx[0]->state, 200, output);
extra_hashes[ctx[1]->state[0] & 3](ctx[1]->state, 200, output + 32);
}
void cryptonight_av2_v1(const uint8_t *restrict input, size_t size, uint8_t *restrict output, struct cryptonight_ctx **restrict ctx)
{
if (size < 43) {
memset(output, 0, 64);
return;
}
keccak(input, size, ctx[0]->state, 200);
keccak(input + size, size, ctx[1]->state, 200);
VARIANT1_INIT(0);
VARIANT1_INIT(1);
const uint8_t* l0 = ctx[0]->memory;
const uint8_t* l1 = ctx[1]->memory;
uint64_t* h0 = (uint64_t*) ctx[0]->state;
uint64_t* h1 = (uint64_t*) ctx[1]->state;
cn_explode_scratchpad((__m128i*) h0, (__m128i*) l0);
cn_explode_scratchpad((__m128i*) h1, (__m128i*) l1);
uint64_t al0 = h0[0] ^ h0[4];
uint64_t al1 = h1[0] ^ h1[4];
uint64_t ah0 = h0[1] ^ h0[5];
uint64_t ah1 = h1[1] ^ h1[5];
__m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]);
__m128i bx1 = _mm_set_epi64x(h1[3] ^ h1[7], h1[2] ^ h1[6]);
uint64_t idx0 = h0[0] ^ h0[4];
uint64_t idx1 = h1[0] ^ h1[4];
for (size_t i = 0; __builtin_expect(i < 0x80000, 1); i++) {
__m128i cx0 = _mm_load_si128((__m128i *) &l0[idx0 & 0x1FFFF0]);
__m128i cx1 = _mm_load_si128((__m128i *) &l1[idx1 & 0x1FFFF0]);
cx0 = _mm_aesenc_si128(cx0, _mm_set_epi64x(ah0, al0));
cx1 = _mm_aesenc_si128(cx1, _mm_set_epi64x(ah1, al1));
cryptonight_monero_tweak((uint64_t*)&l0[idx0 & 0x1FFFF0], _mm_xor_si128(bx0, cx0));
cryptonight_monero_tweak((uint64_t*)&l1[idx1 & 0x1FFFF0], _mm_xor_si128(bx1, cx1));
idx0 = EXTRACT64(cx0);
idx1 = EXTRACT64(cx1);
bx0 = cx0;
bx1 = cx1;
uint64_t hi, lo, cl, ch;
cl = ((uint64_t*) &l0[idx0 & 0x1FFFF0])[0];
ch = ((uint64_t*) &l0[idx0 & 0x1FFFF0])[1];
lo = _umul128(idx0, cl, &hi);
al0 += hi;
ah0 += lo;
((uint64_t*) &l0[idx0 & 0x1FFFF0])[0] = al0;
((uint64_t*) &l0[idx0 & 0x1FFFF0])[1] = ah0 ^ tweak1_2_0;
ah0 ^= ch;
al0 ^= cl;
idx0 = al0;
cl = ((uint64_t*) &l1[idx1 & 0x1FFFF0])[0];
ch = ((uint64_t*) &l1[idx1 & 0x1FFFF0])[1];
lo = _umul128(idx1, cl, &hi);
al1 += hi;
ah1 += lo;
((uint64_t*) &l1[idx1 & 0x1FFFF0])[0] = al1;
((uint64_t*) &l1[idx1 & 0x1FFFF0])[1] = ah1 ^ tweak1_2_1;
ah1 ^= ch;
al1 ^= cl;
idx1 = al1;
}
cn_implode_scratchpad((__m128i*) l0, (__m128i*) h0);
cn_implode_scratchpad((__m128i*) l1, (__m128i*) h1);
keccakf(h0, 24);
keccakf(h1, 24);
extra_hashes[ctx[0]->state[0] & 3](ctx[0]->state, 200, output);
extra_hashes[ctx[1]->state[0] & 3](ctx[1]->state, 200, output + 32);
}
void cryptonight_av2_v2(const uint8_t *restrict input, size_t size, uint8_t *restrict output, struct cryptonight_ctx **restrict ctx)
{
keccak(input, size, ctx[0]->state, 200);
keccak(input + size, size, ctx[1]->state, 200);
const uint8_t* l0 = ctx[0]->memory;
const uint8_t* l1 = ctx[1]->memory;
uint64_t* h0 = (uint64_t*) ctx[0]->state;
uint64_t* h1 = (uint64_t*) ctx[1]->state;
VARIANT2_INIT(0);
VARIANT2_INIT(1);
VARIANT2_SET_ROUNDING_MODE();
cn_explode_scratchpad((__m128i*) h0, (__m128i*) l0);
cn_explode_scratchpad((__m128i*) h1, (__m128i*) l1);
uint64_t al0 = h0[0] ^ h0[4];
uint64_t al1 = h1[0] ^ h1[4];
uint64_t ah0 = h0[1] ^ h0[5];
uint64_t ah1 = h1[1] ^ h1[5];
__m128i bx00 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]);
__m128i bx01 = _mm_set_epi64x(h0[9] ^ h0[11], h0[8] ^ h0[10]);
__m128i bx10 = _mm_set_epi64x(h1[3] ^ h1[7], h1[2] ^ h1[6]);
__m128i bx11 = _mm_set_epi64x(h1[9] ^ h1[11], h1[8] ^ h1[10]);
uint64_t idx0 = al0;
uint64_t idx1 = al1;
for (size_t i = 0; __builtin_expect(i < 0x80000, 1); i++) {
__m128i cx0 = _mm_load_si128((__m128i *) &l0[idx0 & 0x1FFFF0]);
__m128i cx1 = _mm_load_si128((__m128i *) &l1[idx1 & 0x1FFFF0]);
const __m128i ax0 = _mm_set_epi64x(ah0, al0);
const __m128i ax1 = _mm_set_epi64x(ah1, al1);
cx0 = _mm_aesenc_si128(cx0, ax0);
cx1 = _mm_aesenc_si128(cx1, ax1);
VARIANT2_SHUFFLE(l0, idx0 & 0x1FFFF0, ax0, bx00, bx01);
_mm_store_si128((__m128i *) &l0[idx0 & 0x1FFFF0], _mm_xor_si128(bx00, cx0));
VARIANT2_SHUFFLE(l1, idx1 & 0x1FFFF0, ax1, bx10, bx11);
_mm_store_si128((__m128i *) &l1[idx1 & 0x1FFFF0], _mm_xor_si128(bx10, cx1));
idx0 = _mm_cvtsi128_si64(cx0);
idx1 = _mm_cvtsi128_si64(cx1);
uint64_t hi, lo, cl, ch;
cl = ((uint64_t*) &l0[idx0 & 0x1FFFF0])[0];
ch = ((uint64_t*) &l0[idx0 & 0x1FFFF0])[1];
VARIANT2_INTEGER_MATH(0, cl, cx0);
lo = _umul128(idx0, cl, &hi);
VARIANT2_SHUFFLE2(l0, idx0 & 0x1FFFF0, ax0, bx00, bx01, hi, lo);
al0 += hi;
ah0 += lo;
((uint64_t*)&l0[idx0 & 0x1FFFF0])[0] = al0;
((uint64_t*)&l0[idx0 & 0x1FFFF0])[1] = ah0;
al0 ^= cl;
ah0 ^= ch;
idx0 = al0;
cl = ((uint64_t*) &l1[idx1 & 0x1FFFF0])[0];
ch = ((uint64_t*) &l1[idx1 & 0x1FFFF0])[1];
VARIANT2_INTEGER_MATH(1, cl, cx1);
lo = _umul128(idx1, cl, &hi);
VARIANT2_SHUFFLE2(l1, idx1 & 0x1FFFF0, ax1, bx10, bx11, hi, lo);
al1 += hi;
ah1 += lo;
((uint64_t*)&l1[idx1 & 0x1FFFF0])[0] = al1;
((uint64_t*)&l1[idx1 & 0x1FFFF0])[1] = ah1;
al1 ^= cl;
ah1 ^= ch;
idx1 = al1;
bx01 = bx00;
bx11 = bx10;
bx00 = cx0;
bx10 = cx1;
}
cn_implode_scratchpad((__m128i*) l0, (__m128i*) h0);
cn_implode_scratchpad((__m128i*) l1, (__m128i*) h1);
keccakf(h0, 24);
keccakf(h1, 24);
extra_hashes[ctx[0]->state[0] & 3](ctx[0]->state, 200, output);
extra_hashes[ctx[1]->state[0] & 3](ctx[1]->state, 200, output + 32);
}

123
algo/cryptonight/cryptonight_av2_aesni_double.c

@ -1,123 +0,0 @@
/* XMRig
* Copyright 2010 Jeff Garzik <jgarzik@pobox.com>
* Copyright 2012-2014 pooler <pooler@litecoinpool.org>
* Copyright 2014 Lucas Jones <https://github.com/lucasjones>
* Copyright 2014-2016 Wolf9466 <https://github.com/OhGodAPet>
* Copyright 2016 Jay D Dee <jayddee246@gmail.com>
* Copyright 2017 fireice-uk <https://github.com/fireice-uk>
* Copyright 2017-2018 XMR-Stak <https://github.com/fireice-uk>, <https://github.com/psychocrypt>
* Copyright 2018 Lee Clagett <https://github.com/vtnerd>
* Copyright 2016-2018 XMRig <https://github.com/xmrig>, <support@xmrig.com>
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#include <x86intrin.h>
#include <string.h>
#include "crypto/c_keccak.h"
#include "cryptonight.h"
#include "cryptonight_aesni.h"
#include "cryptonight_monero.h"
void cryptonight_av2_aesni_double(const void *restrict input, size_t size, void *restrict output, struct cryptonight_ctx *restrict ctx, uint8_t version)
{
keccak((const uint8_t *) input, size, ctx->state0, 200);
keccak((const uint8_t *) input + size, size, ctx->state1, 200);
VARIANT1_INIT(0);
VARIANT1_INIT(1);
const uint8_t* l0 = ctx->memory;
const uint8_t* l1 = ctx->memory + MEMORY;
uint64_t* h0 = (uint64_t*) ctx->state0;
uint64_t* h1 = (uint64_t*) ctx->state1;
cn_explode_scratchpad((__m128i*) h0, (__m128i*) l0);
cn_explode_scratchpad((__m128i*) h1, (__m128i*) l1);
uint64_t al0 = h0[0] ^ h0[4];
uint64_t al1 = h1[0] ^ h1[4];
uint64_t ah0 = h0[1] ^ h0[5];
uint64_t ah1 = h1[1] ^ h1[5];
__m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]);
__m128i bx1 = _mm_set_epi64x(h1[3] ^ h1[7], h1[2] ^ h1[6]);
uint64_t idx0 = h0[0] ^ h0[4];
uint64_t idx1 = h1[0] ^ h1[4];
for (size_t i = 0; __builtin_expect(i < 0x80000, 1); i++) {
__m128i cx0 = _mm_load_si128((__m128i *) &l0[idx0 & 0x1FFFF0]);
__m128i cx1 = _mm_load_si128((__m128i *) &l1[idx1 & 0x1FFFF0]);
cx0 = _mm_aesenc_si128(cx0, _mm_set_epi64x(ah0, al0));
cx1 = _mm_aesenc_si128(cx1, _mm_set_epi64x(ah1, al1));
_mm_store_si128((__m128i *) &l0[idx0 & 0x1FFFF0], _mm_xor_si128(bx0, cx0));
_mm_store_si128((__m128i *) &l1[idx1 & 0x1FFFF0], _mm_xor_si128(bx1, cx1));
VARIANT1_1(&l0[idx0 & 0x1FFFF0]);
VARIANT1_1(&l1[idx1 & 0x1FFFF0]);
idx0 = EXTRACT64(cx0);
idx1 = EXTRACT64(cx1);
bx0 = cx0;
bx1 = cx1;
uint64_t hi, lo, cl, ch;
cl = ((uint64_t*) &l0[idx0 & 0x1FFFF0])[0];
ch = ((uint64_t*) &l0[idx0 & 0x1FFFF0])[1];
lo = _umul128(idx0, cl, &hi);
al0 += hi;
ah0 += lo;
VARIANT1_2(ah0, 0);
((uint64_t*) &l0[idx0 & 0x1FFFF0])[0] = al0;
((uint64_t*) &l0[idx0 & 0x1FFFF0])[1] = ah0;
VARIANT1_2(ah0, 0);
ah0 ^= ch;
al0 ^= cl;
idx0 = al0;
cl = ((uint64_t*) &l1[idx1 & 0x1FFFF0])[0];
ch = ((uint64_t*) &l1[idx1 & 0x1FFFF0])[1];
lo = _umul128(idx1, cl, &hi);
al1 += hi;
ah1 += lo;
VARIANT1_2(ah1, 1);
((uint64_t*) &l1[idx1 & 0x1FFFF0])[0] = al1;
((uint64_t*) &l1[idx1 & 0x1FFFF0])[1] = ah1;
VARIANT1_2(ah1, 1);
ah1 ^= ch;
al1 ^= cl;
idx1 = al1;
}
cn_implode_scratchpad((__m128i*) l0, (__m128i*) h0);
cn_implode_scratchpad((__m128i*) l1, (__m128i*) h1);
keccakf(h0, 24);
keccakf(h1, 24);
extra_hashes[ctx->state0[0] & 3](ctx->state0, 200, output);
extra_hashes[ctx->state1[0] & 3](ctx->state1, 200, (char*) output + 32);
}

193
algo/cryptonight/cryptonight_av3.c

@ -0,0 +1,193 @@
/* XMRig
* Copyright 2010 Jeff Garzik <jgarzik@pobox.com>
* Copyright 2012-2014 pooler <pooler@litecoinpool.org>
* Copyright 2014 Lucas Jones <https://github.com/lucasjones>
* Copyright 2014-2016 Wolf9466 <https://github.com/OhGodAPet>
* Copyright 2016 Jay D Dee <jayddee246@gmail.com>
* Copyright 2017 fireice-uk <https://github.com/fireice-uk>
* Copyright 2017-2018 XMR-Stak <https://github.com/fireice-uk>, <https://github.com/psychocrypt>
* Copyright 2018 Lee Clagett <https://github.com/vtnerd>
* Copyright 2016-2018 XMRig <https://github.com/xmrig>, <support@xmrig.com>
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#include <x86intrin.h>
#include <string.h>
#include "crypto/c_keccak.h"
#include "cryptonight.h"
#include "cryptonight_monero.h"
#include "cryptonight_softaes.h"
void cryptonight_av3_v0(const uint8_t *restrict input, size_t size, uint8_t *restrict output, struct cryptonight_ctx **restrict ctx)
{
keccak(input, size, ctx[0]->state, 200);
cn_explode_scratchpad((__m128i*) ctx[0]->state, (__m128i*) ctx[0]->memory);
const uint8_t* l0 = ctx[0]->memory;
uint64_t* h0 = (uint64_t*) ctx[0]->state;
uint64_t al0 = h0[0] ^ h0[4];
uint64_t ah0 = h0[1] ^ h0[5];
__m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]);
uint64_t idx0 = h0[0] ^ h0[4];
for (size_t i = 0; __builtin_expect(i < 0x80000, 1); i++) {
__m128i cx;
cx = _mm_load_si128((__m128i *) &l0[idx0 & 0x1FFFF0]);
cx = soft_aesenc(cx, _mm_set_epi64x(ah0, al0));
_mm_store_si128((__m128i *) &l0[idx0 & 0x1FFFF0], _mm_xor_si128(bx0, cx));
idx0 = EXTRACT64(cx);
bx0 = cx;
uint64_t hi, lo, cl, ch;
cl = ((uint64_t*) &l0[idx0 & 0x1FFFF0])[0];
ch = ((uint64_t*) &l0[idx0 & 0x1FFFF0])[1];
lo = _umul128(idx0, cl, &hi);
al0 += hi;
ah0 += lo;
((uint64_t*)&l0[idx0 & 0x1FFFF0])[0] = al0;
((uint64_t*)&l0[idx0 & 0x1FFFF0])[1] = ah0;
ah0 ^= ch;
al0 ^= cl;
idx0 = al0;
}
cn_implode_scratchpad((__m128i*) ctx[0]->memory, (__m128i*) ctx[0]->state);
keccakf(h0, 24);
extra_hashes[ctx[0]->state[0] & 3](ctx[0]->state, 200, output);
}
void cryptonight_av3_v1(const uint8_t *restrict input, size_t size, uint8_t *restrict output, struct cryptonight_ctx **restrict ctx)
{
if (size < 43) {
memset(output, 0, 32);
return;
}
keccak(input, size, ctx[0]->state, 200);
VARIANT1_INIT(0);
cn_explode_scratchpad((__m128i*) ctx[0]->state, (__m128i*) ctx[0]->memory);
const uint8_t* l0 = ctx[0]->memory;
uint64_t* h0 = (uint64_t*) ctx[0]->state;
uint64_t al0 = h0[0] ^ h0[4];
uint64_t ah0 = h0[1] ^ h0[5];
__m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]);
uint64_t idx0 = h0[0] ^ h0[4];
for (size_t i = 0; __builtin_expect(i < 0x80000, 1); i++) {
__m128i cx;
cx = _mm_load_si128((__m128i *) &l0[idx0 & 0x1FFFF0]);
cx = soft_aesenc(cx, _mm_set_epi64x(ah0, al0));
cryptonight_monero_tweak((uint64_t*)&l0[idx0 & 0x1FFFF0], _mm_xor_si128(bx0, cx));
idx0 = EXTRACT64(cx);
bx0 = cx;
uint64_t hi, lo, cl, ch;
cl = ((uint64_t*) &l0[idx0 & 0x1FFFF0])[0];
ch = ((uint64_t*) &l0[idx0 & 0x1FFFF0])[1];
lo = _umul128(idx0, cl, &hi);
al0 += hi;
ah0 += lo;
((uint64_t*)&l0[idx0 & 0x1FFFF0])[0] = al0;
((uint64_t*)&l0[idx0 & 0x1FFFF0])[1] = ah0 ^ tweak1_2_0;
ah0 ^= ch;
al0 ^= cl;
idx0 = al0;
}
cn_implode_scratchpad((__m128i*) ctx[0]->memory, (__m128i*) ctx[0]->state);
keccakf(h0, 24);
extra_hashes[ctx[0]->state[0] & 3](ctx[0]->state, 200, output);
}
void cryptonight_av3_v2(const uint8_t *restrict input, size_t size, uint8_t *restrict output, struct cryptonight_ctx **restrict ctx)
{
keccak(input, size, ctx[0]->state, 200);
cn_explode_scratchpad((__m128i*) ctx[0]->state, (__m128i*) ctx[0]->memory);
const uint8_t* l0 = ctx[0]->memory;
uint64_t* h0 = (uint64_t*) ctx[0]->state;
VARIANT2_INIT(0);
VARIANT2_SET_ROUNDING_MODE();
uint64_t al0 = h0[0] ^ h0[4];
uint64_t ah0 = h0[1] ^ h0[5];
__m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]);
__m128i bx1 = _mm_set_epi64x(h0[9] ^ h0[11], h0[8] ^ h0[10]);
uint64_t idx0 = al0;
for (size_t i = 0; __builtin_expect(i < 0x80000, 1); i++) {
__m128i cx = _mm_load_si128((__m128i *) &l0[idx0 & 0x1FFFF0]);
const __m128i ax0 = _mm_set_epi64x(ah0, al0);
cx = soft_aesenc(cx, ax0);
VARIANT2_SHUFFLE(l0, idx0 & 0x1FFFF0, ax0, bx0, bx1);
_mm_store_si128((__m128i *) &l0[idx0 & 0x1FFFF0], _mm_xor_si128(bx0, cx));
idx0 = _mm_cvtsi128_si64(cx);
uint64_t hi, lo, cl, ch;
cl = ((uint64_t*) &l0[idx0 & 0x1FFFF0])[0];
ch = ((uint64_t*) &l0[idx0 & 0x1FFFF0])[1];
VARIANT2_INTEGER_MATH(0, cl, cx);
lo = _umul128(idx0, cl, &hi);
VARIANT2_SHUFFLE2(l0, idx0 & 0x1FFFF0, ax0, bx0, bx1, hi, lo);
al0 += hi;
ah0 += lo;
((uint64_t*)&l0[idx0 & 0x1FFFF0])[0] = al0;
((uint64_t*)&l0[idx0 & 0x1FFFF0])[1] = ah0;
al0 ^= cl;
ah0 ^= ch;
idx0 = al0;
bx1 = bx0;
bx0 = cx;
}
cn_implode_scratchpad((__m128i*) ctx[0]->memory, (__m128i*) ctx[0]->state);
keccakf(h0, 24);
extra_hashes[ctx[0]->state[0] & 3](ctx[0]->state, 200, output);
}

84
algo/cryptonight/cryptonight_av3_softaes.c

@ -1,84 +0,0 @@
/* XMRig
* Copyright 2010 Jeff Garzik <jgarzik@pobox.com>
* Copyright 2012-2014 pooler <pooler@litecoinpool.org>
* Copyright 2014 Lucas Jones <https://github.com/lucasjones>
* Copyright 2014-2016 Wolf9466 <https://github.com/OhGodAPet>
* Copyright 2016 Jay D Dee <jayddee246@gmail.com>
* Copyright 2017 fireice-uk <https://github.com/fireice-uk>
* Copyright 2017-2018 XMR-Stak <https://github.com/fireice-uk>, <https://github.com/psychocrypt>
* Copyright 2018 Lee Clagett <https://github.com/vtnerd>
* Copyright 2016-2018 XMRig <https://github.com/xmrig>, <support@xmrig.com>
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#include <x86intrin.h>
#include <string.h>
#include "crypto/c_keccak.h"
#include "cryptonight.h"
#include "cryptonight_monero.h"
#include "cryptonight_softaes.h"
void cryptonight_av3_softaes(const void *restrict input, size_t size, void *restrict output, struct cryptonight_ctx *restrict ctx, uint8_t version)
{
keccak((const uint8_t *) input, size, ctx->state0, 200);
VARIANT1_INIT(0);
cn_explode_scratchpad((__m128i*) ctx->state0, (__m128i*) ctx->memory);
const uint8_t* l0 = ctx->memory;
uint64_t* h0 = (uint64_t*) ctx->state0;
uint64_t al0 = h0[0] ^ h0[4];
uint64_t ah0 = h0[1] ^ h0[5];
__m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]);
uint64_t idx0 = h0[0] ^ h0[4];
for (size_t i = 0; __builtin_expect(i < 0x80000, 1); i++) {
__m128i cx;
cx = _mm_load_si128((__m128i *)&l0[idx0 & 0x1FFFF0]);
cx = soft_aesenc(cx, _mm_set_epi64x(ah0, al0));
_mm_store_si128((__m128i *)&l0[idx0 & 0x1FFFF0], _mm_xor_si128(bx0, cx));
VARIANT1_1(&l0[idx0 & 0x1FFFF0]);
idx0 = EXTRACT64(cx);
bx0 = cx;
uint64_t hi, lo, cl, ch;
cl = ((uint64_t*)&l0[idx0 & 0x1FFFF0])[0];
ch = ((uint64_t*)&l0[idx0 & 0x1FFFF0])[1];
lo = _umul128(idx0, cl, &hi);
al0 += hi;
ah0 += lo;
VARIANT1_2(ah0, 0);
((uint64_t*)&l0[idx0 & 0x1FFFF0])[0] = al0;
((uint64_t*)&l0[idx0 & 0x1FFFF0])[1] = ah0;
VARIANT1_2(ah0, 0);
ah0 ^= ch;
al0 ^= cl;
idx0 = al0;
}
cn_implode_scratchpad((__m128i*) ctx->memory, (__m128i*) ctx->state0);
keccakf(h0, 24);
extra_hashes[ctx->state0[0] & 3](ctx->state0, 200, output);
}

304
algo/cryptonight/cryptonight_av4.c

@ -0,0 +1,304 @@
/* XMRig
* Copyright 2010 Jeff Garzik <jgarzik@pobox.com>
* Copyright 2012-2014 pooler <pooler@litecoinpool.org>
* Copyright 2014 Lucas Jones <https://github.com/lucasjones>
* Copyright 2014-2016 Wolf9466 <https://github.com/OhGodAPet>
* Copyright 2016 Jay D Dee <jayddee246@gmail.com>
* Copyright 2017 fireice-uk <https://github.com/fireice-uk>
* Copyright 2017-2018 XMR-Stak <https://github.com/fireice-uk>, <https://github.com/psychocrypt>
* Copyright 2018 Lee Clagett <https://github.com/vtnerd>
* Copyright 2016-2018 XMRig <https://github.com/xmrig>, <support@xmrig.com>
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#include <x86intrin.h>
#include <string.h>
#include "crypto/c_keccak.h"
#include "cryptonight.h"
#include "cryptonight_monero.h"
#include "cryptonight_softaes.h"
void cryptonight_av4_v0(const uint8_t *restrict input, size_t size, uint8_t *restrict output, struct cryptonight_ctx **restrict ctx)
{
keccak(input, size, ctx[0]->state, 200);
keccak(input + size, size, ctx[1]->state, 200);
const uint8_t* l0 = ctx[0]->memory;
const uint8_t* l1 = ctx[1]->memory;
uint64_t* h0 = (uint64_t*) ctx[0]->state;
uint64_t* h1 = (uint64_t*) ctx[1]->state;
cn_explode_scratchpad((__m128i*) h0, (__m128i*) l0);
cn_explode_scratchpad((__m128i*) h1, (__m128i*) l1);
uint64_t al0 = h0[0] ^ h0[4];
uint64_t al1 = h1[0] ^ h1[4];
uint64_t ah0 = h0[1] ^ h0[5];
uint64_t ah1 = h1[1] ^ h1[5];
__m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]);
__m128i bx1 = _mm_set_epi64x(h1[3] ^ h1[7], h1[2] ^ h1[6]);
uint64_t idx0 = h0[0] ^ h0[4];
uint64_t idx1 = h1[0] ^ h1[4];
for (size_t i = 0; __builtin_expect(i < 0x80000, 1); i++) {
__m128i cx0 = _mm_load_si128((__m128i *) &l0[idx0 & 0x1FFFF0]);
__m128i cx1 = _mm_load_si128((__m128i *) &l1[idx1 & 0x1FFFF0]);
cx0 = soft_aesenc(cx0, _mm_set_epi64x(ah0, al0));
cx1 = soft_aesenc(cx1, _mm_set_epi64x(ah1, al1));
_mm_store_si128((__m128i *) &l0[idx0 & 0x1FFFF0], _mm_xor_si128(bx0, cx0));
_mm_store_si128((__m128i *) &l1[idx1 & 0x1FFFF0], _mm_xor_si128(bx1, cx1));
idx0 = EXTRACT64(cx0);
idx1 = EXTRACT64(cx1);
bx0 = cx0;
bx1 = cx1;
uint64_t hi, lo, cl, ch;
cl = ((uint64_t*) &l0[idx0 & 0x1FFFF0])[0];
ch = ((uint64_t*) &l0[idx0 & 0x1FFFF0])[1];
lo = _umul128(idx0, cl, &hi);
al0 += hi;
ah0 += lo;
((uint64_t*) &l0[idx0 & 0x1FFFF0])[0] = al0;
((uint64_t*) &l0[idx0 & 0x1FFFF0])[1] = ah0;
ah0 ^= ch;
al0 ^= cl;
idx0 = al0;
cl = ((uint64_t*) &l1[idx1 & 0x1FFFF0])[0];
ch = ((uint64_t*) &l1[idx1 & 0x1FFFF0])[1];
lo = _umul128(idx1, cl, &hi);
al1 += hi;
ah1 += lo;
((uint64_t*) &l1[idx1 & 0x1FFFF0])[0] = al1;
((uint64_t*) &l1[idx1 & 0x1FFFF0])[1] = ah1;
ah1 ^= ch;
al1 ^= cl;
idx1 = al1;
}
cn_implode_scratchpad((__m128i*) l0, (__m128i*) h0);
cn_implode_scratchpad((__m128i*) l1, (__m128i*) h1);
keccakf(h0, 24);
keccakf(h1, 24);
extra_hashes[ctx[0]->state[0] & 3](ctx[0]->state, 200, output);
extra_hashes[ctx[1]->state[0] & 3](ctx[1]->state, 200, output + 32);
}
void cryptonight_av4_v1(const uint8_t *restrict input, size_t size, uint8_t *restrict output, struct cryptonight_ctx **restrict ctx)
{
if (size < 43) {
memset(output, 0, 64);
return;
}
keccak(input, size, ctx[0]->state, 200);
keccak(input + size, size, ctx[1]->state, 200);
VARIANT1_INIT(0);
VARIANT1_INIT(1);
const uint8_t* l0 = ctx[0]->memory;
const uint8_t* l1 = ctx[1]->memory;
uint64_t* h0 = (uint64_t*) ctx[0]->state;
uint64_t* h1 = (uint64_t*) ctx[1]->state;
cn_explode_scratchpad((__m128i*) h0, (__m128i*) l0);
cn_explode_scratchpad((__m128i*) h1, (__m128i*) l1);
uint64_t al0 = h0[0] ^ h0[4];
uint64_t al1 = h1[0] ^ h1[4];
uint64_t ah0 = h0[1] ^ h0[5];
uint64_t ah1 = h1[1] ^ h1[5];
__m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]);
__m128i bx1 = _mm_set_epi64x(h1[3] ^ h1[7], h1[2] ^ h1[6]);
uint64_t idx0 = h0[0] ^ h0[4];
uint64_t idx1 = h1[0] ^ h1[4];
for (size_t i = 0; __builtin_expect(i < 0x80000, 1); i++) {
__m128i cx0 = _mm_load_si128((__m128i *) &l0[idx0 & 0x1FFFF0]);
__m128i cx1 = _mm_load_si128((__m128i *) &l1[idx1 & 0x1FFFF0]);
cx0 = soft_aesenc(cx0, _mm_set_epi64x(ah0, al0));
cx1 = soft_aesenc(cx1, _mm_set_epi64x(ah1, al1));
cryptonight_monero_tweak((uint64_t*)&l0[idx0 & 0x1FFFF0], _mm_xor_si128(bx0, cx0));
cryptonight_monero_tweak((uint64_t*)&l1[idx1 & 0x1FFFF0], _mm_xor_si128(bx1, cx1));
idx0 = EXTRACT64(cx0);
idx1 = EXTRACT64(cx1);
bx0 = cx0;
bx1 = cx1;
uint64_t hi, lo, cl, ch;
cl = ((uint64_t*) &l0[idx0 & 0x1FFFF0])[0];
ch = ((uint64_t*) &l0[idx0 & 0x1FFFF0])[1];
lo = _umul128(idx0, cl, &hi);
al0 += hi;
ah0 += lo;
((uint64_t*) &l0[idx0 & 0x1FFFF0])[0] = al0;
((uint64_t*) &l0[idx0 & 0x1FFFF0])[1] = ah0 ^ tweak1_2_0;
ah0 ^= ch;
al0 ^= cl;
idx0 = al0;
cl = ((uint64_t*) &l1[idx1 & 0x1FFFF0])[0];
ch = ((uint64_t*) &l1[idx1 & 0x1FFFF0])[1];
lo = _umul128(idx1, cl, &hi);
al1 += hi;
ah1 += lo;
((uint64_t*) &l1[idx1 & 0x1FFFF0])[0] = al1;
((uint64_t*) &l1[idx1 & 0x1FFFF0])[1] = ah1 ^ tweak1_2_1;
ah1 ^= ch;
al1 ^= cl;
idx1 = al1;
}
cn_implode_scratchpad((__m128i*) l0, (__m128i*) h0);
cn_implode_scratchpad((__m128i*) l1, (__m128i*) h1);
keccakf(h0, 24);
keccakf(h1, 24);
extra_hashes[ctx[0]->state[0] & 3](ctx[0]->state, 200, output);
extra_hashes[ctx[1]->state[0] & 3](ctx[1]->state, 200, output + 32);
}
void cryptonight_av4_v2(const uint8_t *restrict input, size_t size, uint8_t *restrict output, struct cryptonight_ctx **restrict ctx)
{
keccak(input, size, ctx[0]->state, 200);
keccak(input + size, size, ctx[1]->state, 200);
const uint8_t* l0 = ctx[0]->memory;
const uint8_t* l1 = ctx[1]->memory;
uint64_t* h0 = (uint64_t*) ctx[0]->state;
uint64_t* h1 = (uint64_t*) ctx[1]->state;
VARIANT2_INIT(0);
VARIANT2_INIT(1);
VARIANT2_SET_ROUNDING_MODE();
cn_explode_scratchpad((__m128i*) h0, (__m128i*) l0);
cn_explode_scratchpad((__m128i*) h1, (__m128i*) l1);
uint64_t al0 = h0[0] ^ h0[4];
uint64_t al1 = h1[0] ^ h1[4];
uint64_t ah0 = h0[1] ^ h0[5];
uint64_t ah1 = h1[1] ^ h1[5];
__m128i bx00 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]);
__m128i bx01 = _mm_set_epi64x(h0[9] ^ h0[11], h0[8] ^ h0[10]);
__m128i bx10 = _mm_set_epi64x(h1[3] ^ h1[7], h1[2] ^ h1[6]);
__m128i bx11 = _mm_set_epi64x(h1[9] ^ h1[11], h1[8] ^ h1[10]);
uint64_t idx0 = al0;
uint64_t idx1 = al1;
for (size_t i = 0; __builtin_expect(i < 0x80000, 1); i++) {
__m128i cx0 = _mm_load_si128((__m128i *) &l0[idx0 & 0x1FFFF0]);
__m128i cx1 = _mm_load_si128((__m128i *) &l1[idx1 & 0x1FFFF0]);
const __m128i ax0 = _mm_set_epi64x(ah0, al0);
const __m128i ax1 = _mm_set_epi64x(ah1, al1);
cx0 = soft_aesenc(cx0, ax0);
cx1 = soft_aesenc(cx1, ax1);
VARIANT2_SHUFFLE(l0, idx0 & 0x1FFFF0, ax0, bx00, bx01);
_mm_store_si128((__m128i *) &l0[idx0 & 0x1FFFF0], _mm_xor_si128(bx00, cx0));
VARIANT2_SHUFFLE(l1, idx1 & 0x1FFFF0, ax1, bx10, bx11);
_mm_store_si128((__m128i *) &l1[idx1 & 0x1FFFF0], _mm_xor_si128(bx10, cx1));
idx0 = _mm_cvtsi128_si64(cx0);
idx1 = _mm_cvtsi128_si64(cx1);
uint64_t hi, lo, cl, ch;
cl = ((uint64_t*) &l0[idx0 & 0x1FFFF0])[0];
ch = ((uint64_t*) &l0[idx0 & 0x1FFFF0])[1];
VARIANT2_INTEGER_MATH(0, cl, cx0);
lo = _umul128(idx0, cl, &hi);
VARIANT2_SHUFFLE2(l0, idx0 & 0x1FFFF0, ax0, bx00, bx01, hi, lo);
al0 += hi;
ah0 += lo;
((uint64_t*)&l0[idx0 & 0x1FFFF0])[0] = al0;
((uint64_t*)&l0[idx0 & 0x1FFFF0])[1] = ah0;
al0 ^= cl;
ah0 ^= ch;
idx0 = al0;
cl = ((uint64_t*) &l1[idx1 & 0x1FFFF0])[0];
ch = ((uint64_t*) &l1[idx1 & 0x1FFFF0])[1];
VARIANT2_INTEGER_MATH(1, cl, cx1);
lo = _umul128(idx1, cl, &hi);
VARIANT2_SHUFFLE2(l1, idx1 & 0x1FFFF0, ax1, bx10, bx11, hi, lo);
al1 += hi;
ah1 += lo;
((uint64_t*)&l1[idx1 & 0x1FFFF0])[0] = al1;
((uint64_t*)&l1[idx1 & 0x1FFFF0])[1] = ah1;
al1 ^= cl;
ah1 ^= ch;
idx1 = al1;
bx01 = bx00;
bx11 = bx10;
bx00 = cx0;
bx10 = cx1;
}
cn_implode_scratchpad((__m128i*) l0, (__m128i*) h0);
cn_implode_scratchpad((__m128i*) l1, (__m128i*) h1);
keccakf(h0, 24);
keccakf(h1, 24);
extra_hashes[ctx[0]->state[0] & 3](ctx[0]->state, 200, output);
extra_hashes[ctx[1]->state[0] & 3](ctx[1]->state, 200, output + 32);
}

123
algo/cryptonight/cryptonight_av4_softaes_double.c

@ -1,123 +0,0 @@
/* XMRig
* Copyright 2010 Jeff Garzik <jgarzik@pobox.com>
* Copyright 2012-2014 pooler <pooler@litecoinpool.org>
* Copyright 2014 Lucas Jones <https://github.com/lucasjones>
* Copyright 2014-2016 Wolf9466 <https://github.com/OhGodAPet>
* Copyright 2016 Jay D Dee <jayddee246@gmail.com>
* Copyright 2017 fireice-uk <https://github.com/fireice-uk>
* Copyright 2017-2018 XMR-Stak <https://github.com/fireice-uk>, <https://github.com/psychocrypt>
* Copyright 2018 Lee Clagett <https://github.com/vtnerd>
* Copyright 2016-2018 XMRig <https://github.com/xmrig>, <support@xmrig.com>
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#include <x86intrin.h>
#include <string.h>
#include "crypto/c_keccak.h"
#include "cryptonight.h"
#include "cryptonight_monero.h"
#include "cryptonight_softaes.h"
void cryptonight_av4_softaes_double(const void *restrict input, size_t size, void *restrict output, struct cryptonight_ctx *restrict ctx, uint8_t version)
{
keccak((const uint8_t *) input, size, ctx->state0, 200);
keccak((const uint8_t *) input + size, size, ctx->state1, 200);
VARIANT1_INIT(0);
VARIANT1_INIT(1);
const uint8_t* l0 = ctx->memory;
const uint8_t* l1 = ctx->memory + MEMORY;
uint64_t* h0 = (uint64_t*) ctx->state0;
uint64_t* h1 = (uint64_t*) ctx->state1;
cn_explode_scratchpad((__m128i*) h0, (__m128i*) l0);
cn_explode_scratchpad((__m128i*) h1, (__m128i*) l1);
uint64_t al0 = h0[0] ^ h0[4];
uint64_t al1 = h1[0] ^ h1[4];
uint64_t ah0 = h0[1] ^ h0[5];
uint64_t ah1 = h1[1] ^ h1[5];
__m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]);
__m128i bx1 = _mm_set_epi64x(h1[3] ^ h1[7], h1[2] ^ h1[6]);
uint64_t idx0 = h0[0] ^ h0[4];
uint64_t idx1 = h1[0] ^ h1[4];
for (size_t i = 0; __builtin_expect(i < 0x80000, 1); i++) {
__m128i cx0 = _mm_load_si128((__m128i *) &l0[idx0 & 0x1FFFF0]);
__m128i cx1 = _mm_load_si128((__m128i *) &l1[idx1 & 0x1FFFF0]);
cx0 = soft_aesenc(cx0, _mm_set_epi64x(ah0, al0));
cx1 = soft_aesenc(cx1, _mm_set_epi64x(ah1, al1));
_mm_store_si128((__m128i *) &l0[idx0 & 0x1FFFF0], _mm_xor_si128(bx0, cx0));
_mm_store_si128((__m128i *) &l1[idx1 & 0x1FFFF0], _mm_xor_si128(bx1, cx1));
VARIANT1_1(&l0[idx0 & 0x1FFFF0]);
VARIANT1_1(&l1[idx1 & 0x1FFFF0]);
idx0 = EXTRACT64(cx0);
idx1 = EXTRACT64(cx1);
bx0 = cx0;
bx1 = cx1;
uint64_t hi, lo, cl, ch;
cl = ((uint64_t*) &l0[idx0 & 0x1FFFF0])[0];
ch = ((uint64_t*) &l0[idx0 & 0x1FFFF0])[1];
lo = _umul128(idx0, cl, &hi);
al0 += hi;
ah0 += lo;
VARIANT1_2(ah0, 0);
((uint64_t*) &l0[idx0 & 0x1FFFF0])[0] = al0;
((uint64_t*) &l0[idx0 & 0x1FFFF0])[1] = ah0;
VARIANT1_2(ah0, 0);
ah0 ^= ch;
al0 ^= cl;
idx0 = al0;
cl = ((uint64_t*) &l1[idx1 & 0x1FFFF0])[0];
ch = ((uint64_t*) &l1[idx1 & 0x1FFFF0])[1];
lo = _umul128(idx1, cl, &hi);
al1 += hi;
ah1 += lo;
VARIANT1_2(ah1, 1);
((uint64_t*) &l1[idx1 & 0x1FFFF0])[0] = al1;
((uint64_t*) &l1[idx1 & 0x1FFFF0])[1] = ah1;
VARIANT1_2(ah1, 1);
ah1 ^= ch;
al1 ^= cl;
idx1 = al1;
}
cn_implode_scratchpad((__m128i*) l0, (__m128i*) h0);
cn_implode_scratchpad((__m128i*) l1, (__m128i*) h1);
keccakf(h0, 24);
keccakf(h1, 24);
extra_hashes[ctx->state0[0] & 3](ctx->state0, 200, output);
extra_hashes[ctx->state1[0] & 3](ctx->state1, 200, (char*) output + 32);
}

89
algo/cryptonight/cryptonight_monero.h

@ -6,6 +6,7 @@
* Copyright 2016 Jay D Dee <jayddee246@gmail.com>
* Copyright 2017-2018 XMR-Stak <https://github.com/fireice-uk>, <https://github.com/psychocrypt>
* Copyright 2018 Lee Clagett <https://github.com/vtnerd>
* Copyright 2018 SChernykh <https://github.com/SChernykh>
* Copyright 2016-2018 XMRig <https://github.com/xmrig>, <support@xmrig.com>
*
* This program is free software: you can redistribute it and/or modify
@ -22,30 +23,80 @@
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef __CRYPTONIGHT_MONERO_H__
#define __CRYPTONIGHT_MONERO_H__
#ifndef XMRIG_CRYPTONIGHT_MONERO_H
#define XMRIG_CRYPTONIGHT_MONERO_H
// VARIANT ALTERATIONS
#define VARIANT1_INIT(part) \
uint64_t tweak1_2_##part = 0; \
if (version > 6) { \
tweak1_2_##part = (*(const uint64_t*)(((const uint8_t*) input) + 35 + part * size) ^ \
*((const uint64_t*)(ctx->state##part) + 24)); \
}
#include <fenv.h>
#include <math.h>
static inline __m128i int_sqrt_v2(const uint64_t n0)
{
__m128d x = _mm_castsi128_pd(_mm_add_epi64(_mm_cvtsi64_si128(n0 >> 12), _mm_set_epi64x(0, 1023ULL << 52)));
x = _mm_sqrt_sd(_mm_setzero_pd(), x);
uint64_t r = (uint64_t)(_mm_cvtsi128_si64(_mm_castpd_si128(x)));
const uint64_t s = r >> 20;
r >>= 19;
uint64_t x2 = (s - (1022ULL << 32)) * (r - s - (1022ULL << 32) + 1);
# if (defined(_MSC_VER) || __GNUC__ > 7 || (__GNUC__ == 7 && __GNUC_MINOR__ > 1)) && (defined(__x86_64__) || defined(_M_AMD64))
_addcarry_u64(_subborrow_u64(0, x2, n0, (unsigned long long int*)&x2), r, 0, (unsigned long long int*)&r);
# else
if (x2 < n0) ++r;
# endif
return _mm_cvtsi64_si128(r);
}
#define VARIANT1_1(p) \
if (version > 6) { \
const uint8_t tmp = ((const uint8_t*)(p))[11]; \
static const uint32_t table = 0x75310; \
const uint8_t index = (((tmp >> 3) & 6) | (tmp & 1)) << 1; \
((uint8_t*)(p))[11] = tmp ^ ((table >> index) & 0x30); \
# define VARIANT1_INIT(part) \
uint64_t tweak1_2_##part = (*(const uint64_t*)(input + 35 + part * size) ^ \
*((const uint64_t*)(ctx[part]->state) + 24)); \
# define VARIANT2_INIT(part) \
__m128i division_result_xmm_##part = _mm_cvtsi64_si128(h##part[12]); \
__m128i sqrt_result_xmm_##part = _mm_cvtsi64_si128(h##part[13]);
#ifdef _MSC_VER
# define VARIANT2_SET_ROUNDING_MODE() { _control87(RC_DOWN, MCW_RC); }
#else
# define VARIANT2_SET_ROUNDING_MODE() { fesetround(FE_DOWNWARD); }
#endif
# define VARIANT2_INTEGER_MATH(part, cl, cx) \
{ \
const uint64_t sqrt_result = (uint64_t)(_mm_cvtsi128_si64(sqrt_result_xmm_##part)); \
const uint64_t cx_0 = _mm_cvtsi128_si64(cx); \
cl ^= (uint64_t)(_mm_cvtsi128_si64(division_result_xmm_##part)) ^ (sqrt_result << 32); \
const uint32_t d = (uint32_t)(cx_0 + (sqrt_result << 1)) | 0x80000001UL; \
const uint64_t cx_1 = _mm_cvtsi128_si64(_mm_srli_si128(cx, 8)); \
const uint64_t division_result = (uint32_t)(cx_1 / d) + ((cx_1 % d) << 32); \
division_result_xmm_##part = _mm_cvtsi64_si128((int64_t)(division_result)); \
sqrt_result_xmm_##part = int_sqrt_v2(cx_0 + division_result); \
}
#define VARIANT1_2(p, part) \
if (version > 6) { \
(p) ^= tweak1_2_##part; \
# define VARIANT2_SHUFFLE(base_ptr, offset, _a, _b, _b1) \
{ \
const __m128i chunk1 = _mm_load_si128((__m128i *)((base_ptr) + ((offset) ^ 0x10))); \
const __m128i chunk2 = _mm_load_si128((__m128i *)((base_ptr) + ((offset) ^ 0x20))); \
const __m128i chunk3 = _mm_load_si128((__m128i *)((base_ptr) + ((offset) ^ 0x30))); \
_mm_store_si128((__m128i *)((base_ptr) + ((offset) ^ 0x10)), _mm_add_epi64(chunk3, _b1)); \
_mm_store_si128((__m128i *)((base_ptr) + ((offset) ^ 0x20)), _mm_add_epi64(chunk1, _b)); \
_mm_store_si128((__m128i *)((base_ptr) + ((offset) ^ 0x30)), _mm_add_epi64(chunk2, _a)); \
}
# define VARIANT2_SHUFFLE2(base_ptr, offset, _a, _b, _b1, hi, lo) \
{ \
const __m128i chunk1 = _mm_xor_si128(_mm_load_si128((__m128i *)((base_ptr) + ((offset) ^ 0x10))), _mm_set_epi64x(lo, hi)); \
const __m128i chunk2 = _mm_load_si128((__m128i *)((base_ptr) + ((offset) ^ 0x20))); \
hi ^= ((uint64_t*)((base_ptr) + ((offset) ^ 0x20)))[0]; \
lo ^= ((uint64_t*)((base_ptr) + ((offset) ^ 0x20)))[1]; \
const __m128i chunk3 = _mm_load_si128((__m128i *)((base_ptr) + ((offset) ^ 0x30))); \
_mm_store_si128((__m128i *)((base_ptr) + ((offset) ^ 0x10)), _mm_add_epi64(chunk3, _b1)); \
_mm_store_si128((__m128i *)((base_ptr) + ((offset) ^ 0x20)), _mm_add_epi64(chunk1, _b)); \
_mm_store_si128((__m128i *)((base_ptr) + ((offset) ^ 0x30)), _mm_add_epi64(chunk2, _a)); \
}
#endif /* __CRYPTONIGHT_MONERO_H__ */
#endif /* XMRIG_CRYPTONIGHT_MONERO_H */

25
algo/cryptonight/cryptonight_softaes.h

@ -22,10 +22,13 @@
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef __CRYPTONIGHT_SOFTAES_H__
#define __CRYPTONIGHT_SOFTAES_H__
#ifndef XMRIG_CRYPTONIGHT_SOFTAES_H
#define XMRIG_CRYPTONIGHT_SOFTAES_H
#include <x86intrin.h>
#include <stdint.h>
extern __m128i soft_aesenc(__m128i in, __m128i key);
extern __m128i soft_aeskeygenassist(__m128i key, uint8_t rcon);
@ -234,4 +237,20 @@ inline uint64_t _umul128(uint64_t multiplier, uint64_t multiplicand, uint64_t *p
#endif
#endif /* __CRYPTONIGHT_SOFTAES_H__ */
static inline void cryptonight_monero_tweak(uint64_t* mem_out, __m128i tmp)
{
mem_out[0] = EXTRACT64(tmp);
tmp = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
uint64_t vh = EXTRACT64(tmp);
uint8_t x = vh >> 24;
static const uint16_t table = 0x7531;
const uint8_t index = (((x >> 3) & 6) | (x & 1)) << 1;
vh ^= ((table >> index) & 0x3) << 28;
mem_out[1] = vh;
}
#endif /* XMRIG_CRYPTONIGHT_SOFTAES_H */

66
algo/cryptonight/cryptonight_test.h

@ -6,6 +6,7 @@
* Copyright 2016 Jay D Dee <jayddee246@gmail.com>
* Copyright 2017-2018 XMR-Stak <https://github.com/fireice-uk>, <https://github.com/psychocrypt>
* Copyright 2018 Lee Clagett <https://github.com/vtnerd>
* Copyright 2018 SChernykh <https://github.com/SChernykh>
* Copyright 2016-2018 XMRig <https://github.com/xmrig>, <support@xmrig.com>
*
* This program is free software: you can redistribute it and/or modify
@ -22,49 +23,68 @@
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef __CRYPTONIGHT_TEST_H__
#define __CRYPTONIGHT_TEST_H__
#ifndef XMRIG_CRYPTONIGHT_TEST_H
#define XMRIG_CRYPTONIGHT_TEST_H
const static uint8_t test_input[152] = {
0x01, 0x00, 0xFB, 0x8E, 0x8A, 0xC8, 0x05, 0x89, 0x93, 0x23, 0x37, 0x1B, 0xB7, 0x90, 0xDB, 0x19,
0x21, 0x8A, 0xFD, 0x8D, 0xB8, 0xE3, 0x75, 0x5D, 0x8B, 0x90, 0xF3, 0x9B, 0x3D, 0x55, 0x06, 0xA9,
0xAB, 0xCE, 0x4F, 0xA9, 0x12, 0x24, 0x45, 0x00, 0x00, 0x00, 0x00, 0xEE, 0x81, 0x46, 0xD4, 0x9F,
0xA9, 0x3E, 0xE7, 0x24, 0xDE, 0xB5, 0x7D, 0x12, 0xCB, 0xC6, 0xC6, 0xF3, 0xB9, 0x24, 0xD9, 0x46,
0x12, 0x7C, 0x7A, 0x97, 0x41, 0x8F, 0x93, 0x48, 0x82, 0x8F, 0x0F, 0x02,
0x03, 0x05, 0xA0, 0xDB, 0xD6, 0xBF, 0x05, 0xCF, 0x16, 0xE5, 0x03, 0xF3, 0xA6, 0x6F, 0x78, 0x00,
0x7C, 0xBF, 0x34, 0x14, 0x43, 0x32, 0xEC, 0xBF, 0xC2, 0x2E, 0xD9, 0x5C, 0x87, 0x00, 0x38, 0x3B,
0x30, 0x9A, 0xCE, 0x19, 0x23, 0xA0, 0x96, 0x4B, 0x00, 0x00, 0x00, 0x08, 0xBA, 0x93, 0x9A, 0x62,
0x72, 0x4C, 0x0D, 0x75, 0x81, 0xFC, 0xE5, 0x76, 0x1E, 0x9D, 0x8A, 0x0E, 0x6A, 0x1C, 0x3F, 0x92,
0x4F, 0xDD, 0x84, 0x93, 0xD1, 0x11, 0x56, 0x49, 0xC0, 0x5E, 0xB6, 0x01
0x4F, 0xDD, 0x84, 0x93, 0xD1, 0x11, 0x56, 0x49, 0xC0, 0x5E, 0xB6, 0x01,
0x01, 0x00, 0xFB, 0x8E, 0x8A, 0xC8, 0x05, 0x89, 0x93, 0x23, 0x37, 0x1B, 0xB7, 0x90, 0xDB, 0x19,
0x21, 0x8A, 0xFD, 0x8D, 0xB8, 0xE3, 0x75, 0x5D, 0x8B, 0x90, 0xF3, 0x9B, 0x3D, 0x55, 0x06, 0xA9,
0xAB, 0xCE, 0x4F, 0xA9, 0x12, 0x24, 0x45, 0x00, 0x00, 0x00, 0x00, 0xEE, 0x81, 0x46, 0xD4, 0x9F,
0xA9, 0x3E, 0xE7, 0x24, 0xDE, 0xB5, 0x7D, 0x12, 0xCB, 0xC6, 0xC6, 0xF3, 0xB9, 0x24, 0xD9, 0x46,
0x12, 0x7C, 0x7A, 0x97, 0x41, 0x8F, 0x93, 0x48, 0x82, 0x8F, 0x0F, 0x02
};
const static uint8_t test_output0[64] = {
0x1B, 0x60, 0x6A, 0x3F, 0x4A, 0x07, 0xD6, 0x48, 0x9A, 0x1B, 0xCD, 0x07, 0x69, 0x7B, 0xD1, 0x66,
0x96, 0xB6, 0x1C, 0x8A, 0xE9, 0x82, 0xF6, 0x1A, 0x90, 0x16, 0x0F, 0x4E, 0x52, 0x82, 0x8A, 0x7F,
const static uint8_t test_output_v0[64] = {
0x1A, 0x3F, 0xFB, 0xEE, 0x90, 0x9B, 0x42, 0x0D, 0x91, 0xF7, 0xBE, 0x6E, 0x5F, 0xB5, 0x6D, 0xB7,
0x1B, 0x31, 0x10, 0xD8, 0x86, 0x01, 0x1E, 0x87, 0x7E, 0xE5, 0x78, 0x6A, 0xFD, 0x08, 0x01, 0x00
0x1B, 0x31, 0x10, 0xD8, 0x86, 0x01, 0x1E, 0x87, 0x7E, 0xE5, 0x78, 0x6A, 0xFD, 0x08, 0x01, 0x00,
0x1B, 0x60, 0x6A, 0x3F, 0x4A, 0x07, 0xD6, 0x48, 0x9A, 0x1B, 0xCD, 0x07, 0x69, 0x7B, 0xD1, 0x66,
0x96, 0xB6, 0x1C, 0x8A, 0xE9, 0x82, 0xF6, 0x1A, 0x90, 0x16, 0x0F, 0x4E, 0x52, 0x82, 0x8A, 0x7F
};
// Cryptonight variant 1 (Monero v7)
const static uint8_t test_output_v1[64] = {
0xF2, 0x2D, 0x3D, 0x62, 0x03, 0xD2, 0xA0, 0x8B, 0x41, 0xD9, 0x02, 0x72, 0x78, 0xD8, 0xBC, 0xC9,
0x83, 0xAC, 0xAD, 0xA9, 0xB6, 0x8E, 0x52, 0xE3, 0xC6, 0x89, 0x69, 0x2A, 0x50, 0xE9, 0x21, 0xD9,
0xC9, 0xFA, 0xE8, 0x42, 0x5D, 0x86, 0x88, 0xDC, 0x23, 0x6B, 0xCD, 0xBC, 0x42, 0xFD, 0xB4, 0x2D,
0x37, 0x6C, 0x6E, 0xC1, 0x90, 0x50, 0x1A, 0xA8, 0x4B, 0x04, 0xA4, 0xB4, 0xCF, 0x1E, 0xE1, 0x22
};
// Cryptonight variant 2 (Monero v8)
const static uint8_t test_output_v2[64] = {
0x97, 0x37, 0x82, 0x82, 0xCF, 0x10, 0xE7, 0xAD, 0x03, 0x3F, 0x7B, 0x80, 0x74, 0xC4, 0x0E, 0x14,
0xD0, 0x6E, 0x7F, 0x60, 0x9D, 0xDD, 0xDA, 0x78, 0x76, 0x80, 0xB5, 0x8C, 0x05, 0xF4, 0x3D, 0x21,
0x87, 0x1F, 0xCD, 0x68, 0x23, 0xF6, 0xA8, 0x79, 0xBB, 0x3F, 0x33, 0x95, 0x1C, 0x8E, 0x8E, 0x89,
0x1D, 0x40, 0x43, 0x88, 0x0B, 0x02, 0xDF, 0xA1, 0xBB, 0x3B, 0xE4, 0x98, 0xB5, 0x0E, 0x75, 0x78
};
#ifndef XMRIG_NO_AEON
const static uint8_t test_output1[64] = {
0x28, 0xA2, 0x2B, 0xAD, 0x3F, 0x93, 0xD1, 0x40, 0x8F, 0xCA, 0x47, 0x2E, 0xB5, 0xAD, 0x1C, 0xBE,
0x75, 0xF2, 0x1D, 0x05, 0x3C, 0x8C, 0xE5, 0xB3, 0xAF, 0x10, 0x5A, 0x57, 0x71, 0x3E, 0x21, 0xDD,
const static uint8_t test_output_v0_lite[64] = {
0x36, 0x95, 0xB4, 0xB5, 0x3B, 0xB0, 0x03, 0x58, 0xB0, 0xAD, 0x38, 0xDC, 0x16, 0x0F, 0xEB, 0x9E,
0x00, 0x4E, 0xEC, 0xE0, 0x9B, 0x83, 0xA7, 0x2E, 0xF6, 0xBA, 0x98, 0x64, 0xD3, 0x51, 0x0C, 0x88,
0x28, 0xA2, 0x2B, 0xAD, 0x3F, 0x93, 0xD1, 0x40, 0x8F, 0xCA, 0x47, 0x2E, 0xB5, 0xAD, 0x1C, 0xBE,
0x75, 0xF2, 0x1D, 0x05, 0x3C, 0x8C, 0xE5, 0xB3, 0xAF, 0x10, 0x5A, 0x57, 0x71, 0x3E, 0x21, 0xDD
};
#endif
// Monero v7
const static uint8_t test_output2[64] = {
0xC9, 0xFA, 0xE8, 0x42, 0x5D, 0x86, 0x88, 0xDC, 0x23, 0x6B, 0xCD, 0xBC, 0x42, 0xFD, 0xB4, 0x2D,
0x37, 0x6C, 0x6E, 0xC1, 0x90, 0x50, 0x1A, 0xA8, 0x4B, 0x04, 0xA4, 0xB4, 0xCF, 0x1E, 0xE1, 0x22,
0xF2, 0x2D, 0x3D, 0x62, 0x03, 0xD2, 0xA0, 0x8B, 0x41, 0xD9, 0x02, 0x72, 0x78, 0xD8, 0xBC, 0xC9,
0x83, 0xAC, 0xAD, 0xA9, 0xB6, 0x8E, 0x52, 0xE3, 0xC6, 0x89, 0x69, 0x2A, 0x50, 0xE9, 0x21, 0xD9
// AEON v7
const static uint8_t test_output_v1_lite[64] = {
0x6D, 0x8C, 0xDC, 0x44, 0x4E, 0x9B, 0xBB, 0xFD, 0x68, 0xFC, 0x43, 0xFC, 0xD4, 0x85, 0x5B, 0x22,
0x8C, 0x8A, 0x1B, 0xD9, 0x1D, 0x9D, 0x00, 0x28, 0x5B, 0xEC, 0x02, 0xB7, 0xCA, 0x2D, 0x67, 0x41,
0x87, 0xC4, 0xE5, 0x70, 0x65, 0x3E, 0xB4, 0xC2, 0xB4, 0x2B, 0x7A, 0x0D, 0x54, 0x65, 0x59, 0x45,
0x2D, 0xFA, 0xB5, 0x73, 0xB8, 0x2E, 0xC5, 0x2F, 0x15, 0x2B, 0x7F, 0xF9, 0x8E, 0x79, 0x44, 0x6F
};
#endif
#endif /* __CRYPTONIGHT_TEST_H__ */
#endif /* XMRIG_CRYPTONIGHT_TEST_H */

33
cmake/asm.cmake

@ -0,0 +1,33 @@
if (WITH_ASM AND NOT XMRIG_ARM AND CMAKE_SIZEOF_VOID_P EQUAL 8)
set(XMRIG_ASM_LIBRARY "xmrig-asm")
if (CMAKE_C_COMPILER_ID MATCHES MSVC)
enable_language(ASM_MASM)
if (MSVC_TOOLSET_VERSION GREATER_EQUAL 141)
set(XMRIG_ASM_FILE "crypto/asm/cnv2_main_loop.asm")
else()
set(XMRIG_ASM_FILE "crypto/asm/win64/cnv2_main_loop.asm")
endif()
set_property(SOURCE ${XMRIG_ASM_FILE} PROPERTY ASM_MASM)
else()
enable_language(ASM)
if (WIN32 AND CMAKE_C_COMPILER_ID MATCHES GNU)
set(XMRIG_ASM_FILE "crypto/asm/win64/cnv2_main_loop.S")
else()
set(XMRIG_ASM_FILE "crypto/asm/cnv2_main_loop.S")
endif()
set_property(SOURCE ${XMRIG_ASM_FILE} PROPERTY C)
endif()
add_library(${XMRIG_ASM_LIBRARY} STATIC ${XMRIG_ASM_FILE})
set(XMRIG_ASM_SOURCES "")
set_property(TARGET ${XMRIG_ASM_LIBRARY} PROPERTY LINKER_LANGUAGE C)
else()
set(XMRIG_ASM_SOURCES "")
set(XMRIG_ASM_LIBRARY "")
add_definitions(/DXMRIG_NO_ASM)
endif()

10
cpu.c

@ -31,6 +31,7 @@
#endif
#include "cpu.h"
#include "options.h"
#ifndef BUILD_TEST
@ -63,6 +64,15 @@ void cpu_init_common() {
if (data.flags[CPU_FEATURE_AES]) {
cpu_info.flags |= CPU_FLAG_AES;
# ifndef XMRIG_NO_ASM
if (data.vendor == VENDOR_AMD) {
cpu_info.assembly = ASM_RYZEN;
}
else if (data.vendor == VENDOR_INTEL) {
cpu_info.assembly = ASM_INTEL;
}
# endif
}
if (data.flags[CPU_FEATURE_BMI2]) {

7
cpu.h

@ -21,8 +21,8 @@
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef __CPU_H__
#define __CPU_H__
#ifndef XMRIG_CPU_H
#define XMRIG_CPU_H
#include <stdbool.h>
@ -34,6 +34,7 @@ struct cpu_info {
int l2_cache;
int l3_cache;
char brand[64];
int assembly;
};
extern struct cpu_info cpu_info;
@ -50,4 +51,4 @@ void cpu_init();
int get_optimal_threads_count(int algo, bool double_hash, int max_cpu_usage);
int affine_to_cpu_mask(int id, unsigned long mask);
#endif /* __CPU_H__ */
#endif /* XMRIG_CPU_H */

28
cpu_stub.c

@ -24,7 +24,11 @@
#include <cpuid.h>
#include <string.h>
#include <stdbool.h>
#include <stdint.h>
#include "cpu.h"
#include "options.h"
#define VENDOR_ID (0)
@ -53,7 +57,7 @@ static inline void cpuid(int level, int output[4]) {
static void cpu_brand_string(char* s) {
int cpu_info[4] = { 0 };
int32_t cpu_info[4] = { 0 };
cpuid(VENDOR_ID, cpu_info);
if (cpu_info[EAX_Reg] >= 4) {
@ -68,7 +72,7 @@ static void cpu_brand_string(char* s) {
static bool has_aes_ni()
{
int cpu_info[4] = { 0 };
int32_t cpu_info[4] = { 0 };
cpuid(PROCESSOR_INFO, cpu_info);
return cpu_info[ECX_Reg] & bit_AES;
@ -76,7 +80,7 @@ static bool has_aes_ni()
static bool has_bmi2() {
int cpu_info[4] = { 0 };
int32_t cpu_info[4] = { 0 };
cpuid(EXTENDED_FEATURES, cpu_info);
return cpu_info[EBX_Reg] & bit_BMI2;
@ -93,6 +97,24 @@ void cpu_init_common() {
if (has_aes_ni()) {
cpu_info.flags |= CPU_FLAG_AES;
# ifndef XMRIG_NO_ASM
char vendor[13] = { 0 };
int32_t data[4] = { 0 };
cpuid(0, data);
memcpy(vendor + 0, &data[1], 4);
memcpy(vendor + 4, &data[3], 4);
memcpy(vendor + 8, &data[2], 4);
if (memcmp(vendor, "GenuineIntel", 12) == 0) {
cpu_info.assembly = ASM_INTEL;
}
else if (memcmp(vendor, "AuthenticAMD", 12) == 0) {
cpu_info.assembly = ASM_RYZEN;
}
# endif
}
if (has_bmi2()) {

410
crypto/asm/cnv2_double_main_loop_sandybridge.inc

@ -0,0 +1,410 @@
mov rax, rsp
push rbx
push rbp
push rsi
push rdi
push r12
push r13
push r14
push r15
sub rsp, 184
stmxcsr DWORD PTR [rsp+272]
mov DWORD PTR [rsp+276], 24448
ldmxcsr DWORD PTR [rsp+276]
mov r13, QWORD PTR [rcx+224]
mov r9, rdx
mov r10, QWORD PTR [rcx+32]
mov r8, rcx
xor r10, QWORD PTR [rcx]
mov r14d, 524288
mov r11, QWORD PTR [rcx+40]
xor r11, QWORD PTR [rcx+8]
mov rsi, QWORD PTR [rdx+224]
mov rdx, QWORD PTR [rcx+56]
xor rdx, QWORD PTR [rcx+24]
mov rdi, QWORD PTR [r9+32]
xor rdi, QWORD PTR [r9]
mov rbp, QWORD PTR [r9+40]
xor rbp, QWORD PTR [r9+8]
movq xmm0, rdx
movaps XMMWORD PTR [rax-88], xmm6
movaps XMMWORD PTR [rax-104], xmm7
movaps XMMWORD PTR [rax-120], xmm8
movaps XMMWORD PTR [rsp+112], xmm9
movaps XMMWORD PTR [rsp+96], xmm10
movaps XMMWORD PTR [rsp+80], xmm11
movaps XMMWORD PTR [rsp+64], xmm12
movaps XMMWORD PTR [rsp+48], xmm13
movaps XMMWORD PTR [rsp+32], xmm14
movaps XMMWORD PTR [rsp+16], xmm15
mov rdx, r10
movq xmm4, QWORD PTR [r8+96]
and edx, 2097136
mov rax, QWORD PTR [rcx+48]
xorps xmm13, xmm13
xor rax, QWORD PTR [rcx+16]
mov rcx, QWORD PTR [rcx+88]
xor rcx, QWORD PTR [r8+72]
movq xmm5, QWORD PTR [r8+104]
movq xmm7, rax
mov eax, 1
shl rax, 52
movq xmm14, rax
punpcklqdq xmm14, xmm14
mov eax, 1023
shl rax, 52
movq xmm12, rax
punpcklqdq xmm12, xmm12
mov rax, QWORD PTR [r8+80]
xor rax, QWORD PTR [r8+64]
punpcklqdq xmm7, xmm0
movq xmm0, rcx
mov rcx, QWORD PTR [r9+56]
xor rcx, QWORD PTR [r9+24]
movq xmm3, rax
mov rax, QWORD PTR [r9+48]
xor rax, QWORD PTR [r9+16]
punpcklqdq xmm3, xmm0
movq xmm0, rcx
mov QWORD PTR [rsp], r13
mov rcx, QWORD PTR [r9+88]
xor rcx, QWORD PTR [r9+72]
movq xmm6, rax
mov rax, QWORD PTR [r9+80]
xor rax, QWORD PTR [r9+64]
punpcklqdq xmm6, xmm0
movq xmm0, rcx
mov QWORD PTR [rsp+256], r10
mov rcx, rdi
mov QWORD PTR [rsp+264], r11
movq xmm8, rax
and ecx, 2097136
punpcklqdq xmm8, xmm0
movq xmm0, QWORD PTR [r9+96]
punpcklqdq xmm4, xmm0
movq xmm0, QWORD PTR [r9+104]
lea r8, QWORD PTR [rcx+rsi]
movdqu xmm11, XMMWORD PTR [r8]
punpcklqdq xmm5, xmm0
lea r9, QWORD PTR [rdx+r13]
movdqu xmm15, XMMWORD PTR [r9]
ALIGN 16
main_loop_double_sandybridge:
movdqu xmm9, xmm15
mov eax, edx
mov ebx, edx
xor eax, 16
xor ebx, 32
xor edx, 48
movq xmm0, r11
movq xmm2, r10
punpcklqdq xmm2, xmm0
aesenc xmm9, xmm2
movdqu xmm0, XMMWORD PTR [rax+r13]
movdqu xmm1, XMMWORD PTR [rbx+r13]
paddq xmm0, xmm7
paddq xmm1, xmm2
movdqu XMMWORD PTR [rbx+r13], xmm0
movdqu xmm0, XMMWORD PTR [rdx+r13]
movdqu XMMWORD PTR [rdx+r13], xmm1
paddq xmm0, xmm3
movdqu XMMWORD PTR [rax+r13], xmm0
movq r11, xmm9
mov edx, r11d
and edx, 2097136
movdqa xmm0, xmm9
pxor xmm0, xmm7
movdqu XMMWORD PTR [r9], xmm0
lea rbx, QWORD PTR [rdx+r13]
mov r10, QWORD PTR [rdx+r13]
movdqu xmm10, xmm11
movq xmm0, rbp
movq xmm11, rdi
punpcklqdq xmm11, xmm0
aesenc xmm10, xmm11
mov eax, ecx
mov r12d, ecx
xor eax, 16
xor r12d, 32
xor ecx, 48
movdqu xmm0, XMMWORD PTR [rax+rsi]
paddq xmm0, xmm6
movdqu xmm1, XMMWORD PTR [r12+rsi]
movdqu XMMWORD PTR [r12+rsi], xmm0
paddq xmm1, xmm11
movdqu xmm0, XMMWORD PTR [rcx+rsi]
movdqu XMMWORD PTR [rcx+rsi], xmm1
paddq xmm0, xmm8
movdqu XMMWORD PTR [rax+rsi], xmm0
movq rcx, xmm10
and ecx, 2097136
movdqa xmm0, xmm10
pxor xmm0, xmm6
movdqu XMMWORD PTR [r8], xmm0
mov r12, QWORD PTR [rcx+rsi]
mov r9, QWORD PTR [rbx+8]
xor edx, 16
mov r8d, edx
mov r15d, edx
movq rdx, xmm5
shl rdx, 32
movq rax, xmm4
xor rdx, rax
xor r10, rdx
mov rax, r10
mul r11
mov r11d, r8d
xor r11d, 48
movq xmm0, rdx
xor rdx, [r11+r13]
movq xmm1, rax
xor rax, [r11+r13+8]
punpcklqdq xmm0, xmm1
pxor xmm0, XMMWORD PTR [r8+r13]
xor r8d, 32
movdqu xmm1, XMMWORD PTR [r11+r13]
paddq xmm0, xmm7
paddq xmm1, xmm2
movdqu XMMWORD PTR [r11+r13], xmm0
movdqu xmm0, XMMWORD PTR [r8+r13]
movdqu XMMWORD PTR [r8+r13], xmm1
paddq xmm0, xmm3
movdqu XMMWORD PTR [r15+r13], xmm0
mov r11, QWORD PTR [rsp+256]
add r11, rdx
mov rdx, QWORD PTR [rsp+264]
add rdx, rax
mov QWORD PTR [rbx], r11
xor r11, r10
mov QWORD PTR [rbx+8], rdx
xor rdx, r9
mov QWORD PTR [rsp+256], r11
and r11d, 2097136
mov QWORD PTR [rsp+264], rdx
mov QWORD PTR [rsp+8], r11
lea r15, QWORD PTR [r11+r13]
movdqu xmm15, XMMWORD PTR [r11+r13]
lea r13, QWORD PTR [rsi+rcx]
movdqa xmm0, xmm5
psrldq xmm0, 8
movaps xmm2, xmm13
movq r10, xmm0
psllq xmm5, 1
shl r10, 32
movdqa xmm0, xmm9
psrldq xmm0, 8
movdqa xmm1, xmm10
movq r11, xmm0
psrldq xmm1, 8
movq r8, xmm1
psrldq xmm4, 8
movaps xmm0, xmm13
movq rax, xmm4
xor r10, rax
movaps xmm1, xmm13
xor r10, r12
lea rax, QWORD PTR [r11+1]
shr rax, 1
movdqa xmm3, xmm9
punpcklqdq xmm3, xmm10
paddq xmm5, xmm3
movq rdx, xmm5
psrldq xmm5, 8
cvtsi2sd xmm2, rax
or edx, -2147483647
lea rax, QWORD PTR [r8+1]
shr rax, 1
movq r9, xmm5
cvtsi2sd xmm0, rax
or r9d, -2147483647
cvtsi2sd xmm1, rdx
unpcklpd xmm2, xmm0
movaps xmm0, xmm13
cvtsi2sd xmm0, r9
unpcklpd xmm1, xmm0
divpd xmm2, xmm1
paddq xmm2, xmm14
cvttsd2si rax, xmm2
psrldq xmm2, 8
mov rbx, rax
imul rax, rdx
sub r11, rax
js div_fix_1_sandybridge
div_fix_1_ret_sandybridge:
cvttsd2si rdx, xmm2
mov rax, rdx
imul rax, r9
movd xmm2, r11d
movd xmm4, ebx
sub r8, rax
js div_fix_2_sandybridge
div_fix_2_ret_sandybridge:
movd xmm1, r8d
movd xmm0, edx
punpckldq xmm2, xmm1
punpckldq xmm4, xmm0
punpckldq xmm4, xmm2
paddq xmm3, xmm4
movdqa xmm0, xmm3
psrlq xmm0, 12
paddq xmm0, xmm12
sqrtpd xmm1, xmm0
movq r9, xmm1
movdqa xmm5, xmm1
psrlq xmm5, 19
test r9, 524287
je sqrt_fix_1_sandybridge
sqrt_fix_1_ret_sandybridge:
movq r9, xmm10
psrldq xmm1, 8
movq r8, xmm1
test r8, 524287
je sqrt_fix_2_sandybridge
sqrt_fix_2_ret_sandybridge:
mov r12d, ecx
mov r8d, ecx
xor r12d, 16
xor r8d, 32
xor ecx, 48
mov rax, r10
mul r9
movq xmm0, rax
movq xmm3, rdx
punpcklqdq xmm3, xmm0
movdqu xmm0, XMMWORD PTR [r12+rsi]
pxor xmm0, xmm3
movdqu xmm1, XMMWORD PTR [r8+rsi]
xor rdx, [r8+rsi]
xor rax, [r8+rsi+8]
movdqu xmm3, XMMWORD PTR [rcx+rsi]
paddq xmm0, xmm6
paddq xmm1, xmm11
paddq xmm3, xmm8
movdqu XMMWORD PTR [r8+rsi], xmm0
movdqu XMMWORD PTR [rcx+rsi], xmm1
movdqu XMMWORD PTR [r12+rsi], xmm3
add rdi, rdx
mov QWORD PTR [r13], rdi
xor rdi, r10
mov ecx, edi
and ecx, 2097136
lea r8, QWORD PTR [rcx+rsi]
mov rdx, QWORD PTR [r13+8]
add rbp, rax
mov QWORD PTR [r13+8], rbp
movdqu xmm11, XMMWORD PTR [rcx+rsi]
xor rbp, rdx
mov r13, QWORD PTR [rsp]
movdqa xmm3, xmm7
mov rdx, QWORD PTR [rsp+8]
movdqa xmm8, xmm6
mov r10, QWORD PTR [rsp+256]
movdqa xmm7, xmm9
mov r11, QWORD PTR [rsp+264]
movdqa xmm6, xmm10
mov r9, r15
dec r14d
jne main_loop_double_sandybridge
ldmxcsr DWORD PTR [rsp+272]
movaps xmm13, XMMWORD PTR [rsp+48]
lea r11, QWORD PTR [rsp+184]
movaps xmm6, XMMWORD PTR [r11-24]
movaps xmm7, XMMWORD PTR [r11-40]
movaps xmm8, XMMWORD PTR [r11-56]
movaps xmm9, XMMWORD PTR [r11-72]
movaps xmm10, XMMWORD PTR [r11-88]
movaps xmm11, XMMWORD PTR [r11-104]
movaps xmm12, XMMWORD PTR [r11-120]
movaps xmm14, XMMWORD PTR [rsp+32]
movaps xmm15, XMMWORD PTR [rsp+16]
mov rsp, r11
pop r15
pop r14
pop r13
pop r12
pop rdi
pop rsi
pop rbp
pop rbx
jmp cnv2_double_mainloop_asm_sandybridge_endp
div_fix_1_sandybridge:
dec rbx
add r11, rdx
jmp div_fix_1_ret_sandybridge
div_fix_2_sandybridge:
dec rdx
add r8, r9
jmp div_fix_2_ret_sandybridge
sqrt_fix_1_sandybridge:
movq r8, xmm3
movdqa xmm0, xmm5
psrldq xmm0, 8
dec r9
mov r11d, -1022
shl r11, 32
mov rax, r9
shr r9, 19
shr rax, 20
mov rdx, r9
sub rdx, rax
lea rdx, [rdx+r11+1]
add rax, r11
imul rdx, rax
sub rdx, r8
adc r9, 0
movq xmm5, r9
punpcklqdq xmm5, xmm0
jmp sqrt_fix_1_ret_sandybridge
sqrt_fix_2_sandybridge:
psrldq xmm3, 8
movq r11, xmm3
dec r8
mov ebx, -1022
shl rbx, 32
mov rax, r8
shr r8, 19
shr rax, 20
mov rdx, r8
sub rdx, rax
lea rdx, [rdx+rbx+1]
add rax, rbx
imul rdx, rax
sub rdx, r11
adc r8, 0
movq xmm0, r8
punpcklqdq xmm5, xmm0
jmp sqrt_fix_2_ret_sandybridge
cnv2_double_mainloop_asm_sandybridge_endp:

37
crypto/asm/cnv2_main_loop.S

@ -0,0 +1,37 @@
#define ALIGN .align
.intel_syntax noprefix
#ifdef __APPLE__
# define FN_PREFIX(fn) _ ## fn
.text
#else
# define FN_PREFIX(fn) fn
.section .text
#endif
.global FN_PREFIX(cnv2_mainloop_ivybridge_asm)
.global FN_PREFIX(cnv2_mainloop_ryzen_asm)
.global FN_PREFIX(cnv2_double_mainloop_sandybridge_asm)
ALIGN 16
FN_PREFIX(cnv2_mainloop_ivybridge_asm):
sub rsp, 48
mov rcx, rdi
#include "cnv2_main_loop_ivybridge.inc"
add rsp, 48
ret 0
ALIGN 16
FN_PREFIX(cnv2_mainloop_ryzen_asm):
sub rsp, 48
mov rcx, rdi
#include "cnv2_main_loop_ryzen.inc"
add rsp, 48
ret 0
ALIGN 16
FN_PREFIX(cnv2_double_mainloop_sandybridge_asm):
sub rsp, 48
mov rcx, rdi
mov rdx, rsi
#include "cnv2_double_main_loop_sandybridge.inc"
add rsp, 48
ret 0

25
crypto/asm/cnv2_main_loop.asm

@ -0,0 +1,25 @@
_TEXT_CNV2_MAINLOOP SEGMENT PAGE READ EXECUTE
PUBLIC cnv2_mainloop_ivybridge_asm
PUBLIC cnv2_mainloop_ryzen_asm
PUBLIC cnv2_double_mainloop_sandybridge_asm
ALIGN 64
cnv2_mainloop_ivybridge_asm PROC
INCLUDE cnv2_main_loop_ivybridge.inc
ret 0
cnv2_mainloop_ivybridge_asm ENDP
ALIGN 64
cnv2_mainloop_ryzen_asm PROC
INCLUDE cnv2_main_loop_ryzen.inc
ret 0
cnv2_mainloop_ryzen_asm ENDP
ALIGN 64
cnv2_double_mainloop_sandybridge_asm PROC
INCLUDE cnv2_double_main_loop_sandybridge.inc
ret 0
cnv2_double_mainloop_sandybridge_asm ENDP
_TEXT_CNV2_MAINLOOP ENDS
END

186
crypto/asm/cnv2_main_loop_ivybridge.inc

@ -0,0 +1,186 @@
mov QWORD PTR [rsp+24], rbx
push rbp
push rsi
push rdi
push r12
push r13
push r14
push r15
sub rsp, 80
stmxcsr DWORD PTR [rsp]
mov DWORD PTR [rsp+4], 24448
ldmxcsr DWORD PTR [rsp+4]
mov rax, QWORD PTR [rcx+48]
mov r9, rcx
xor rax, QWORD PTR [rcx+16]
mov esi, 524288
mov r8, QWORD PTR [rcx+32]
mov r13d, -2147483647
xor r8, QWORD PTR [rcx]
mov r11, QWORD PTR [rcx+40]
mov r10, r8
mov rdx, QWORD PTR [rcx+56]
movq xmm4, rax
xor rdx, QWORD PTR [rcx+24]
xor r11, QWORD PTR [rcx+8]
mov rbx, QWORD PTR [rcx+224]
mov rax, QWORD PTR [r9+80]
xor rax, QWORD PTR [r9+64]
movq xmm0, rdx
mov rcx, QWORD PTR [rcx+88]
xor rcx, QWORD PTR [r9+72]
movq xmm3, QWORD PTR [r9+104]
movaps XMMWORD PTR [rsp+64], xmm6
movaps XMMWORD PTR [rsp+48], xmm7
movaps XMMWORD PTR [rsp+32], xmm8
and r10d, 2097136
movq xmm5, rax
xor eax, eax
mov QWORD PTR [rsp+16], rax
mov ax, 1023
shl rax, 52
movq xmm8, rax
mov r15, QWORD PTR [r9+96]
punpcklqdq xmm4, xmm0
movq xmm0, rcx
punpcklqdq xmm5, xmm0
movdqu xmm6, XMMWORD PTR [r10+rbx]
ALIGN 16
main_loop_ivybridge:
lea rdx, QWORD PTR [r10+rbx]
mov ecx, r10d
mov eax, r10d
mov rdi, r15
xor ecx, 16
xor eax, 32
xor r10d, 48
movq xmm0, r11
movq xmm7, r8
punpcklqdq xmm7, xmm0
aesenc xmm6, xmm7
movq rbp, xmm6
mov r9, rbp
and r9d, 2097136
movdqu xmm2, XMMWORD PTR [rcx+rbx]
movdqu xmm1, XMMWORD PTR [rax+rbx]
movdqu xmm0, XMMWORD PTR [r10+rbx]
paddq xmm1, xmm7
paddq xmm0, xmm5
paddq xmm2, xmm4
movdqu XMMWORD PTR [rcx+rbx], xmm0
movdqu XMMWORD PTR [rax+rbx], xmm2
movdqu XMMWORD PTR [r10+rbx], xmm1
mov r10, r9
xor r10d, 32
movq rcx, xmm3
mov rax, rcx
shl rax, 32
xor rdi, rax
movdqa xmm0, xmm6
pxor xmm0, xmm4
movdqu XMMWORD PTR [rdx], xmm0
xor rdi, QWORD PTR [r9+rbx]
lea r14, QWORD PTR [r9+rbx]
mov r12, QWORD PTR [r14+8]
xor edx, edx
lea r9d, DWORD PTR [ecx+ecx]
add r9d, ebp
movdqa xmm0, xmm6
psrldq xmm0, 8
or r9d, r13d
movq rax, xmm0
div r9
xorps xmm3, xmm3
mov eax, eax
shl rdx, 32
add rdx, rax
lea r9, QWORD PTR [rdx+rbp]
mov r15, rdx
mov rax, r9
shr rax, 12
movq xmm0, rax
paddq xmm0, xmm8
sqrtsd xmm3, xmm0
psubq xmm3, XMMWORD PTR [rsp+16]
movq rdx, xmm3
test edx, 524287
je sqrt_fixup_ivybridge
psrlq xmm3, 19
sqrt_fixup_ivybridge_ret:
mov ecx, r10d
mov rax, rdi
mul rbp
movq xmm2, rdx
xor rdx, [rcx+rbx]
add r8, rdx
mov QWORD PTR [r14], r8
xor r8, rdi
mov edi, r8d
and edi, 2097136
movq xmm0, rax
xor rax, [rcx+rbx+8]
add r11, rax
mov QWORD PTR [r14+8], r11
punpcklqdq xmm2, xmm0
mov r9d, r10d
xor r9d, 48
xor r10d, 16
pxor xmm2, XMMWORD PTR [r9+rbx]
movdqu xmm0, XMMWORD PTR [r10+rbx]
paddq xmm0, xmm5
movdqu xmm1, XMMWORD PTR [rcx+rbx]
paddq xmm2, xmm4
paddq xmm1, xmm7
movdqa xmm5, xmm4
movdqu XMMWORD PTR [r9+rbx], xmm0
movdqa xmm4, xmm6
movdqu XMMWORD PTR [rcx+rbx], xmm2
movdqu XMMWORD PTR [r10+rbx], xmm1
movdqu xmm6, [rdi+rbx]
mov r10d, edi
xor r11, r12
dec rsi
jne main_loop_ivybridge
ldmxcsr DWORD PTR [rsp]
mov rbx, QWORD PTR [rsp+160]
movaps xmm6, XMMWORD PTR [rsp+64]
movaps xmm7, XMMWORD PTR [rsp+48]
movaps xmm8, XMMWORD PTR [rsp+32]
add rsp, 80
pop r15
pop r14
pop r13
pop r12
pop rdi
pop rsi
pop rbp
jmp cnv2_main_loop_ivybridge_endp
sqrt_fixup_ivybridge:
dec rdx
mov r13d, -1022
shl r13, 32
mov rax, rdx
shr rdx, 19
shr rax, 20
mov rcx, rdx
sub rcx, rax
add rax, r13
not r13
sub rcx, r13
mov r13d, -2147483647
imul rcx, rax
sub rcx, r9
adc rdx, 0
movq xmm3, rdx
jmp sqrt_fixup_ivybridge_ret
cnv2_main_loop_ivybridge_endp:

179
crypto/asm/cnv2_main_loop_ryzen.inc

@ -0,0 +1,179 @@
mov QWORD PTR [rsp+16], rbx
mov QWORD PTR [rsp+24], rbp
mov QWORD PTR [rsp+32], rsi
push rdi
push r12
push r13
push r14
push r15
sub rsp, 64
stmxcsr DWORD PTR [rsp]
mov DWORD PTR [rsp+4], 24448
ldmxcsr DWORD PTR [rsp+4]
mov rax, QWORD PTR [rcx+48]
mov r9, rcx
xor rax, QWORD PTR [rcx+16]
mov ebp, 524288
mov r8, QWORD PTR [rcx+32]
xor r8, QWORD PTR [rcx]
mov r11, QWORD PTR [rcx+40]
mov r10, r8
mov rdx, QWORD PTR [rcx+56]
movq xmm3, rax
xor rdx, QWORD PTR [rcx+24]
xor r11, QWORD PTR [rcx+8]
mov rbx, QWORD PTR [rcx+224]
mov rax, QWORD PTR [r9+80]
xor rax, QWORD PTR [r9+64]
movq xmm0, rdx
mov rcx, QWORD PTR [rcx+88]
xor rcx, QWORD PTR [r9+72]
mov rdi, QWORD PTR [r9+104]
and r10d, 2097136
movaps XMMWORD PTR [rsp+48], xmm6
movq xmm4, rax
movaps XMMWORD PTR [rsp+32], xmm7
movaps XMMWORD PTR [rsp+16], xmm8
xorps xmm8, xmm8
mov ax, 1023
shl rax, 52
movq xmm7, rax
mov r15, QWORD PTR [r9+96]
punpcklqdq xmm3, xmm0
movq xmm0, rcx
punpcklqdq xmm4, xmm0
ALIGN 16
main_loop_ryzen:
movdqa xmm5, XMMWORD PTR [r10+rbx]
movq xmm0, r11
movq xmm6, r8
punpcklqdq xmm6, xmm0
lea rdx, QWORD PTR [r10+rbx]
lea r9, QWORD PTR [rdi+rdi]
shl rdi, 32
mov ecx, r10d
mov eax, r10d
xor ecx, 16
xor eax, 32
xor r10d, 48
aesenc xmm5, xmm6
movdqa xmm2, XMMWORD PTR [rcx+rbx]
movdqa xmm1, XMMWORD PTR [rax+rbx]
movdqa xmm0, XMMWORD PTR [r10+rbx]
paddq xmm2, xmm3
paddq xmm1, xmm6
paddq xmm0, xmm4
movdqa XMMWORD PTR [rcx+rbx], xmm0
movdqa XMMWORD PTR [rax+rbx], xmm2
movdqa XMMWORD PTR [r10+rbx], xmm1
movaps xmm1, xmm8
mov rsi, r15
xor rsi, rdi
movq r14, xmm5
movdqa xmm0, xmm5
pxor xmm0, xmm3
mov r10, r14
and r10d, 2097136
movdqa XMMWORD PTR [rdx], xmm0
xor rsi, QWORD PTR [r10+rbx]
lea r12, QWORD PTR [r10+rbx]
mov r13, QWORD PTR [r10+rbx+8]
add r9d, r14d
or r9d, -2147483647
xor edx, edx
movdqa xmm0, xmm5
psrldq xmm0, 8
movq rax, xmm0
div r9
movq xmm0, rax
movq xmm1, rdx
punpckldq xmm0, xmm1
movq r15, xmm0
paddq xmm0, xmm5
movdqa xmm2, xmm0
psrlq xmm0, 12
paddq xmm0, xmm7
sqrtsd xmm1, xmm0
movq rdi, xmm1
test rdi, 524287
je sqrt_fixup_ryzen
shr rdi, 19
sqrt_fixup_ryzen_ret:
mov rax, rsi
mul r14
movq xmm1, rax
movq xmm0, rdx
punpcklqdq xmm0, xmm1
mov r9d, r10d
mov ecx, r10d
xor r9d, 16
xor ecx, 32
xor r10d, 48
movdqa xmm1, XMMWORD PTR [rcx+rbx]
xor rdx, [rcx+rbx]
xor rax, [rcx+rbx+8]
movdqa xmm2, XMMWORD PTR [r9+rbx]
pxor xmm2, xmm0
paddq xmm4, XMMWORD PTR [r10+rbx]
paddq xmm2, xmm3
paddq xmm1, xmm6
movdqa XMMWORD PTR [r9+rbx], xmm4
movdqa XMMWORD PTR [rcx+rbx], xmm2
movdqa XMMWORD PTR [r10+rbx], xmm1
movdqa xmm4, xmm3
add r8, rdx
add r11, rax
mov QWORD PTR [r12], r8
xor r8, rsi
mov QWORD PTR [r12+8], r11
mov r10, r8
xor r11, r13
and r10d, 2097136
movdqa xmm3, xmm5
dec ebp
jne main_loop_ryzen
ldmxcsr DWORD PTR [rsp]
movaps xmm6, XMMWORD PTR [rsp+48]
lea r11, QWORD PTR [rsp+64]
mov rbx, QWORD PTR [r11+56]
mov rbp, QWORD PTR [r11+64]
mov rsi, QWORD PTR [r11+72]
movaps xmm8, XMMWORD PTR [r11-48]
movaps xmm7, XMMWORD PTR [rsp+32]
mov rsp, r11
pop r15
pop r14
pop r13
pop r12
pop rdi
jmp cnv2_main_loop_ryzen_endp
sqrt_fixup_ryzen:
movq r9, xmm2
dec rdi
mov edx, -1022
shl rdx, 32
mov rax, rdi
shr rdi, 19
shr rax, 20
mov rcx, rdi
sub rcx, rax
lea rcx, [rcx+rdx+1]
add rax, rdx
imul rcx, rax
sub rcx, r9
adc rdi, 0
jmp sqrt_fixup_ryzen_ret
cnv2_main_loop_ryzen_endp:

21
crypto/asm/win64/cnv2_main_loop.S

@ -0,0 +1,21 @@
#define ALIGN .align
.intel_syntax noprefix
.section .text
.global cnv2_mainloop_ivybridge_asm
.global cnv2_mainloop_ryzen_asm
.global cnv2_double_mainloop_sandybridge_asm
ALIGN 16
cnv2_mainloop_ivybridge_asm:
#include "../cnv2_main_loop_ivybridge.inc"
ret 0
ALIGN 16
cnv2_mainloop_ryzen_asm:
#include "../cnv2_main_loop_ryzen.inc"
ret 0
ALIGN 16
cnv2_double_mainloop_sandybridge_asm:
#include "../cnv2_double_main_loop_sandybridge.inc"
ret 0

2
donate.h

@ -24,6 +24,6 @@
#ifndef __DONATE_H__
#define __DONATE_H__
#define DONATE_LEVEL 5
#define DONATE_LEVEL 0
#endif /* __DONATE_H__ */

48
memory.c

@ -4,8 +4,7 @@
* Copyright 2014 Lucas Jones <https://github.com/lucasjones>
* Copyright 2014-2016 Wolf9466 <https://github.com/OhGodAPet>
* Copyright 2016 Jay D Dee <jayddee246@gmail.com>
* Copyright 2016-2017 XMRig <support@xmrig.com>
*
* Copyright 2016-2018 XMRig <support@xmrig.com>
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
@ -24,33 +23,15 @@
#include <string.h>
#include "persistent_memory.h"
#include "algo/cryptonight/cryptonight.h"
#include "options.h"
static size_t offset = 0;
#ifndef XMRIG_NO_AEON
static void * create_persistent_ctx_lite(int thr_id) {
struct cryptonight_ctx *ctx = NULL;
if (!opt_double_hash) {
const size_t offset = MEMORY * (thr_id + 1);
ctx = (struct cryptonight_ctx *) &persistent_memory[offset + MEMORY_LITE];
ctx->memory = (uint8_t*) &persistent_memory[offset];
return ctx;
}
ctx = (struct cryptonight_ctx *) &persistent_memory[MEMORY - sizeof(struct cryptonight_ctx) * (thr_id + 1)];
ctx->memory = (uint8_t*) &persistent_memory[MEMORY * (thr_id + 1)];
return ctx;
}
#endif
static size_t offset = 0;
void * persistent_calloc(size_t num, size_t size) {
size += size % 16;
void *mem = &persistent_memory[offset];
offset += (num * size);
@ -60,17 +41,14 @@ void * persistent_calloc(size_t num, size_t size) {
}
void * create_persistent_ctx(int thr_id) {
# ifndef XMRIG_NO_AEON
if (opt_algo == ALGO_CRYPTONIGHT_LITE) {
return create_persistent_ctx_lite(thr_id);
}
# endif
void create_cryptonight_ctx(struct cryptonight_ctx **ctx, int thr_id)
{
const int ratio = (opt_double_hash && opt_algo == ALGO_CRYPTONIGHT) ? 2 : 1;
ctx[0] = persistent_calloc(1, sizeof(struct cryptonight_ctx));
ctx[0]->memory = &persistent_memory[MEMORY * (thr_id * ratio + 1)];
struct cryptonight_ctx *ctx = (struct cryptonight_ctx *) &persistent_memory[MEMORY - sizeof(struct cryptonight_ctx) * (thr_id + 1)];
const int ratio = opt_double_hash ? 2 : 1;
ctx->memory = (uint8_t*) &persistent_memory[MEMORY * (thr_id * ratio + 1)];
return ctx;
if (opt_double_hash) {
ctx[1] = persistent_calloc(1, sizeof(struct cryptonight_ctx));
ctx[1]->memory = ctx[0]->memory + (opt_algo == ALGO_CRYPTONIGHT ? MEMORY : MEMORY_LITE);
}
}

117
options.c

@ -38,7 +38,6 @@
int64_t opt_affinity = -1L;
int opt_n_threads = 0;
int opt_algo_variant = 0;
int opt_retries = 5;
int opt_retry_pause = 5;
int opt_donate_level = DONATE_LEVEL;
@ -55,13 +54,41 @@ char *opt_userpass = NULL;
char *opt_user = NULL;
char *opt_pass = NULL;
enum mining_algo opt_algo = ALGO_CRYPTONIGHT;
enum Algo opt_algo = ALGO_CRYPTONIGHT;
enum Variant opt_variant = VARIANT_AUTO;
enum AlgoVariant opt_av = AV_AUTO;
enum Assembly opt_assembly = ASM_AUTO;
struct AlgoData
{
const char *name;
const char *shortName;
enum Algo algo;
enum Variant variant;
};
static struct AlgoData const algorithms[] = {
{ "cryptonight", "cn", ALGO_CRYPTONIGHT, VARIANT_AUTO },
{ "cryptonight/0", "cn/0", ALGO_CRYPTONIGHT, VARIANT_0 },
{ "cryptonight/1", "cn/1", ALGO_CRYPTONIGHT, VARIANT_1 },
{ "cryptonight/2", "cn/2", ALGO_CRYPTONIGHT, VARIANT_2 },
# ifndef XMRIG_NO_AEON
{ "cryptonight-lite", "cn-lite", ALGO_CRYPTONIGHT_LITE, VARIANT_AUTO },
{ "cryptonight-light", "cn-light", ALGO_CRYPTONIGHT_LITE, VARIANT_AUTO },
{ "cryptonight-lite/0", "cn-lite/0", ALGO_CRYPTONIGHT_LITE, VARIANT_0 },
{ "cryptonight-lite/1", "cn-lite/1", ALGO_CRYPTONIGHT_LITE, VARIANT_1 },
# endif
};
static char const usage[] = "\
Usage: " APP_ID " [OPTIONS]\n\
Options:\n\
-a, --algo=ALGO cryptonight (default) or cryptonight-lite\n\
--variant=N cryptonight variant: 0-2\n\
-o, --url=URL URL of mining server\n\
-b, --backup-url=URL URL of backup mining server\n\
-O, --userpass=U:P username:password pair for mining server\n\
@ -110,25 +137,43 @@ static struct option const options[] = {
{ "user", 1, NULL, 'u' },
{ "userpass", 1, NULL, 'O' },
{ "version", 0, NULL, 'V' },
{ 0, 0, 0, 0 }
{ "variant", 1, NULL, 1021 },
{ "asm", 1, NULL, 1022 },
{ NULL, 0, NULL, 0 }
};
static const char *algo_names[] = {
[ALGO_CRYPTONIGHT] = "cryptonight",
"cryptonight",
# ifndef XMRIG_NO_AEON
[ALGO_CRYPTONIGHT_LITE] = "cryptonight-lite"
"cryptonight-lite"
# endif
};
static const char *variant_names[] = {
"auto",
"0",
"1",
"2",
};
static const char *asm_names[] = {
"none",
"auto",
"intel",
"ryzen"
};
#ifndef XMRIG_NO_AEON
static int get_cryptonight_lite_variant(int variant) {
if (variant <= AEON_AV0_AUTO || variant >= AEON_AV_MAX) {
return (cpu_info.flags & CPU_FLAG_AES) ? AEON_AV2_AESNI_DOUBLE : AEON_AV4_SOFT_AES_DOUBLE;
if (variant <= AV_AUTO || variant >= AV_MAX) {
return (cpu_info.flags & CPU_FLAG_AES) ? AV_DOUBLE : AV_DOUBLE_SOFT;
}
if (opt_safe && !(cpu_info.flags & CPU_FLAG_AES) && variant <= AEON_AV2_AESNI_DOUBLE) {
if (opt_safe && !(cpu_info.flags & CPU_FLAG_AES) && variant <= AV_DOUBLE) {
return variant + 2;
}
@ -144,11 +189,11 @@ static int get_algo_variant(int algo, int variant) {
}
# endif
if (variant <= XMR_AV0_AUTO || variant >= XMR_AV_MAX) {
return (cpu_info.flags & CPU_FLAG_AES) ? XMR_AV1_AESNI : XMR_AV3_SOFT_AES;
if (variant <= AV_AUTO || variant >= AV_MAX) {
return (cpu_info.flags & CPU_FLAG_AES) ? AV_SINGLE : AV_SINGLE_SOFT;
}
if (opt_safe && !(cpu_info.flags & CPU_FLAG_AES) && variant <= XMR_AV2_AESNI_DOUBLE) {
if (opt_safe && !(cpu_info.flags & CPU_FLAG_AES) && variant <= AV_DOUBLE) {
return variant + 2;
}
@ -167,18 +212,21 @@ static void parse_arg(int key, char *arg) {
switch (key)
{
case 'a':
for (int i = 0; i < ARRAY_SIZE(algo_names); i++) {
if (algo_names[i] && !strcmp(arg, algo_names[i])) {
opt_algo = i;
case 'a': /* --algo */
for (size_t i = 0; i < ARRAY_SIZE(algorithms); i++) {
if ((strcasecmp(arg, algorithms[i].name) == 0) || (strcasecmp(arg, algorithms[i].shortName) == 0)) {
opt_algo = algorithms[i].algo;
opt_variant = algorithms[i].variant;
break;
}
}
break;
# ifndef XMRIG_NO_AEON
if (i == ARRAY_SIZE(algo_names) - 1 && !strcmp(arg, "cryptonight-light")) {
opt_algo = i = ALGO_CRYPTONIGHT_LITE;
case 1022: /* --asm */
for (size_t i = 0; i < ARRAY_SIZE(asm_names); i++) {
if (strcasecmp(arg, asm_names[i]) == 0) {
opt_assembly = i;
}
# endif
}
break;
@ -300,11 +348,11 @@ static void parse_arg(int key, char *arg) {
case 'v': /* --av */
v = atoi(arg);
if (v < 0 || v > 1000) {
if (v <= AV_AUTO || v >= AV_MAX) {
show_usage_and_exit(1);
}
opt_algo_variant = v;
opt_av = v;
break;
case 1020: /* --cpu-affinity */
@ -322,12 +370,19 @@ static void parse_arg(int key, char *arg) {
break;
case 1003: /* --donate-level */
// v = atoi(arg);
// if (v < 1 || v > 99) {
// show_usage_and_exit(1);
// }
// opt_donate_level = v;
break;
case 1021: /* --variant */
v = atoi(arg);
if (v < 1 || v > 99) {
show_usage_and_exit(1);
if (v > VARIANT_AUTO && v < VARIANT_MAX) {
opt_variant = v;
}
opt_donate_level = v;
break;
case 1006: /* --nicehash */
@ -451,9 +506,9 @@ void parse_cmdline(int argc, char *argv[]) {
sprintf(opt_userpass, "%s:%s", opt_user, opt_pass);
}
opt_algo_variant = get_algo_variant(opt_algo, opt_algo_variant);
opt_av = get_algo_variant(opt_algo, opt_av);
if (!cryptonight_init(opt_algo_variant)) {
if (!cryptonight_init(opt_av)) {
applog(LOG_ERR, "Cryptonight hash self-test failed. This might be caused by bad compiler optimizations.");
proper_exit(1);
}
@ -511,6 +566,12 @@ void show_version_and_exit(void) {
}
const char* get_current_algo_name(void) {
const char *get_current_algo_name(void) {
return algo_names[opt_algo];
}
const char *get_current_variant_name(void)
{
return variant_names[opt_variant + 1];
}

57
options.h

@ -21,43 +21,50 @@
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef __OPTIONS_H__
#define __OPTIONS_H__
#ifndef XMRIG_OPTIONS_H
#define XMRIG_OPTIONS_H
#include <stdbool.h>
#include <stdint.h>
#ifndef ARRAY_SIZE
# define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0]))
#endif
enum mining_algo {
enum Algo {
ALGO_CRYPTONIGHT, /* CryptoNight (Monero) */
ALGO_CRYPTONIGHT_LITE, /* CryptoNight-Lite (AEON) */
};
enum xmr_algo_variant {
XMR_AV0_AUTO,
XMR_AV1_AESNI,
XMR_AV2_AESNI_DOUBLE,
XMR_AV3_SOFT_AES,
XMR_AV4_SOFT_AES_DOUBLE,
XMR_AV_MAX
enum Variant {
VARIANT_AUTO = -1,
VARIANT_0 = 0,
VARIANT_1 = 1,
VARIANT_2 = 2,
VARIANT_MAX
};
#ifndef XMRIG_NO_AEON
enum aeon_algo_variant {
AEON_AV0_AUTO,
AEON_AV1_AESNI,
AEON_AV2_AESNI_DOUBLE,
AEON_AV3_SOFT_AES,
AEON_AV4_SOFT_AES_DOUBLE,
AEON_AV_MAX
enum AlgoVariant {
AV_AUTO, // --av=0 Automatic mode.
AV_SINGLE, // --av=1 Single hash mode
AV_DOUBLE, // --av=2 Double hash mode
AV_SINGLE_SOFT, // --av=3 Single hash mode (Software AES)
AV_DOUBLE_SOFT, // --av=4 Double hash mode (Software AES)
AV_MAX
};
enum Assembly {
ASM_NONE,
ASM_AUTO,
ASM_INTEL,
ASM_RYZEN,
ASM_MAX
};
#endif
extern bool opt_colors;
@ -72,20 +79,24 @@ extern char *opt_userpass;
extern char *opt_user;
extern char *opt_pass;
extern int opt_n_threads;
extern int opt_algo_variant;
extern int opt_retry_pause;
extern int opt_retries;
extern int opt_donate_level;
extern int opt_max_cpu_usage;
extern int64_t opt_affinity;
extern enum mining_algo opt_algo;
extern enum Algo opt_algo;
extern enum Variant opt_variant;
extern enum AlgoVariant opt_av;
extern enum Assembly opt_assembly;
void parse_cmdline(int argc, char *argv[]);
void show_usage_and_exit(int status);
void show_version_and_exit(void);
const char* get_current_algo_name(void);
const char *get_current_algo_name(void);
const char *get_current_variant_name(void);
extern void proper_exit(int reason);
#endif /* __OPTIONS_H__ */
#endif /* XMRIG_OPTIONS_H */

12
persistent_memory.h

@ -21,12 +21,16 @@
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef __PERSISTENT_MEMORY_H__
#define __PERSISTENT_MEMORY_H__
#ifndef XMRIG_PERSISTENT_MEMORY_H
#define XMRIG_PERSISTENT_MEMORY_H
#include <stddef.h>
#include "algo/cryptonight/cryptonight.h"
enum memory_flags {
MEMORY_HUGEPAGES_AVAILABLE = 1,
MEMORY_HUGEPAGES_ENABLED = 2,
@ -44,7 +48,7 @@ extern int persistent_memory_flags;
const char * persistent_memory_allocate();
void persistent_memory_free();
void * persistent_calloc(size_t num, size_t size);
void * create_persistent_ctx(int thr_id);
void create_cryptonight_ctx(struct cryptonight_ctx **ctx, int thr_id);
#endif /* __PERSISTENT_MEMORY_H__ */
#endif /* XMRIG_PERSISTENT_MEMORY_H */

32
stratum.c

@ -40,11 +40,12 @@
# include <netinet/in.h>
#endif
#include "stratum.h"
#include "version.h"
#include "options.h"
#include "stats.h"
#include "stratum.h"
#include "util.h"
#include "utils/applog.h"
#include "version.h"
#ifdef WIN32
@ -73,6 +74,7 @@ static int sockopt_keepalive_cb(void *userdata, curl_socket_t fd, curlsocktype p
static curl_socket_t opensocket_grab_cb(void *clientp, curlsocktype purpose, struct curl_sockaddr *addr);
static int closesocket_cb(void *clientp, curl_socket_t item);
static bool login_decode(struct stratum_ctx *sctx, const json_t *val);
static void extensions_decode(const json_t *val);
static bool job_decode(const json_t *job);
static bool jobj_binary(const json_t *obj, const char *key, void *buf, size_t buflen);
@ -625,6 +627,11 @@ static bool login_decode(struct stratum_ctx *sctx, const json_t *val) {
memcpy(&sctx->id, id, strlen(id));
const char *s = json_string_value(json_object_get(res, "status"));
if (!s) {
// Workaround for xmrig-proxy bug https://github.com/xmrig/xmrig-proxy/commit/dfa1960fe3eeb13f80717b7dbfcc7c6e9f222d89
s = json_string_value(json_object_get(val, "status"));
}
if (!s) {
applog(LOG_ERR, "JSON invalid status");
return false;
@ -635,10 +642,31 @@ static bool login_decode(struct stratum_ctx *sctx, const json_t *val) {
return false;
}
extensions_decode(res);
return true;
}
static void extensions_decode(const json_t *res)
{
json_t *extensions = json_object_get(res, "extensions");
if (!extensions || json_array_size(extensions) == 0) {
return;
}
size_t index;
json_t *value;
json_array_foreach(extensions, index, value) {
const char *s = json_string_value(value);
if (s && strcmp(s, "nicehash")) {
opt_nicehash = true;
}
}
}
/**
* @brief job_decode
* @param sctx

7
stratum.h

@ -21,8 +21,9 @@
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef __STRATUM_H__
#define __STRATUM_H__
#ifndef XMRIG_STRATUM_H
#define XMRIG_STRATUM_H
#include <stdbool.h>
#include <inttypes.h>
@ -75,4 +76,4 @@ bool stratum_handle_method(struct stratum_ctx *sctx, const char *s);
bool stratum_handle_response(char *buf);
bool stratum_keepalived(struct stratum_ctx *sctx);
#endif /* __STRATUM_H__ */
#endif /* XMRIG_STRATUM_H */

4
utils/summary.c

@ -77,10 +77,10 @@ static void print_threads() {
}
if (opt_colors) {
applog_notime(LOG_INFO, CL_LGR " * " CL_WHT "THREADS: " CL_WHT "%d" CL_WHT ", av=%d, %s, donate=%d%%%s", opt_n_threads, opt_algo_variant, get_current_algo_name(), opt_donate_level, extra);
applog_notime(LOG_INFO, CL_LGR " * " CL_WHT "THREADS: " CL_WHT "%d" CL_WHT ", av=%d, %s/%s, donate=%d%%%s", opt_n_threads, opt_av, get_current_algo_name(), get_current_variant_name(), opt_donate_level, extra);
}
else {
applog_notime(LOG_INFO, " * THREADS: %d, av=%d, %s, donate=%d%%%s", opt_n_threads, opt_algo_variant, get_current_algo_name(), opt_donate_level, extra);
applog_notime(LOG_INFO, " * THREADS: %d, av=%d, %s/%s, donate=%d%%%s", opt_n_threads, opt_av, get_current_algo_name(), get_current_variant_name(), opt_donate_level, extra);
}
}

12
version.h

@ -21,20 +21,20 @@
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef __VERSION_H__
#define __VERSION_H__
#ifndef XMRIG_VERSION_H
#define XMRIG_VERSION_H
#define APP_ID "xmrig"
#define APP_NAME "XMRig"
#define APP_DESC "Monero (XMR) CPU miner"
#define APP_VERSION "0.8.3"
#define APP_VERSION "0.9.0-dev"
#define APP_DOMAIN "xmrig.com"
#define APP_SITE "www.xmrig.com"
#define APP_COPYRIGHT "Copyright (C) 2016-2018 xmrig.com"
#define APP_VER_MAJOR 0
#define APP_VER_MINOR 8
#define APP_VER_BUILD 3
#define APP_VER_MINOR 9
#define APP_VER_BUILD 0
#define APP_VER_REV 0
#endif /* __VERSION_H__ */
#endif /* XMRIG_VERSION_H */

8
xmrig.c

@ -260,7 +260,8 @@ static void *miner_thread(void *userdata) {
uint32_t max_nonce;
uint32_t end_nonce = 0xffffffffU / opt_n_threads * (thr_id + 1) - 0x20;
struct cryptonight_ctx *persistentctx = (struct cryptonight_ctx *) create_persistent_ctx(thr_id);
struct cryptonight_ctx *persistentctx[1];
create_cryptonight_ctx(persistentctx, thr_id);
if (cpu_info.total_logical_cpus > 1 && opt_affinity != -1L) {
affine_to_cpu_mask(thr_id, (unsigned long) opt_affinity);
@ -306,7 +307,7 @@ static void *miner_thread(void *userdata) {
gettimeofday(&tv_start, NULL);
/* scan nonces for a proof-of-work hash */
const int rc = scanhash_cryptonight(thr_id, hash, work.blob, work.blob_size, work.target, max_nonce, &hashes_done, persistentctx);
const int rc = scanhash_cryptonight(thr_id, hash, (const uint8_t *) work.blob, work.blob_size, work.target, max_nonce, &hashes_done, persistentctx);
stats_add_hashes(thr_id, &tv_start, hashes_done);
if (!rc) {
@ -335,7 +336,8 @@ static void *miner_thread_double(void *userdata) {
uint32_t max_nonce;
uint32_t end_nonce = 0xffffffffU / opt_n_threads * (thr_id + 1) - 0x20;
struct cryptonight_ctx *persistentctx = (struct cryptonight_ctx *) create_persistent_ctx(thr_id);
struct cryptonight_ctx *persistentctx[2];
create_cryptonight_ctx(persistentctx, thr_id);
if (cpu_info.total_logical_cpus > 1 && opt_affinity != -1L) {
affine_to_cpu_mask(thr_id, (unsigned long) opt_affinity);

Loading…
Cancel
Save