@@ -31,12 +31,6 @@
#include <limits>

constexpr int STAGE1_SIZE = 147253;
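
// Stage 2 can extend the buffer by up to 0xfffff bytes beyond STAGE1_SIZE
// (see astrobwt_dero below); the final term pads STAGE1_SIZE out past a
// 64-byte boundary, presumably to keep the following data cache-line aligned.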
constexpr int ALLOCATION_SIZE = (STAGE1_SIZE + 1048576) + (128 - (STAGE1_SIZE & 63));

constexpr int COUNTING_SORT_BITS = 10;
constexpr int COUNTING_SORT_SIZE = 1 << COUNTING_SORT_BITS;

static bool astrobwtInitialized = false;

#ifdef ASTROBWT_AVX2

@@ -87,353 +81,6 @@ static void Salsa20_XORKeyStream_AVX256(const void* key, void* output, size_t size)
}

#endif
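
// Sort entries are packed 64-bit values: the upper 43 bits cache a big-endian
// prefix of the suffix data and the low 21 bits store the suffix offset
// (offsets stay below 2^21 since STAGE1_SIZE + 0xfffff + 1 < 2097152).
// smaller() resolves most comparisons from the cached bits and only reads v
// on a tie, starting at byte 5 (the cached 43 bits cover bytes 0-4 fully).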

static inline bool smaller(const uint8_t* v, uint64_t a, uint64_t b)
{
	const uint64_t value_a = a >> 21;
	const uint64_t value_b = b >> 21;

	if (value_a < value_b) {
		return true;
	}

	if (value_a > value_b) {
		return false;
	}

	a &= (1 << 21) - 1;
	b &= (1 << 21) - 1;

	if (a == b) {
		return false;
	}

	const uint64_t data_a = bswap_64(*reinterpret_cast<const uint64_t*>(v + a + 5));
	const uint64_t data_b = bswap_64(*reinterpret_cast<const uint64_t*>(v + b + 5));
	return (data_a < data_b);
}
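
// Radix-sorts the entries by their top 2 * COUNTING_SORT_BITS = 20 key bits
// in two counting-sort passes, then runs one insertion-sort sweep to order
// entries that still tie on those bits.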

void sort_indices(uint32_t N, const uint8_t* v, uint64_t* indices, uint64_t* tmp_indices)
{
	uint32_t counters[2][COUNTING_SORT_SIZE] = {};

	{
#define ITER(X) \
		do { \
			const uint64_t k = bswap_64(*reinterpret_cast<const uint64_t*>(v + i + X)); \
			++counters[0][(k >> (64 - COUNTING_SORT_BITS * 2)) & (COUNTING_SORT_SIZE - 1)]; \
			++counters[1][k >> (64 - COUNTING_SORT_BITS)]; \
		} while (0)

		uint32_t i = 0;
		const uint32_t n = N - 15;
		for (; i < n; i += 16) {
			ITER(0); ITER(1); ITER(2); ITER(3); ITER(4); ITER(5); ITER(6); ITER(7);
			ITER(8); ITER(9); ITER(10); ITER(11); ITER(12); ITER(13); ITER(14); ITER(15);
		}
		for (; i < N; ++i) {
			ITER(0);
		}

#undef ITER
	}
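
	// Inclusive prefix sums over both digit histograms; the "- 1" leaves each
	// counter pointing at its bucket's last free slot, since the scatter
	// passes below fill buckets from the back.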
	uint32_t prev[2] = { counters[0][0], counters[1][0] };
	counters[0][0] = prev[0] - 1;
	counters[1][0] = prev[1] - 1;
	for (int i = 1; i < COUNTING_SORT_SIZE; ++i)
	{
		const uint32_t cur[2] = { counters[0][i] + prev[0], counters[1][i] + prev[1] };
		counters[0][i] = cur[0] - 1;
		counters[1][i] = cur[1] - 1;
		prev[0] = cur[0];
		prev[1] = cur[1];
	}
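
	// First scatter pass (lower 10-bit digit): pack the key's top 43 bits and
	// the source offset into each entry while copying into tmp_indices.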

	{
#define ITER(X) \
		do { \
			const uint64_t k = bswap_64(*reinterpret_cast<const uint64_t*>(v + (i - X))); \
			tmp_indices[counters[0][(k >> (64 - COUNTING_SORT_BITS * 2)) & (COUNTING_SORT_SIZE - 1)]--] = (k & (static_cast<uint64_t>(-1) << 21)) | (i - X); \
		} while (0)

		uint32_t i = N;
		for (; i >= 8; i -= 8) {
			ITER(1); ITER(2); ITER(3); ITER(4); ITER(5); ITER(6); ITER(7); ITER(8);
		}
		for (; i > 0; --i) {
			ITER(1);
		}

#undef ITER
	}
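
	// Second, stable scatter pass on the upper 10-bit digit completes the
	// 20-bit radix sort into indices.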

	{
#define ITER(X) \
		do { \
			const uint64_t data = tmp_indices[i - X]; \
			indices[counters[1][data >> (64 - COUNTING_SORT_BITS)]--] = data; \
		} while (0)

		uint32_t i = N;
		for (; i >= 8; i -= 8) {
			ITER(1); ITER(2); ITER(3); ITER(4); ITER(5); ITER(6); ITER(7); ITER(8);
		}
		for (; i > 0; --i) {
			ITER(1);
		}

#undef ITER
	}
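
	// Only 20 of the 43 cached key bits are sorted so far; a final insertion
	// sort fixes up the nearly-sorted sequence, using smaller() to break ties.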

	uint64_t prev_t = indices[0];
	for (uint32_t i = 1; i < N; ++i)
	{
		uint64_t t = indices[i];
		if (smaller(v, t, prev_t))
		{
			const uint64_t t2 = prev_t;
			int j = i - 1;
			do
			{
				indices[j + 1] = prev_t;
				--j;

				if (j < 0) {
					break;
				}

				prev_t = indices[j];
			} while (smaller(v, t, prev_t));
			indices[j + 1] = t;
			t = t2;
		}
		prev_t = t;
	}
}
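
// MSD variant used for the larger stage 2 buffer: one counting-sort pass on
// the top 10 key bits, then each bucket gets its own second-level counting
// sort and insertion-sort fixup, presumably for better cache behaviour than
// the two full-size passes above.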

void sort_indices2(uint32_t N, const uint8_t* v, uint64_t* indices, uint64_t* tmp_indices)
{
	alignas(16) uint32_t counters[1 << COUNTING_SORT_BITS] = {};
	alignas(16) uint32_t counters2[1 << COUNTING_SORT_BITS];

	{
#define ITER(X) { \
		const uint64_t k = bswap_64(*reinterpret_cast<const uint64_t*>(v + i + X)); \
		++counters[k >> (64 - COUNTING_SORT_BITS)]; \
	}

		uint32_t i = 0;
		const uint32_t n = (N / 32) * 32;
		for (; i < n; i += 32) {
			ITER(0); ITER(1); ITER(2); ITER(3); ITER(4); ITER(5); ITER(6); ITER(7);
			ITER(8); ITER(9); ITER(10); ITER(11); ITER(12); ITER(13); ITER(14); ITER(15);
			ITER(16); ITER(17); ITER(18); ITER(19); ITER(20); ITER(21); ITER(22); ITER(23);
			ITER(24); ITER(25); ITER(26); ITER(27); ITER(28); ITER(29); ITER(30); ITER(31);
		}
		for (; i < N; ++i) {
			ITER(0);
		}

#undef ITER
	}
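
	// Prefix sums, starting prev at -1 so each counter ends up at the index
	// of its bucket's last slot; counters2 keeps a copy to mark the bucket
	// boundaries for the per-bucket passes below.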

	uint32_t prev = static_cast<uint32_t>(-1);
	for (uint32_t i = 0; i < (1 << COUNTING_SORT_BITS); i += 16)
	{
#define ITER(X) { \
		const uint32_t cur = counters[i + X] + prev; \
		counters[i + X] = cur; \
		counters2[i + X] = cur; \
		prev = cur; \
	}
		ITER(0); ITER(1); ITER(2); ITER(3); ITER(4); ITER(5); ITER(6); ITER(7);
		ITER(8); ITER(9); ITER(10); ITER(11); ITER(12); ITER(13); ITER(14); ITER(15);
#undef ITER
	}
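
	// Single scatter pass on the top 10 bits: entries land in indices grouped
	// by bucket, each bucket still unsorted internally.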

	{
#define ITER(X) \
		do { \
			const uint64_t k = bswap_64(*reinterpret_cast<const uint64_t*>(v + (i - X))); \
			indices[counters[k >> (64 - COUNTING_SORT_BITS)]--] = (k & (static_cast<uint64_t>(-1) << 21)) | (i - X); \
		} while (0)

		uint32_t i = N;
		for (; i >= 8; i -= 8) {
			ITER(1); ITER(2); ITER(3); ITER(4); ITER(5); ITER(6); ITER(7); ITER(8);
		}
		for (; i > 0; --i) {
			ITER(1);
		}

#undef ITER
	}
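
	// Walk the bucket boundaries recorded in counters2; every bucket with more
	// than one entry gets a second-level counting sort on the next 10 key bits
	// (staged through tmp_indices), then an insertion-sort sweep.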

	uint32_t prev_i = 0;
	for (uint32_t i0 = 0; i0 < (1 << COUNTING_SORT_BITS); ++i0) {
		const uint32_t i = counters2[i0] + 1;
		const uint32_t n = i - prev_i;
		if (n > 1) {
			memset(counters, 0, sizeof(uint32_t) * (1 << COUNTING_SORT_BITS));

			const uint32_t n8 = (n / 8) * 8;
			uint32_t j = 0;

#define ITER(X) { \
			const uint64_t k = indices[prev_i + j + X]; \
			++counters[(k >> (64 - COUNTING_SORT_BITS * 2)) & ((1 << COUNTING_SORT_BITS) - 1)]; \
			tmp_indices[j + X] = k; \
		}
			for (; j < n8; j += 8) {
				ITER(0); ITER(1); ITER(2); ITER(3); ITER(4); ITER(5); ITER(6); ITER(7);
			}
			for (; j < n; ++j) {
				ITER(0);
			}
#undef ITER
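
			// Bucket-local prefix sums over the second 10-bit digit.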

			uint32_t prev = static_cast<uint32_t>(-1);
			for (uint32_t j = 0; j < (1 << COUNTING_SORT_BITS); j += 32)
			{
#define ITER(X) { \
				const uint32_t cur = counters[j + X] + prev; \
				counters[j + X] = cur; \
				prev = cur; \
			}
				ITER(0); ITER(1); ITER(2); ITER(3); ITER(4); ITER(5); ITER(6); ITER(7);
				ITER(8); ITER(9); ITER(10); ITER(11); ITER(12); ITER(13); ITER(14); ITER(15);
				ITER(16); ITER(17); ITER(18); ITER(19); ITER(20); ITER(21); ITER(22); ITER(23);
				ITER(24); ITER(25); ITER(26); ITER(27); ITER(28); ITER(29); ITER(30); ITER(31);
#undef ITER
			}
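
			// Scatter the staged entries back into this bucket's slice of
			// indices, now ordered on 20 key bits.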

#define ITER(X) { \
			const uint64_t k = tmp_indices[j - X]; \
			const uint32_t index = counters[(k >> (64 - COUNTING_SORT_BITS * 2)) & ((1 << COUNTING_SORT_BITS) - 1)]--; \
			indices[prev_i + index] = k; \
		}
			for (j = n; j >= 8; j -= 8) {
				ITER(1); ITER(2); ITER(3); ITER(4); ITER(5); ITER(6); ITER(7); ITER(8);
			}
			for (; j > 0; --j) {
				ITER(1);
			}
#undef ITER
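
			// Insertion-sort the bucket; smaller() breaks remaining ties by
			// reading the suffix data directly.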

			uint64_t prev_t = indices[prev_i];
			for (uint64_t* p = indices + prev_i + 1, *e = indices + i; p != e; ++p)
			{
				uint64_t t = *p;
				if (smaller(v, t, prev_t))
				{
					const uint64_t t2 = prev_t;
					uint64_t* p1 = p;
					do
					{
						*p1 = prev_t;
						--p1;

						if (p1 <= indices + prev_i) {
							break;
						}

						prev_t = *(p1 - 1);
					} while (smaller(v, t, prev_t));
					*p1 = t;
					t = t2;
				}
				prev_t = t;
			}
		}
		prev_i = i;
	}
}
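
// AstroBWT (Dero) pipeline: SHA3-256 the input into a Salsa20 key, expand it
// to a STAGE1_SIZE keystream, BWT that buffer via suffix sorting, re-hash to
// derive a new key and a data-dependent stage 2 size, repeat, then SHA3-256
// the stage 2 BWT into output_hash. indices/tmp_indices hold 8-byte entries,
// which is why tmp_indices sits at offset ALLOCATION_SIZE * 9.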

bool xmrig::astrobwt::astrobwt_dero(const void* input_data, uint32_t input_size, void* scratchpad, uint8_t* output_hash, int stage2_max_size, bool avx2)
{
	alignas(8) uint8_t key[32];
	uint8_t* scratchpad_ptr = (uint8_t*)(scratchpad) + 64;
	uint8_t* stage1_output = scratchpad_ptr;
	uint8_t* stage2_output = scratchpad_ptr;
	uint64_t* indices = (uint64_t*)(scratchpad_ptr + ALLOCATION_SIZE);
	uint64_t* tmp_indices = (uint64_t*)(scratchpad_ptr + ALLOCATION_SIZE * 9);
	uint8_t* stage1_result = (uint8_t*)(tmp_indices);
	uint8_t* stage2_result = (uint8_t*)(tmp_indices);

#ifdef ASTROBWT_AVX2
	if (hasAVX2 && avx2) {
		SHA3_256_AVX2_ASM(input_data, input_size, key);
		Salsa20_XORKeyStream_AVX256(key, stage1_output, STAGE1_SIZE);
	}
	else
#endif
	{
		sha3_HashBuffer(256, SHA3_FLAGS_NONE, input_data, input_size, key, sizeof(key));
		Salsa20_XORKeyStream(key, stage1_output, STAGE1_SIZE);
	}
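
	// Suffix-sort the stage 1 buffer, then emit its BWT: for each suffix in
	// sorted order, output the byte preceding it (tmp = stage1_output - 1;
	// the 64-byte base offset keeps the i == 0 read in bounds).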

	sort_indices(STAGE1_SIZE + 1, stage1_output, indices, tmp_indices);

	{
		const uint8_t* tmp = stage1_output - 1;
		for (int i = 0; i <= STAGE1_SIZE; ++i) {
			stage1_result[i] = tmp[indices[i] & ((1 << 21) - 1)];
		}
	}
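
	// Re-key from the stage 1 result; the low 20 bits of the new key pick a
	// data-dependent stage 2 size, bounded by the caller via stage2_max_size.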

#ifdef ASTROBWT_AVX2
	if (hasAVX2 && avx2)
		SHA3_256_AVX2_ASM(stage1_result, STAGE1_SIZE + 1, key);
	else
#endif
		sha3_HashBuffer(256, SHA3_FLAGS_NONE, stage1_result, STAGE1_SIZE + 1, key, sizeof(key));

	const int stage2_size = STAGE1_SIZE + (*(uint32_t*)(key) & 0xfffff);
	if (stage2_size > stage2_max_size) {
		return false;
	}

#ifdef ASTROBWT_AVX2
	if (hasAVX2 && avx2) {
		Salsa20_XORKeyStream_AVX256(key, stage2_output, stage2_size);
	}
	else
#endif
	{
		Salsa20_XORKeyStream(key, stage2_output, stage2_size);
	}

	sort_indices2(stage2_size + 1, stage2_output, indices, tmp_indices);
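
	// Emit the stage 2 BWT the same way, with the copy loop unrolled 4x.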

	{
		const uint8_t* tmp = stage2_output - 1;
		int i = 0;
		const int n = ((stage2_size + 1) / 4) * 4;

		for (; i < n; i += 4)
		{
			stage2_result[i + 0] = tmp[indices[i + 0] & ((1 << 21) - 1)];
			stage2_result[i + 1] = tmp[indices[i + 1] & ((1 << 21) - 1)];
			stage2_result[i + 2] = tmp[indices[i + 2] & ((1 << 21) - 1)];
			stage2_result[i + 3] = tmp[indices[i + 3] & ((1 << 21) - 1)];
		}

		for (; i <= stage2_size; ++i) {
			stage2_result[i] = tmp[indices[i] & ((1 << 21) - 1)];
		}
	}

#ifdef ASTROBWT_AVX2
	if (hasAVX2 && avx2)
		SHA3_256_AVX2_ASM(stage2_result, stage2_size + 1, output_hash);
	else
#endif
		sha3_HashBuffer(256, SHA3_FLAGS_NONE, stage2_result, stage2_size + 1, output_hash, 32);

	return true;
}

bool xmrig::astrobwt::astrobwt_dero_v2(const void* input_data, uint32_t input_size, void* scratchpad, uint8_t* output_hash)
{
	constexpr size_t N = 9973;

@@ -485,13 +132,6 @@ void xmrig::astrobwt::init()
}

template<>
void xmrig::astrobwt::single_hash<xmrig::Algorithm::ASTROBWT_DERO>(const uint8_t* input, size_t size, uint8_t* output, cryptonight_ctx** ctx, uint64_t)
{
	astrobwt_dero(input, static_cast<uint32_t>(size), ctx[0]->memory, output, std::numeric_limits<int>::max(), true);
}

template<>
void xmrig::astrobwt::single_hash<xmrig::Algorithm::ASTROBWT_DERO_2>(const uint8_t* input, size_t size, uint8_t* output, cryptonight_ctx** ctx, uint64_t)
{