Browse Source

RandomX: added performance profiler (for developers)

Also optimized Blake2b SSE4.1 code size to avoid code cache pollution.
pull/1830/head
SChernykh 4 years ago
parent
commit
a05393727c
  1. 1
      CMakeLists.txt
  2. 12
      src/base/base.cmake
  3. 10
      src/base/io/log/Tags.cpp
  4. 4
      src/base/io/log/Tags.h
  5. 100
      src/base/tools/Profiler.cpp
  6. 132
      src/base/tools/Profiler.h
  7. 43
      src/core/Miner.cpp
  8. 3
      src/crypto/randomx/aes_hash.cpp
  9. 2
      src/crypto/randomx/blake2/blake2.h
  10. 402
      src/crypto/randomx/blake2/blake2b-load-sse41.h
  11. 14
      src/crypto/randomx/blake2/blake2b-round.h
  12. 98
      src/crypto/randomx/blake2/blake2b.c
  13. 2
      src/crypto/randomx/blake2_generator.cpp
  14. 3
      src/crypto/randomx/jit_compiler_x86.cpp
  15. 18
      src/crypto/randomx/randomx.cpp
  16. 10
      src/crypto/randomx/virtual_machine.cpp
  17. 8
      src/crypto/randomx/virtual_machine.hpp
  18. 5
      src/crypto/randomx/vm_compiled.cpp
  19. 4
      src/crypto/rx/RxConfig.cpp

1
CMakeLists.txt

@ -23,6 +23,7 @@ option(WITH_NVML "Enable NVML (NVIDIA Management Library) support (on
option(WITH_ADL "Enable ADL (AMD Display Library) or sysfs support (only if OpenCL backend enabled)" ON)
option(WITH_STRICT_CACHE "Enable strict checks for OpenCL cache" ON)
option(WITH_INTERLEAVE_DEBUG_LOG "Enable debug log for threads interleave" OFF)
option(WITH_PROFILING "Enable profiling for developers" OFF)
option(BUILD_STATIC "Build static binary" OFF)
option(ARM_TARGET "Force use specific ARM target 8 or 7" 0)

12
src/base/base.cmake

@ -222,3 +222,15 @@ if (WITH_KAWPOW)
src/base/net/stratum/EthStratumClient.cpp
)
endif()
if (WITH_PROFILING)
add_definitions(/DXMRIG_FEATURE_PROFILING)
list(APPEND HEADERS_BASE
src/base/tools/Profiler.h
)
list(APPEND SOURCES_BASE
src/base/tools/Profiler.cpp
)
endif()

10
src/base/io/log/Tags.cpp

@ -101,3 +101,13 @@ const char *xmrig::Tags::opencl()
return tag;
}
#endif
#ifdef XMRIG_FEATURE_PROFILING
const char* xmrig::Tags::profiler()
{
static const char* tag = CYAN_BG_BOLD(WHITE_BOLD_S " profile ");
return tag;
}
#endif

4
src/base/io/log/Tags.h

@ -53,6 +53,10 @@ public:
# ifdef XMRIG_FEATURE_OPENCL
static const char *opencl();
# endif
# ifdef XMRIG_FEATURE_PROFILING
static const char* profiler();
# endif
};

100
src/base/tools/Profiler.cpp

@ -0,0 +1,100 @@
/* XMRig
* Copyright 2018-2020 SChernykh <https://github.com/SChernykh>
* Copyright 2016-2020 XMRig <https://github.com/xmrig>, <support@xmrig.com>
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#include "base/tools/Profiler.h"
#include "base/io/log/Log.h"
#include "base/io/log/Tags.h"
#include <sstream>
#include <thread>
#include <chrono>
#include <algorithm>
#ifdef XMRIG_FEATURE_PROFILING
ProfileScopeData* ProfileScopeData::s_data[MAX_DATA_COUNT] = {};
volatile long ProfileScopeData::s_dataCount = 0;
double ProfileScopeData::s_tscSpeed = 0.0;
#ifndef NOINLINE
#ifdef __GNUC__
#define NOINLINE __attribute__ ((noinline))
#elif _MSC_VER
#define NOINLINE __declspec(noinline)
#else
#define NOINLINE
#endif
#endif
static std::string get_thread_id()
{
std::stringstream ss;
ss << std::this_thread::get_id();
std::string s = ss.str();
if (s.length() > ProfileScopeData::MAX_THREAD_ID_LENGTH) {
s.resize(ProfileScopeData::MAX_THREAD_ID_LENGTH);
}
return s;
}
NOINLINE void ProfileScopeData::Register(ProfileScopeData* data)
{
#ifdef _MSC_VER
const long id = _InterlockedIncrement(&s_dataCount) - 1;
#else
const long id = __sync_fetch_and_add(&s_dataCount, 1);
#endif
if (static_cast<unsigned long>(id) < MAX_DATA_COUNT) {
s_data[id] = data;
const std::string s = get_thread_id();
memcpy(data->m_threadId, s.c_str(), s.length() + 1);
}
}
NOINLINE void ProfileScopeData::Init()
{
using namespace std::chrono;
const uint64_t t1 = static_cast<uint64_t>(time_point_cast<nanoseconds>(high_resolution_clock::now()).time_since_epoch().count());
const uint64_t count1 = ReadTSC();
for (;;)
{
const uint64_t t2 = static_cast<uint64_t>(time_point_cast<nanoseconds>(high_resolution_clock::now()).time_since_epoch().count());
const uint64_t count2 = ReadTSC();
if (t2 - t1 > 1000000000) {
s_tscSpeed = (count2 - count1) * 1e9 / (t2 - t1);
LOG_INFO("%s TSC speed = %.3f GHz", xmrig::Tags::profiler(), s_tscSpeed / 1e9);
return;
}
}
}
#endif /* XMRIG_FEATURE_PROFILING */

132
src/base/tools/Profiler.h

@ -0,0 +1,132 @@
/* XMRig
* Copyright 2018-2020 SChernykh <https://github.com/SChernykh>
* Copyright 2016-2020 XMRig <https://github.com/xmrig>, <support@xmrig.com>
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef XMRIG_PROFILER_H
#define XMRIG_PROFILER_H
#ifndef FORCE_INLINE
#if defined(_MSC_VER)
#define FORCE_INLINE __forceinline
#elif defined(__GNUC__)
#define FORCE_INLINE __attribute__((always_inline)) inline
#elif defined(__clang__)
#define FORCE_INLINE __inline__
#else
#define FORCE_INLINE
#endif
#endif
#ifdef XMRIG_FEATURE_PROFILING
#include <cstdint>
#include <type_traits>
#if defined(_MSC_VER)
#include <intrin.h>
#endif
static FORCE_INLINE uint64_t ReadTSC()
{
#ifdef _MSC_VER
return __rdtsc();
#else
uint32_t hi, lo;
__asm__ __volatile__("rdtsc" : "=a"(lo), "=d"(hi));
return (((uint64_t)hi) << 32) | lo;
#endif
}
struct ProfileScopeData
{
const char* m_name;
uint64_t m_totalCycles;
uint32_t m_totalSamples;
enum
{
MAX_THREAD_ID_LENGTH = 11,
MAX_SAMPLE_COUNT = 128,
MAX_DATA_COUNT = 1024
};
char m_threadId[MAX_THREAD_ID_LENGTH + 1];
static ProfileScopeData* s_data[MAX_DATA_COUNT];
static volatile long s_dataCount;
static double s_tscSpeed;
static void Register(ProfileScopeData* data);
static void Init();
};
static_assert(std::is_trivial<ProfileScopeData>::value, "ProfileScopeData must be a trivial struct");
static_assert(sizeof(ProfileScopeData) <= 32, "ProfileScopeData struct is too big");
class ProfileScope
{
public:
FORCE_INLINE ProfileScope(ProfileScopeData& data)
: m_data(data)
{
if (m_data.m_totalCycles == 0) {
ProfileScopeData::Register(&data);
}
m_startCounter = ReadTSC();
}
FORCE_INLINE ~ProfileScope()
{
m_data.m_totalCycles += ReadTSC() - m_startCounter;
++m_data.m_totalSamples;
}
private:
ProfileScopeData& m_data;
uint64_t m_startCounter;
};
#define PROFILE_SCOPE(x) static thread_local ProfileScopeData x##_data{#x}; ProfileScope x(x##_data);
#else /* XMRIG_FEATURE_PROFILING */
#define PROFILE_SCOPE(x)
#endif /* XMRIG_FEATURE_PROFILING */
#include "crypto/randomx/blake2/blake2.h"
struct rx_blake2b_wrapper
{
FORCE_INLINE static void run(void* out, size_t outlen, const void* in, size_t inlen)
{
PROFILE_SCOPE(RandomX_Blake2b);
rx_blake2b(out, outlen, in, inlen);
}
};
#endif /* XMRIG_PROFILER_H */

43
src/core/Miner.cpp

@ -38,6 +38,7 @@
#include "base/kernel/Platform.h"
#include "base/net/stratum/Job.h"
#include "base/tools/Object.h"
#include "base/tools/Profiler.h"
#include "base/tools/Timer.h"
#include "core/config/Config.h"
#include "core/Controller.h"
@ -267,6 +268,44 @@ public:
h = "MH/s";
}
# ifdef XMRIG_FEATURE_PROFILING
ProfileScopeData* data[ProfileScopeData::MAX_DATA_COUNT];
const uint32_t n = std::min<uint32_t>(ProfileScopeData::s_dataCount, ProfileScopeData::MAX_DATA_COUNT);
memcpy(data, ProfileScopeData::s_data, n * sizeof(ProfileScopeData*));
std::sort(data, data + n, [](ProfileScopeData* a, ProfileScopeData* b) {
return strcmp(a->m_threadId, b->m_threadId) < 0;
});
for (uint32_t i = 0; i < n;)
{
uint32_t n1 = i;
while ((n1 < n) && (strcmp(data[i]->m_threadId, data[n1]->m_threadId) == 0)) {
++n1;
}
std::sort(data + i, data + n1, [](ProfileScopeData* a, ProfileScopeData* b) {
return a->m_totalCycles > b->m_totalCycles;
});
for (uint32_t j = i; j < n1; ++j) {
ProfileScopeData* p = data[j];
LOG_INFO("%s Thread %6s | %-30s | %7.3f%% | %9.0f ns",
Tags::profiler(),
p->m_threadId,
p->m_name,
p->m_totalCycles * 100.0 / data[i]->m_totalCycles,
p->m_totalCycles / p->m_totalSamples * 1e9 / ProfileScopeData::s_tscSpeed
);
}
LOG_INFO("%s --------------|--------------------------------|----------|-------------", Tags::profiler());
i = n1;
}
# endif
LOG_INFO("%s " WHITE_BOLD("speed") " 10s/60s/15m " CYAN_BOLD("%s") CYAN(" %s %s ") CYAN_BOLD("%s") " max " CYAN_BOLD("%s %s"),
Tags::miner(),
Hashrate::format(speed[0] * scale, num, sizeof(num) / 4),
@ -311,6 +350,10 @@ xmrig::Miner::Miner(Controller *controller)
Platform::setThreadPriority(std::min(priority + 1, 5));
}
# ifdef XMRIG_FEATURE_PROFILING
ProfileScopeData::Init();
# endif
# ifdef XMRIG_ALGO_RANDOMX
Rx::init(this);
# endif

3
src/crypto/randomx/aes_hash.cpp

@ -28,6 +28,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "crypto/randomx/soft_aes.h"
#include "crypto/randomx/randomx.h"
#include "base/tools/Profiler.h"
#define AES_HASH_1R_STATE0 0xd7983aad, 0xcc82db47, 0x9fa856de, 0x92b52c0d
#define AES_HASH_1R_STATE1 0xace78057, 0xf59e125a, 0x15c7b798, 0x338d996e
@ -215,6 +216,8 @@ template void fillAes4Rx4<false>(void *state, size_t outputSize, void *buffer);
template<bool softAes>
void hashAndFillAes1Rx4(void *scratchpad, size_t scratchpadSize, void *hash, void* fill_state) {
PROFILE_SCOPE(RandomX_AES);
uint8_t* scratchpadPtr = (uint8_t*)scratchpad;
const uint8_t* scratchpadEnd = scratchpadPtr + scratchpadSize;

2
src/crypto/randomx/blake2/blake2.h

@ -92,7 +92,7 @@ extern "C" {
int rx_blake2b_final(blake2b_state *S, void *out, size_t outlen);
/* Simple API */
int rx_blake2b(void *out, size_t outlen, const void *in, size_t inlen, const void *key, size_t keylen);
int rx_blake2b(void *out, size_t outlen, const void *in, size_t inlen);
/* Argon2 Team - Begin Code */
int rxa2_blake2b_long(void *out, size_t outlen, const void *in, size_t inlen);

402
src/crypto/randomx/blake2/blake2b-load-sse41.h

@ -1,402 +0,0 @@
/*
BLAKE2 reference source code package - optimized C implementations
Copyright 2012, Samuel Neves <sneves@dei.uc.pt>. You may use this under the
terms of the CC0, the OpenSSL Licence, or the Apache Public License 2.0, at
your option. The terms of these licenses can be found at:
- CC0 1.0 Universal : http://creativecommons.org/publicdomain/zero/1.0
- OpenSSL license : https://www.openssl.org/source/license.html
- Apache 2.0 : http://www.apache.org/licenses/LICENSE-2.0
More information about the BLAKE2 hash function can be found at
https://blake2.net.
*/
#ifndef BLAKE2B_LOAD_SSE41_H
#define BLAKE2B_LOAD_SSE41_H
#define LOAD_MSG_0_1(b0, b1) \
do \
{ \
b0 = _mm_unpacklo_epi64(m0, m1); \
b1 = _mm_unpacklo_epi64(m2, m3); \
} while(0)
#define LOAD_MSG_0_2(b0, b1) \
do \
{ \
b0 = _mm_unpackhi_epi64(m0, m1); \
b1 = _mm_unpackhi_epi64(m2, m3); \
} while(0)
#define LOAD_MSG_0_3(b0, b1) \
do \
{ \
b0 = _mm_unpacklo_epi64(m4, m5); \
b1 = _mm_unpacklo_epi64(m6, m7); \
} while(0)
#define LOAD_MSG_0_4(b0, b1) \
do \
{ \
b0 = _mm_unpackhi_epi64(m4, m5); \
b1 = _mm_unpackhi_epi64(m6, m7); \
} while(0)
#define LOAD_MSG_1_1(b0, b1) \
do \
{ \
b0 = _mm_unpacklo_epi64(m7, m2); \
b1 = _mm_unpackhi_epi64(m4, m6); \
} while(0)
#define LOAD_MSG_1_2(b0, b1) \
do \
{ \
b0 = _mm_unpacklo_epi64(m5, m4); \
b1 = _mm_alignr_epi8(m3, m7, 8); \
} while(0)
#define LOAD_MSG_1_3(b0, b1) \
do \
{ \
b0 = _mm_shuffle_epi32(m0, _MM_SHUFFLE(1,0,3,2)); \
b1 = _mm_unpackhi_epi64(m5, m2); \
} while(0)
#define LOAD_MSG_1_4(b0, b1) \
do \
{ \
b0 = _mm_unpacklo_epi64(m6, m1); \
b1 = _mm_unpackhi_epi64(m3, m1); \
} while(0)
#define LOAD_MSG_2_1(b0, b1) \
do \
{ \
b0 = _mm_alignr_epi8(m6, m5, 8); \
b1 = _mm_unpackhi_epi64(m2, m7); \
} while(0)
#define LOAD_MSG_2_2(b0, b1) \
do \
{ \
b0 = _mm_unpacklo_epi64(m4, m0); \
b1 = _mm_blend_epi16(m1, m6, 0xF0); \
} while(0)
#define LOAD_MSG_2_3(b0, b1) \
do \
{ \
b0 = _mm_blend_epi16(m5, m1, 0xF0); \
b1 = _mm_unpackhi_epi64(m3, m4); \
} while(0)
#define LOAD_MSG_2_4(b0, b1) \
do \
{ \
b0 = _mm_unpacklo_epi64(m7, m3); \
b1 = _mm_alignr_epi8(m2, m0, 8); \
} while(0)
#define LOAD_MSG_3_1(b0, b1) \
do \
{ \
b0 = _mm_unpackhi_epi64(m3, m1); \
b1 = _mm_unpackhi_epi64(m6, m5); \
} while(0)
#define LOAD_MSG_3_2(b0, b1) \
do \
{ \
b0 = _mm_unpackhi_epi64(m4, m0); \
b1 = _mm_unpacklo_epi64(m6, m7); \
} while(0)
#define LOAD_MSG_3_3(b0, b1) \
do \
{ \
b0 = _mm_blend_epi16(m1, m2, 0xF0); \
b1 = _mm_blend_epi16(m2, m7, 0xF0); \
} while(0)
#define LOAD_MSG_3_4(b0, b1) \
do \
{ \
b0 = _mm_unpacklo_epi64(m3, m5); \
b1 = _mm_unpacklo_epi64(m0, m4); \
} while(0)
#define LOAD_MSG_4_1(b0, b1) \
do \
{ \
b0 = _mm_unpackhi_epi64(m4, m2); \
b1 = _mm_unpacklo_epi64(m1, m5); \
} while(0)
#define LOAD_MSG_4_2(b0, b1) \
do \
{ \
b0 = _mm_blend_epi16(m0, m3, 0xF0); \
b1 = _mm_blend_epi16(m2, m7, 0xF0); \
} while(0)
#define LOAD_MSG_4_3(b0, b1) \
do \
{ \
b0 = _mm_blend_epi16(m7, m5, 0xF0); \
b1 = _mm_blend_epi16(m3, m1, 0xF0); \
} while(0)
#define LOAD_MSG_4_4(b0, b1) \
do \
{ \
b0 = _mm_alignr_epi8(m6, m0, 8); \
b1 = _mm_blend_epi16(m4, m6, 0xF0); \
} while(0)
#define LOAD_MSG_5_1(b0, b1) \
do \
{ \
b0 = _mm_unpacklo_epi64(m1, m3); \
b1 = _mm_unpacklo_epi64(m0, m4); \
} while(0)
#define LOAD_MSG_5_2(b0, b1) \
do \
{ \
b0 = _mm_unpacklo_epi64(m6, m5); \
b1 = _mm_unpackhi_epi64(m5, m1); \
} while(0)
#define LOAD_MSG_5_3(b0, b1) \
do \
{ \
b0 = _mm_blend_epi16(m2, m3, 0xF0); \
b1 = _mm_unpackhi_epi64(m7, m0); \
} while(0)
#define LOAD_MSG_5_4(b0, b1) \
do \
{ \
b0 = _mm_unpackhi_epi64(m6, m2); \
b1 = _mm_blend_epi16(m7, m4, 0xF0); \
} while(0)
#define LOAD_MSG_6_1(b0, b1) \
do \
{ \
b0 = _mm_blend_epi16(m6, m0, 0xF0); \
b1 = _mm_unpacklo_epi64(m7, m2); \
} while(0)
#define LOAD_MSG_6_2(b0, b1) \
do \
{ \
b0 = _mm_unpackhi_epi64(m2, m7); \
b1 = _mm_alignr_epi8(m5, m6, 8); \
} while(0)
#define LOAD_MSG_6_3(b0, b1) \
do \
{ \
b0 = _mm_unpacklo_epi64(m0, m3); \
b1 = _mm_shuffle_epi32(m4, _MM_SHUFFLE(1,0,3,2)); \
} while(0)
#define LOAD_MSG_6_4(b0, b1) \
do \
{ \
b0 = _mm_unpackhi_epi64(m3, m1); \
b1 = _mm_blend_epi16(m1, m5, 0xF0); \
} while(0)
#define LOAD_MSG_7_1(b0, b1) \
do \
{ \
b0 = _mm_unpackhi_epi64(m6, m3); \
b1 = _mm_blend_epi16(m6, m1, 0xF0); \
} while(0)
#define LOAD_MSG_7_2(b0, b1) \
do \
{ \
b0 = _mm_alignr_epi8(m7, m5, 8); \
b1 = _mm_unpackhi_epi64(m0, m4); \
} while(0)
#define LOAD_MSG_7_3(b0, b1) \
do \
{ \
b0 = _mm_unpackhi_epi64(m2, m7); \
b1 = _mm_unpacklo_epi64(m4, m1); \
} while(0)
#define LOAD_MSG_7_4(b0, b1) \
do \
{ \
b0 = _mm_unpacklo_epi64(m0, m2); \
b1 = _mm_unpacklo_epi64(m3, m5); \
} while(0)
#define LOAD_MSG_8_1(b0, b1) \
do \
{ \
b0 = _mm_unpacklo_epi64(m3, m7); \
b1 = _mm_alignr_epi8(m0, m5, 8); \
} while(0)
#define LOAD_MSG_8_2(b0, b1) \
do \
{ \
b0 = _mm_unpackhi_epi64(m7, m4); \
b1 = _mm_alignr_epi8(m4, m1, 8); \
} while(0)
#define LOAD_MSG_8_3(b0, b1) \
do \
{ \
b0 = m6; \
b1 = _mm_alignr_epi8(m5, m0, 8); \
} while(0)
#define LOAD_MSG_8_4(b0, b1) \
do \
{ \
b0 = _mm_blend_epi16(m1, m3, 0xF0); \
b1 = m2; \
} while(0)
#define LOAD_MSG_9_1(b0, b1) \
do \
{ \
b0 = _mm_unpacklo_epi64(m5, m4); \
b1 = _mm_unpackhi_epi64(m3, m0); \
} while(0)
#define LOAD_MSG_9_2(b0, b1) \
do \
{ \
b0 = _mm_unpacklo_epi64(m1, m2); \
b1 = _mm_blend_epi16(m3, m2, 0xF0); \
} while(0)
#define LOAD_MSG_9_3(b0, b1) \
do \
{ \
b0 = _mm_unpackhi_epi64(m7, m4); \
b1 = _mm_unpackhi_epi64(m1, m6); \
} while(0)
#define LOAD_MSG_9_4(b0, b1) \
do \
{ \
b0 = _mm_alignr_epi8(m7, m5, 8); \
b1 = _mm_unpacklo_epi64(m6, m0); \
} while(0)
#define LOAD_MSG_10_1(b0, b1) \
do \
{ \
b0 = _mm_unpacklo_epi64(m0, m1); \
b1 = _mm_unpacklo_epi64(m2, m3); \
} while(0)
#define LOAD_MSG_10_2(b0, b1) \
do \
{ \
b0 = _mm_unpackhi_epi64(m0, m1); \
b1 = _mm_unpackhi_epi64(m2, m3); \
} while(0)
#define LOAD_MSG_10_3(b0, b1) \
do \
{ \
b0 = _mm_unpacklo_epi64(m4, m5); \
b1 = _mm_unpacklo_epi64(m6, m7); \
} while(0)
#define LOAD_MSG_10_4(b0, b1) \
do \
{ \
b0 = _mm_unpackhi_epi64(m4, m5); \
b1 = _mm_unpackhi_epi64(m6, m7); \
} while(0)
#define LOAD_MSG_11_1(b0, b1) \
do \
{ \
b0 = _mm_unpacklo_epi64(m7, m2); \
b1 = _mm_unpackhi_epi64(m4, m6); \
} while(0)
#define LOAD_MSG_11_2(b0, b1) \
do \
{ \
b0 = _mm_unpacklo_epi64(m5, m4); \
b1 = _mm_alignr_epi8(m3, m7, 8); \
} while(0)
#define LOAD_MSG_11_3(b0, b1) \
do \
{ \
b0 = _mm_shuffle_epi32(m0, _MM_SHUFFLE(1,0,3,2)); \
b1 = _mm_unpackhi_epi64(m5, m2); \
} while(0)
#define LOAD_MSG_11_4(b0, b1) \
do \
{ \
b0 = _mm_unpacklo_epi64(m6, m1); \
b1 = _mm_unpackhi_epi64(m3, m1); \
} while(0)
#endif

14
src/crypto/randomx/blake2/blake2b-round.h

@ -102,17 +102,21 @@
row4l = t1; \
row4h = t0;
#include "blake2b-load-sse41.h"
#define LOAD_MSG(r, i, b0, b1) \
do { \
b0 = _mm_set_epi64x(m[blake2b_sigma_sse41[r][i * 4 + 1]], m[blake2b_sigma_sse41[r][i * 4 + 0]]); \
b1 = _mm_set_epi64x(m[blake2b_sigma_sse41[r][i * 4 + 3]], m[blake2b_sigma_sse41[r][i * 4 + 2]]); \
} while(0)
#define ROUND(r) \
LOAD_MSG_ ##r ##_1(b0, b1); \
LOAD_MSG(r, 0, b0, b1); \
G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \
LOAD_MSG_ ##r ##_2(b0, b1); \
LOAD_MSG(r, 1, b0, b1); \
G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \
DIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h); \
LOAD_MSG_ ##r ##_3(b0, b1); \
LOAD_MSG(r, 2, b0, b1); \
G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \
LOAD_MSG_ ##r ##_4(b0, b1); \
LOAD_MSG(r, 3, b0, b1); \
G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \
UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h);

98
src/crypto/randomx/blake2/blake2b.c

@ -56,6 +56,23 @@ static const uint64_t blake2b_IV[8] = {
UINT64_C(0x510e527fade682d1), UINT64_C(0x9b05688c2b3e6c1f),
UINT64_C(0x1f83d9abfb41bd6b), UINT64_C(0x5be0cd19137e2179) };
#if defined(_M_X64) || defined(__x86_64__)
static const uint8_t blake2b_sigma_sse41[12][16] = {
{0, 2, 4, 6, 1, 3, 5, 7, 8, 10, 12, 14, 9, 11, 13, 15},
{14, 4, 9, 13, 10, 8, 15, 6, 1, 0, 11, 5, 12, 2, 7, 3},
{11, 12, 5, 15, 8, 0, 2, 13, 10, 3, 7, 9, 14, 6, 1, 4},
{7, 3, 13, 11, 9, 1, 12, 14, 2, 5, 4, 15, 6, 10, 0, 8},
{9, 5, 2, 10, 0, 7, 4, 15, 14, 11, 6, 3, 1, 12, 8, 13},
{2, 6, 0, 8, 12, 10, 11, 3, 4, 7, 15, 1, 13, 5, 14, 9},
{12, 1, 14, 4, 5, 15, 13, 10, 0, 6, 9, 8, 7, 3, 2, 11},
{13, 7, 12, 3, 11, 14, 1, 9, 5, 15, 8, 2, 0, 4, 6, 10},
{6, 14, 11, 0, 15, 9, 3, 8, 12, 13, 1, 10, 2, 7, 4, 5},
{10, 8, 7, 1, 2, 4, 6, 5, 15, 9, 3, 13, 11, 14, 12, 0},
{0, 2, 4, 6, 1, 3, 5, 7, 8, 10, 12, 14, 9, 11, 13, 15},
{14, 4, 9, 13, 10, 8, 15, 6, 1, 0, 11, 5, 12, 2, 7, 3},
};
#endif
static const uint8_t blake2b_sigma[12][16] = {
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
{14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3},
@ -203,15 +220,6 @@ static void rx_blake2b_compress_sse41(blake2b_state* S, const uint8_t *block)
const __m128i r16 = _mm_setr_epi8(2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9);
const __m128i r24 = _mm_setr_epi8(3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10);
const __m128i m0 = LOADU(block + 00);
const __m128i m1 = LOADU(block + 16);
const __m128i m2 = LOADU(block + 32);
const __m128i m3 = LOADU(block + 48);
const __m128i m4 = LOADU(block + 64);
const __m128i m5 = LOADU(block + 80);
const __m128i m6 = LOADU(block + 96);
const __m128i m7 = LOADU(block + 112);
row1l = LOADU(&S->h[0]);
row1h = LOADU(&S->h[2]);
row2l = LOADU(&S->h[4]);
@ -221,18 +229,11 @@ static void rx_blake2b_compress_sse41(blake2b_state* S, const uint8_t *block)
row4l = _mm_xor_si128(LOADU(&blake2b_IV[4]), LOADU(&S->t[0]));
row4h = _mm_xor_si128(LOADU(&blake2b_IV[6]), LOADU(&S->f[0]));
ROUND(0);
ROUND(1);
ROUND(2);
ROUND(3);
ROUND(4);
ROUND(5);
ROUND(6);
ROUND(7);
ROUND(8);
ROUND(9);
ROUND(10);
ROUND(11);
const uint64_t* m = (const uint64_t*)(block);
for (uint32_t r = 0; r < 12; ++r) {
ROUND(r);
}
row1l = _mm_xor_si128(row3l, row1l);
row1h = _mm_xor_si128(row3h, row1h);
@ -388,8 +389,7 @@ int rx_blake2b_final(blake2b_state *S, void *out, size_t outlen) {
return 0;
}
int rx_blake2b(void *out, size_t outlen, const void *in, size_t inlen,
const void *key, size_t keylen) {
int rx_blake2b(void *out, size_t outlen, const void *in, size_t inlen) {
blake2b_state S;
int ret = -1;
@ -402,25 +402,14 @@ int rx_blake2b(void *out, size_t outlen, const void *in, size_t inlen,
goto fail;
}
if ((NULL == key && keylen > 0) || keylen > BLAKE2B_KEYBYTES) {
if (rx_blake2b_init(&S, outlen) < 0) {
goto fail;
}
if (keylen > 0) {
if (rx_blake2b_init_key(&S, outlen, key, keylen) < 0) {
goto fail;
}
}
else {
if (rx_blake2b_init(&S, outlen) < 0) {
goto fail;
}
}
if (rx_blake2b_update(&S, in, inlen) < 0) {
if (rx_blake2b_update(&S, in, inlen) < 0) {
goto fail;
}
ret = rx_blake2b_final(&S, out, outlen);
ret = rx_blake2b_final(&S, out, outlen);
fail:
//clear_internal_memory(&S, sizeof(S));
@ -442,43 +431,42 @@ int rxa2_blake2b_long(void *pout, size_t outlen, const void *in, size_t inlen) {
store32(outlen_bytes, (uint32_t)outlen);
#define TRY(statement) \
do { \
ret = statement; \
if (ret < 0) { \
goto fail; \
} \
} while ((void)0, 0)
do { \
ret = statement; \
if (ret < 0) { \
goto fail; \
} \
} while ((void)0, 0)
if (outlen <= BLAKE2B_OUTBYTES) {
TRY(rx_blake2b_init(&blake_state, outlen));
TRY(rx_blake2b_update(&blake_state, outlen_bytes, sizeof(outlen_bytes)));
TRY(rx_blake2b_update(&blake_state, in, inlen));
TRY(rx_blake2b_final(&blake_state, out, outlen));
TRY(rx_blake2b_init(&blake_state, outlen));
TRY(rx_blake2b_update(&blake_state, outlen_bytes, sizeof(outlen_bytes)));
TRY(rx_blake2b_update(&blake_state, in, inlen));
TRY(rx_blake2b_final(&blake_state, out, outlen));
}
else {
uint32_t toproduce;
uint8_t out_buffer[BLAKE2B_OUTBYTES];
uint8_t in_buffer[BLAKE2B_OUTBYTES];
TRY(rx_blake2b_init(&blake_state, BLAKE2B_OUTBYTES));
TRY(rx_blake2b_update(&blake_state, outlen_bytes, sizeof(outlen_bytes)));
TRY(rx_blake2b_update(&blake_state, in, inlen));
TRY(rx_blake2b_final(&blake_state, out_buffer, BLAKE2B_OUTBYTES));
TRY(rx_blake2b_init(&blake_state, BLAKE2B_OUTBYTES));
TRY(rx_blake2b_update(&blake_state, outlen_bytes, sizeof(outlen_bytes)));
TRY(rx_blake2b_update(&blake_state, in, inlen));
TRY(rx_blake2b_final(&blake_state, out_buffer, BLAKE2B_OUTBYTES));
memcpy(out, out_buffer, BLAKE2B_OUTBYTES / 2);
out += BLAKE2B_OUTBYTES / 2;
toproduce = (uint32_t)outlen - BLAKE2B_OUTBYTES / 2;
while (toproduce > BLAKE2B_OUTBYTES) {
memcpy(in_buffer, out_buffer, BLAKE2B_OUTBYTES);
TRY(rx_blake2b(out_buffer, BLAKE2B_OUTBYTES, in_buffer,
BLAKE2B_OUTBYTES, NULL, 0));
TRY(rx_blake2b(out_buffer, BLAKE2B_OUTBYTES, in_buffer,
BLAKE2B_OUTBYTES));
memcpy(out, out_buffer, BLAKE2B_OUTBYTES / 2);
out += BLAKE2B_OUTBYTES / 2;
toproduce -= BLAKE2B_OUTBYTES / 2;
}
memcpy(in_buffer, out_buffer, BLAKE2B_OUTBYTES);
TRY(rx_blake2b(out_buffer, toproduce, in_buffer, BLAKE2B_OUTBYTES, NULL,
0));
TRY(rx_blake2b(out_buffer, toproduce, in_buffer, BLAKE2B_OUTBYTES));
memcpy(out, out_buffer, toproduce);
}
fail:

2
src/crypto/randomx/blake2_generator.cpp

@ -55,7 +55,7 @@ namespace randomx {
void Blake2Generator::checkData(const size_t bytesNeeded) {
if (dataIndex + bytesNeeded > sizeof(data)) {
rx_blake2b(data, sizeof(data), data, sizeof(data), nullptr, 0);
rx_blake2b(data, sizeof(data), data, sizeof(data));
dataIndex = 0;
}
}

3
src/crypto/randomx/jit_compiler_x86.cpp

@ -36,6 +36,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "crypto/randomx/program.hpp"
#include "crypto/randomx/reciprocal.h"
#include "crypto/randomx/virtual_memory.hpp"
#include "base/tools/Profiler.h"
#ifdef XMRIG_FIX_RYZEN
# include "crypto/rx/Rx.h"
@ -255,6 +256,8 @@ namespace randomx {
}
void JitCompilerX86::generateProgram(Program& prog, ProgramConfiguration& pcfg, uint32_t flags) {
PROFILE_SCOPE(RandomX_JIT_compile);
vm_flags = flags;
generateProgramPrologue(prog, pcfg);

18
src/crypto/randomx/randomx.cpp

@ -47,6 +47,8 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include <cassert>
#include "base/tools/Profiler.h"
RandomX_ConfigurationWownero::RandomX_ConfigurationWownero()
{
ArgonSalt = "RandomWOW\x01";
@ -574,33 +576,35 @@ extern "C" {
assert(inputSize == 0 || input != nullptr);
assert(output != nullptr);
alignas(16) uint64_t tempHash[8];
rx_blake2b(tempHash, sizeof(tempHash), input, inputSize, nullptr, 0);
rx_blake2b_wrapper::run(tempHash, sizeof(tempHash), input, inputSize);
machine->initScratchpad(&tempHash);
machine->resetRoundingMode();
for (uint32_t chain = 0; chain < RandomX_CurrentConfig.ProgramCount - 1; ++chain) {
machine->run(&tempHash);
rx_blake2b(tempHash, sizeof(tempHash), machine->getRegisterFile(), sizeof(randomx::RegisterFile), nullptr, 0);
rx_blake2b_wrapper::run(tempHash, sizeof(tempHash), machine->getRegisterFile(), sizeof(randomx::RegisterFile));
}
machine->run(&tempHash);
machine->getFinalResult(output, RANDOMX_HASH_SIZE);
machine->getFinalResult(output);
}
void randomx_calculate_hash_first(randomx_vm* machine, uint64_t (&tempHash)[8], const void* input, size_t inputSize) {
rx_blake2b(tempHash, sizeof(tempHash), input, inputSize, nullptr, 0);
rx_blake2b_wrapper::run(tempHash, sizeof(tempHash), input, inputSize);
machine->initScratchpad(tempHash);
}
void randomx_calculate_hash_next(randomx_vm* machine, uint64_t (&tempHash)[8], const void* nextInput, size_t nextInputSize, void* output) {
PROFILE_SCOPE(RandomX_hash);
machine->resetRoundingMode();
for (uint32_t chain = 0; chain < RandomX_CurrentConfig.ProgramCount - 1; ++chain) {
machine->run(&tempHash);
rx_blake2b(tempHash, sizeof(tempHash), machine->getRegisterFile(), sizeof(randomx::RegisterFile), nullptr, 0);
rx_blake2b_wrapper::run(tempHash, sizeof(tempHash), machine->getRegisterFile(), sizeof(randomx::RegisterFile));
}
machine->run(&tempHash);
// Finish current hash and fill the scratchpad for the next hash at the same time
rx_blake2b(tempHash, sizeof(tempHash), nextInput, nextInputSize, nullptr, 0);
machine->hashAndFill(output, RANDOMX_HASH_SIZE, tempHash);
rx_blake2b_wrapper::run(tempHash, sizeof(tempHash), nextInput, nextInputSize);
machine->hashAndFill(output, tempHash);
}
}

10
src/crypto/randomx/virtual_machine.cpp

@ -35,6 +35,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "crypto/randomx/blake2/blake2.h"
#include "crypto/randomx/intrin_portable.h"
#include "crypto/randomx/allocator.hpp"
#include "base/tools/Profiler.h"
randomx_vm::~randomx_vm() {
@ -109,15 +110,15 @@ namespace randomx {
}
template<bool softAes>
void VmBase<softAes>::getFinalResult(void* out, size_t outSize) {
void VmBase<softAes>::getFinalResult(void* out) {
hashAes1Rx4<softAes>(scratchpad, ScratchpadSize, &reg.a);
rx_blake2b(out, outSize, &reg, sizeof(RegisterFile), nullptr, 0);
rx_blake2b_wrapper::run(out, RANDOMX_HASH_SIZE, &reg, sizeof(RegisterFile));
}
template<bool softAes>
void VmBase<softAes>::hashAndFill(void* out, size_t outSize, uint64_t (&fill_state)[8]) {
void VmBase<softAes>::hashAndFill(void* out, uint64_t (&fill_state)[8]) {
hashAndFillAes1Rx4<softAes>(scratchpad, ScratchpadSize, &reg.a, fill_state);
rx_blake2b(out, outSize, &reg, sizeof(RegisterFile), nullptr, 0);
rx_blake2b_wrapper::run(out, RANDOMX_HASH_SIZE, &reg, sizeof(RegisterFile));
}
template<bool softAes>
@ -127,6 +128,7 @@ namespace randomx {
template<bool softAes>
void VmBase<softAes>::generateProgram(void* seed) {
PROFILE_SCOPE(RandomX_generate_program);
fillAes4Rx4<softAes>(seed, 128 + RandomX_CurrentConfig.ProgramSize * 8, &program);
}

8
src/crypto/randomx/virtual_machine.hpp

@ -38,8 +38,8 @@ class randomx_vm
public:
virtual ~randomx_vm() = 0;
virtual void setScratchpad(uint8_t *scratchpad) = 0;
virtual void getFinalResult(void* out, size_t outSize) = 0;
virtual void hashAndFill(void* out, size_t outSize, uint64_t (&fill_state)[8]) = 0;
virtual void getFinalResult(void* out) = 0;
virtual void hashAndFill(void* out, uint64_t (&fill_state)[8]) = 0;
virtual void setDataset(randomx_dataset* dataset) { }
virtual void setCache(randomx_cache* cache) { }
virtual void initScratchpad(void* seed) = 0;
@ -86,8 +86,8 @@ namespace randomx {
~VmBase() override;
void setScratchpad(uint8_t *scratchpad) override;
void initScratchpad(void* seed) override;
void getFinalResult(void* out, size_t outSize) override;
void hashAndFill(void* out, size_t outSize, uint64_t (&fill_state)[8]) override;
void getFinalResult(void* out) override;
void hashAndFill(void* out, uint64_t (&fill_state)[8]) override;
protected:
void generateProgram(void* seed);

5
src/crypto/randomx/vm_compiled.cpp

@ -28,6 +28,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "crypto/randomx/vm_compiled.hpp"
#include "crypto/randomx/common.hpp"
#include "base/tools/Profiler.h"
namespace randomx {
@ -41,6 +42,8 @@ namespace randomx {
template<bool softAes>
void CompiledVm<softAes>::run(void* seed) {
PROFILE_SCOPE(RandomX_run);
compiler.prepare();
VmBase<softAes>::generateProgram(seed);
randomx_vm::initialize();
@ -51,6 +54,8 @@ namespace randomx {
template<bool softAes>
void CompiledVm<softAes>::execute() {
PROFILE_SCOPE(RandomX_JIT_execute);
#ifdef XMRIG_ARM
memcpy(reg.f, config.eMask, sizeof(config.eMask));
#endif

4
src/crypto/rx/RxConfig.cpp

@ -120,8 +120,8 @@ bool xmrig::RxConfig::read(const rapidjson::Value &value)
}
# endif
const int mode = Json::getInt(value, kScratchpadPrefetchMode, static_cast<int>(m_scratchpadPrefetchMode));
if ((mode >= ScratchpadPrefetchOff) && (mode < ScratchpadPrefetchMax)) {
const uint32_t mode = static_cast<uint32_t>(Json::getInt(value, kScratchpadPrefetchMode, static_cast<int>(m_scratchpadPrefetchMode)));
if (mode < ScratchpadPrefetchMax) {
m_scratchpadPrefetchMode = static_cast<ScratchpadPrefetchMode>(mode);
}

Loading…
Cancel
Save