From f48181a1207ba8c325bfb6c88b03ec2521c548a0 Mon Sep 17 00:00:00 2001
From: Duke
Date: Wed, 25 Oct 2023 13:16:47 -0400
Subject: [PATCH 01/15] Add help for -consolidationinterval which was missing

---
 src/init.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/init.cpp b/src/init.cpp
index f49e146dd..20ef95b45 100644
--- a/src/init.cpp
+++ b/src/init.cpp
@@ -458,6 +458,7 @@ std::string HelpMessage(HelpMessageMode mode)
     strUsage += HelpMessageOpt("-disablewallet", _("Do not load the wallet and disable wallet RPC calls"));
     strUsage += HelpMessageOpt("-keypool=<n>", strprintf(_("Set key pool size to <n> (default: %u)"), 100));
     strUsage += HelpMessageOpt("-consolidation", _("Enable auto Sapling note consolidation (default: false)"));
+    strUsage += HelpMessageOpt("-consolidationinterval", _("Block interval between consolidations (default: 25)"));
     strUsage += HelpMessageOpt("-consolidatesaplingaddress=<zaddr>", _("Specify Sapling Address to Consolidate. (default: all)"));
     strUsage += HelpMessageOpt("-consolidationtxfee", strprintf(_("Fee amount in Puposhis used send consolidation transactions. (default %i)"), DEFAULT_CONSOLIDATION_FEE));
 

From b73297f1f52892933d64e978df7800009b06b5eb Mon Sep 17 00:00:00 2001
From: Duke
Date: Wed, 25 Oct 2023 13:17:10 -0400
Subject: [PATCH 02/15] More debugging when resending txs

---
 src/wallet/wallet.cpp | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/src/wallet/wallet.cpp b/src/wallet/wallet.cpp
index 43f4b5b2a..a615d5bdc 100644
--- a/src/wallet/wallet.cpp
+++ b/src/wallet/wallet.cpp
@@ -3116,6 +3116,7 @@ std::vector<uint256> CWallet::ResendWalletTransactionsBefore(int64_t nTime)
     // Sort them in chronological order
     multimap<unsigned int, CWalletTx*> mapSorted;
     uint32_t now = (uint32_t)time(NULL);
+    LogPrintf("%s: nTime=%ld now=%d\n", __func__, nTime, now);
 
     // vector of wallet transactions to delete
     std::vector<uint256> vwtxh;
@@ -3183,6 +3184,7 @@ std::vector<uint256> CWallet::ResendWalletTransactionsBefore(int64_t nTime)
 
 void CWallet::ResendWalletTransactions(int64_t nBestBlockTime)
 {
+    LogPrintf("%s: nBestBlockTime=%ld nNextResend=%ld nLastResend=%ld time=%ld\n", __func__, nBestBlockTime, nNextResend, nLastResend, GetTime());
     // Do this infrequently and randomly to avoid giving away
     // that these are our transactions.
     if (GetTime() < nNextResend || !fBroadcastTransactions)
@@ -3207,15 +3209,11 @@ void CWallet::ResendWalletTransactions(int64_t nBestBlockTime)
 
 /** @} */ // end of mapWallet
 
-
-
-
 /** @defgroup Actions
  *
  * @{
  */
-
 CAmount CWallet::GetBalance() const
 {
     CAmount nTotal = 0;

From f0395196ec8bbd2b5e01e52dc01bfe5e474e30c0 Mon Sep 17 00:00:00 2001
From: Duke
Date: Sun, 29 Oct 2023 21:45:30 -0400
Subject: [PATCH 03/15] Do not resend wallet txs during IBD, rescan or loading
 blocks

---
 src/wallet/wallet.cpp | 17 ++++++++++++++++-
 1 file changed, 16 insertions(+), 1 deletion(-)

diff --git a/src/wallet/wallet.cpp b/src/wallet/wallet.cpp
index a615d5bdc..3071d46a7 100644
--- a/src/wallet/wallet.cpp
+++ b/src/wallet/wallet.cpp
@@ -67,6 +67,7 @@ bool fPayAtLeastCustomFee = true;
 
 CBlockIndex *hush_chainactive(int32_t height);
 extern std::string DONATION_PUBKEY;
+extern int32_t HUSH_LOADINGBLOCKS;
 int32_t hush_dpowconfs(int32_t height,int32_t numconfs);
 int tx_height( const uint256 &hash );
 bool fTxDeleteEnabled = false;
@@ -3184,7 +3185,6 @@ std::vector<uint256> CWallet::ResendWalletTransactionsBefore(int64_t nTime)
 
 void CWallet::ResendWalletTransactions(int64_t nBestBlockTime)
 {
-    LogPrintf("%s: nBestBlockTime=%ld nNextResend=%ld nLastResend=%ld time=%ld\n", __func__, nBestBlockTime, nNextResend, nLastResend, GetTime());
     // Do this infrequently and randomly to avoid giving away
     // that these are our transactions.
     if (GetTime() < nNextResend || !fBroadcastTransactions)
@@ -3195,11 +3195,26 @@ void CWallet::ResendWalletTransactions(int64_t nBestBlockTime)
     if (fFirst)
         return;
 
+    // do not resend during IBD/rescan because some txs will be unconfirmed
+    // until completion
+    if(IsInitialBlockDownload()) {
+        return;
+    }
+    if (pwalletMain->fRescanning) {
+        return;
+    }
+
+    // do not resend during a reindex or initial loading of blocks
+    if (HUSH_LOADINGBLOCKS) {
+        return;
+    }
+
     // Only do it if there's been a new block since last time
     if (nBestBlockTime < nLastResend)
         return;
     nLastResend = GetTime();
+    LogPrintf("%s: nBestBlockTime=%ld nNextResend=%ld nLastResend=%ld time=%ld\n", __func__, nBestBlockTime, nNextResend, nLastResend, GetTime());
 
     // Rebroadcast unconfirmed txes older than 5 minutes before the last
     // block was found:
     std::vector<uint256> relayed = ResendWalletTransactionsBefore(nBestBlockTime-5*60);

From 7eb9d75b94469c3fc8c028f29b35be9ac764a10c Mon Sep 17 00:00:00 2001
From: Duke
Date: Mon, 30 Oct 2023 09:31:48 -0400
Subject: [PATCH 04/15] Support * or ANY_ZADDR in z_mergetoaddress

---
 src/wallet/rpcwallet.cpp | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/wallet/rpcwallet.cpp b/src/wallet/rpcwallet.cpp
index 8467c0453..fb4f24eb3 100644
--- a/src/wallet/rpcwallet.cpp
+++ b/src/wallet/rpcwallet.cpp
@@ -5667,8 +5667,10 @@ UniValue z_mergetoaddress(const UniValue& params, bool fHelp, const CPubKey& mypk)
 
         if (address == "ANY_TADDR") {
             useAnyUTXO = true;
-        } else if (address == "ANY_SAPLING") {
+        } else if (address == "ANY_ZADDR" || address == "ANY_SAPLING") {
             useAnySapling = true;
+        } else if (address == "*") {
+            useAnyUTXO = useAnySapling = true;
         } else {
             CTxDestination taddr = DecodeDestination(address);
             if (IsValidDestination(taddr)) {
@@ -5692,7 +5694,7 @@ UniValue z_mergetoaddress(const UniValue& params, bool fHelp, const CPubKey& mypk)
         throw JSONRPCError(RPC_INVALID_PARAMETER, "Cannot specify specific t-addrs when using \"ANY_TADDR\"");
     }
     if ((useAnySapling) && zaddrs.size() > 0) {
-        throw JSONRPCError(RPC_INVALID_PARAMETER, "Cannot specify specific z-addrs when using \"ANY_SAPLING\"");
+        throw JSONRPCError(RPC_INVALID_PARAMETER, "Cannot specify specific z-addrs when using \"ANY_SAPLING\" or \"ANY_ZADDR\"");
     }
 
     const int nextBlockHeight = chainActive.Height() + 1;

From a554377225687fbca3680f8cc43e30a3133aad12 Mon Sep 17 00:00:00 2001
From: Duke
Date: Tue, 31 Oct 2023 10:30:18 -0400
Subject: [PATCH 05/15] Clean up

---
 src/validationinterface.cpp | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/src/validationinterface.cpp b/src/validationinterface.cpp
index 12cabfac5..8db9e25f3 100644
--- a/src/validationinterface.cpp
+++ b/src/validationinterface.cpp
@@ -135,9 +135,6 @@ void ThreadNotifyWallets(CBlockIndex *pindexLastTip)
         while (pindex && pindex != pindexFork) {
             // Get the Sprout commitment tree as of the start of this block.
             SproutMerkleTree oldSproutTree;
-            //TODO: how important is oldSproutTree ?
-            //assert(pcoinsTip->GetSproutAnchorAt(pindex->hashSproutAnchor, oldSproutTree));
-
             // Get the Sapling commitment tree as of the start of this block.
             // We can get this from the `hashFinalSaplingRoot` of the last block
             // However, this is only reliable if the last block was on or after

From 0936d4c2c2f771dbec6d7614991b4a96ce87b80d Mon Sep 17 00:00:00 2001
From: Duke
Date: Mon, 6 Nov 2023 11:53:59 -0500
Subject: [PATCH 06/15] Add option to disable automatic resending of txs

This option can disable the automatic resending of txs. It defaults to
the normal behavior of resending txs automatically; resending can be
disabled with -resendtx=0. This is not intended for normal usage and is
undocumented for now. It is useful for devs who are debugging internals
and also when rescanning large wallets. We seem to have some bugs
and/or performance problems when rescanning wallets with many txs.
---
 src/validationinterface.cpp | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/src/validationinterface.cpp b/src/validationinterface.cpp
index 8db9e25f3..62ba5ea80 100644
--- a/src/validationinterface.cpp
+++ b/src/validationinterface.cpp
@@ -32,7 +32,13 @@ void RegisterValidationInterface(CValidationInterface* pwalletIn) {
     g_signals.UpdatedTransaction.connect(boost::bind(&CValidationInterface::UpdatedTransaction, pwalletIn, _1));
     g_signals.ChainTip.connect(boost::bind(&CValidationInterface::ChainTip, pwalletIn, _1, _2, _3));
     g_signals.SetBestChain.connect(boost::bind(&CValidationInterface::SetBestChain, pwalletIn, _1));
-    g_signals.Broadcast.connect(boost::bind(&CValidationInterface::ResendWalletTransactions, pwalletIn, _1));
+
+    if(GetArg("-resendtx", true)) {
+        g_signals.Broadcast.connect(boost::bind(&CValidationInterface::ResendWalletTransactions, pwalletIn, _1));
+    } else {
+        LogPrintf("%s: automatic resending of wallet transactions disabled\n", __func__);
+    }
+
     g_signals.BlockChecked.connect(boost::bind(&CValidationInterface::BlockChecked, pwalletIn, _1, _2));
     //g_signals.ScriptForMining.connect(boost::bind(&CValidationInterface::GetScriptForMining, pwalletIn, _1));
 }
@@ -40,7 +46,9 @@ void RegisterValidationInterface(CValidationInterface* pwalletIn) {
 void UnregisterValidationInterface(CValidationInterface* pwalletIn) {
     //g_signals.ScriptForMining.disconnect(boost::bind(&CValidationInterface::GetScriptForMining, pwalletIn, _1));
     g_signals.BlockChecked.disconnect(boost::bind(&CValidationInterface::BlockChecked, pwalletIn, _1, _2));
-    g_signals.Broadcast.disconnect(boost::bind(&CValidationInterface::ResendWalletTransactions, pwalletIn, _1));
+    if(GetArg("-resendtx", true)) {
+        g_signals.Broadcast.disconnect(boost::bind(&CValidationInterface::ResendWalletTransactions, pwalletIn, _1));
+    }
     g_signals.ChainTip.disconnect(boost::bind(&CValidationInterface::ChainTip, pwalletIn, _1, _2, _3));
     g_signals.SetBestChain.disconnect(boost::bind(&CValidationInterface::SetBestChain, pwalletIn, _1));
     g_signals.UpdatedTransaction.disconnect(boost::bind(&CValidationInterface::UpdatedTransaction, pwalletIn, _1));

From 6029b3d571009991ae9c4aea0397f4d00be6a817 Mon Sep 17 00:00:00 2001
From: Duke
Date: Tue, 7 Nov 2023 09:09:48 -0500
Subject: [PATCH 07/15] Update to RandomX v1.2.1

Commit 102f8acf90a7649ada410de5499a7ec62e49e1da
---
 src/RandomX/CMakeLists.txt                   |   40 +-
 src/RandomX/README.md                        |    5 +-
 src/RandomX/doc/tevador.asc                  |   18 +-
 src/RandomX/src/allocator.cpp                |    2 +-
 src/RandomX/src/assembly_generator_x86.cpp   |    2 +-
 src/RandomX/src/bytecode_machine.cpp         |    2 +-
 src/RandomX/src/common.hpp                   |    7 +
 src/RandomX/src/configuration.h              |   10 +-
 src/RandomX/src/dataset.cpp                  |    2 +-
 src/RandomX/src/intrin_portable.h            |   12 +-
 src/RandomX/src/jit_compiler.hpp             |   42 +-
 src/RandomX/src/jit_compiler_a64.cpp         |   74 +-
 src/RandomX/src/jit_compiler_a64.hpp         |    2 +-
 src/RandomX/src/jit_compiler_a64_static.S    |   98 +-
 src/RandomX/src/jit_compiler_rv64.cpp        | 1175 ++++++++++++++++
 src/RandomX/src/jit_compiler_rv64.hpp        |   69 +
 src/RandomX/src/jit_compiler_rv64_static.S   | 1235 +++++++++++++++++
 src/RandomX/src/jit_compiler_rv64_static.hpp |   53 +
 src/RandomX/src/jit_compiler_x86.cpp         |    4 +-
 src/RandomX/src/randomx.cpp                  |   28 +
 src/RandomX/src/randomx.h                    |   11 +
 src/RandomX/src/reciprocal.c                 |   34 +-
 src/RandomX/src/reciprocal.h                 |    4 +-
 src/RandomX/src/tests/benchmark.cpp          |   32 +-
 src/RandomX/src/tests/perf-simulation.cpp    |    2 +-
 src/RandomX/src/tests/riscv64_zba.s          |    9 +
 src/RandomX/src/tests/riscv64_zbb.s          |    9 +
 src/RandomX/src/tests/tests.cpp              |   24 +
 .../{virtual_memory.cpp => virtual_memory.c} |  153 +-
 .../{virtual_memory.hpp => virtual_memory.h} |   26 +-
 src/RandomX/vcxproj/randomx-dll.vcxproj      |    4 +-
 .../vcxproj/randomx-dll.vcxproj.filters      |    4 +-
 src/RandomX/vcxproj/randomx.vcxproj          |    4 +-
 src/RandomX/vcxproj/randomx.vcxproj.filters  |    4 +-
 34 files changed, 2966 insertions(+), 234 deletions(-)
 create mode 100644 src/RandomX/src/jit_compiler_rv64.cpp
 create mode 100644 src/RandomX/src/jit_compiler_rv64.hpp
 create mode 100644 src/RandomX/src/jit_compiler_rv64_static.S
 create mode 100644 src/RandomX/src/jit_compiler_rv64_static.hpp
 create mode 100644 src/RandomX/src/tests/riscv64_zba.s
 create mode 100644 src/RandomX/src/tests/riscv64_zbb.s
 rename src/RandomX/src/{virtual_memory.cpp => virtual_memory.c} (54%)
 rename src/RandomX/src/{virtual_memory.hpp => virtual_memory.h} (80%)

diff --git a/src/RandomX/CMakeLists.txt b/src/RandomX/CMakeLists.txt
index f41f606b9..ebbdff2b6 100644
--- a/src/RandomX/CMakeLists.txt
+++ b/src/RandomX/CMakeLists.txt
@@ -39,7 +39,7 @@ src/bytecode_machine.cpp
 src/cpu.cpp
 src/dataset.cpp
 src/soft_aes.cpp
-src/virtual_memory.cpp
+src/virtual_memory.c
 src/vm_interpreted.cpp
 src/allocator.cpp
 src/assembly_generator_x86.cpp
@@ -96,7 +96,7 @@ function(add_flag flag)
 endfunction()
 
 # x86-64
-if(ARCH_ID STREQUAL "x86_64" OR ARCH_ID STREQUAL "x86-64" OR ARCH_ID STREQUAL "amd64")
+if ((CMAKE_SIZEOF_VOID_P EQUAL 8) AND (ARCH_ID STREQUAL "x86_64" OR ARCH_ID STREQUAL "x86-64" OR ARCH_ID STREQUAL "amd64"))
   list(APPEND randomx_sources
     src/jit_compiler_x86.cpp)
 
@@ -173,6 +173,42 @@ if(ARM_ID STREQUAL "aarch64" OR ARM_ID STREQUAL "arm64" OR ARM_ID STREQUAL "armv
   endif()
 endif()
 
+# RISC-V
+if(ARCH_ID STREQUAL "riscv64")
+  list(APPEND randomx_sources
+    src/jit_compiler_rv64_static.S
+    src/jit_compiler_rv64.cpp)
+  # cheat because cmake and ccache hate each other
+  set_property(SOURCE src/jit_compiler_rv64_static.S PROPERTY LANGUAGE C)
+  set_property(SOURCE src/jit_compiler_rv64_static.S PROPERTY XCODE_EXPLICIT_FILE_TYPE sourcecode.asm)
+
+  # default build uses the RV64GC baseline
+  set(RVARCH "rv64gc")
+
+  # for native builds, enable Zba and Zbb if supported by the CPU
+  if(ARCH STREQUAL "native")
+    enable_language(ASM)
+    try_run(RANDOMX_ZBA_RUN_FAIL
+      RANDOMX_ZBA_COMPILE_OK
+      ${CMAKE_CURRENT_BINARY_DIR}/
+      ${CMAKE_CURRENT_SOURCE_DIR}/src/tests/riscv64_zba.s
+      COMPILE_DEFINITIONS "-march=rv64gc_zba")
+    if (RANDOMX_ZBA_COMPILE_OK AND NOT RANDOMX_ZBA_RUN_FAIL)
+      set(RVARCH "${RVARCH}_zba")
+    endif()
+    try_run(RANDOMX_ZBB_RUN_FAIL
+      RANDOMX_ZBB_COMPILE_OK
+      ${CMAKE_CURRENT_BINARY_DIR}/
+      ${CMAKE_CURRENT_SOURCE_DIR}/src/tests/riscv64_zbb.s
+      COMPILE_DEFINITIONS "-march=rv64gc_zbb")
+    if (RANDOMX_ZBB_COMPILE_OK AND NOT RANDOMX_ZBB_RUN_FAIL)
+      set(RVARCH "${RVARCH}_zbb")
+    endif()
+  endif()
+
+  add_flag("-march=${RVARCH}")
+endif()
+
 set(RANDOMX_INCLUDE "${CMAKE_CURRENT_SOURCE_DIR}/src" CACHE STRING "RandomX Include path")
 
 add_library(randomx ${randomx_sources})

diff --git a/src/RandomX/README.md b/src/RandomX/README.md
index 4c1dabb65..2c9bdd318 100644
--- a/src/RandomX/README.md
+++ b/src/RandomX/README.md
@@ -37,7 +37,7 @@ RandomX is written in C++11 and builds a static library with a C API provided by
 
 ### Linux
 
-Build dependencies: `cmake` (minimum 2.8.7) and `gcc` (minimum version 4.8, but version 7+ is recommended).
+Build dependencies: `cmake` (minimum 3.5) and `gcc` (minimum version 4.8, but version 7+ is recommended).
 
 To build optimized binaries for your machine, run:
 ```
@@ -82,7 +82,7 @@ Intel Core i7-8550U|16G DDR4-2400|Windows 10|hw|200 (4T)|1700 (4T)|350 (8T)|
 Intel Core i3-3220|4G DDR3-1333|Ubuntu 16.04|soft|42 (4T)|510 (4T)|150 (4T)|
 Raspberry Pi 3|1G LPDDR2|Ubuntu 16.04|soft|3.5 (4T)|-|20 (4T)|
 
-Note that RandomX currently includes a JIT compiler for x86-64 and ARM64. Other architectures have to use the portable interpreter, which is much slower.
+Note that RandomX currently includes a JIT compiler for x86-64, ARM64 and RISCV64. Other architectures have to use the portable interpreter, which is much slower.
 
 ### GPU performance
 
@@ -129,6 +129,7 @@ The reference implementation has been validated on the following platforms:
 * ARMv7+VFPv3 (32-bit, little-endian)
 * ARMv8 (64-bit, little-endian)
 * PPC64 (64-bit, big-endian)
+* RISCV64 (64-bit, little-endian)
 
 ### Can FPGAs mine RandomX?
diff --git a/src/RandomX/doc/tevador.asc b/src/RandomX/doc/tevador.asc
index b998f1ef2..8bada54bb 100644
--- a/src/RandomX/doc/tevador.asc
+++ b/src/RandomX/doc/tevador.asc
@@ -1,13 +1,13 @@
 -----BEGIN PGP PUBLIC KEY BLOCK-----
 
 mDMEXd+PeBYJKwYBBAHaRw8BAQdAZ0nqJ+nRYoScG2QLX62pl+WO1+Mkv6Yyt2Kb
-ntGUuLq0G3RldmFkb3IgPHRldmFkb3JAZ21haWwuY29tPoiWBBMWCAA+FiEEMoWj
-LVEwdmMs6CUQWijIaue9c6YFAl3fj3gCGwMFCQWnqDgFCwkIBwIGFQoJCAsCBBYC
-AwECHgECF4AACgkQWijIaue9c6YBFQD+N1XTUqSCZp9jB/yTHQ9ahSaIUMtmuvdT
-So2s+quudP4A/R5wLwukpfGN9UZ4cfpmKCJ9jO1HJ2udmlGMsJbQpDAIuDgEXd+P
+ntGUuLq0G3RldmFkb3IgPHRldmFkb3JAZ21haWwuY29tPoiWBBMWCAA+AhsDBQsJ
+CAcCBhUKCQgLAgQWAgMBAh4BAheAFiEEMoWjLVEwdmMs6CUQWijIaue9c6YFAmRP
+r8MFCQ/ZS2YACgkQWijIaue9c6bR5gEA0tnQ4Al+yOLoRUBQitAV8FU4FLy8Xx8U
+IyyivjJ0UhIA/2jwJfMXmJdMKtar8xfIA5mZLLofkEP6hug4knhitpkBuDgEXd+P
 eBIKKwYBBAGXVQEFAQEHQBNbQuPcDojMCkRb5B5u7Ld/AFLClOh+6ElL+u61rIY/
-AwEIB4h+BBgWCAAmFiEEMoWjLVEwdmMs6CUQWijIaue9c6YFAl3fj3gCGwwFCQWn
-qDgACgkQWijIaue9c6YJvgD+IY1Q9mCM1P1iZIoXuafRihXJ7UgVXpQqW2yoaUT3
-bfQA/RkisI2eElYoOjdwPszPP6VfL5+SViwDmDuJG2P5llgE
-=V4vd
------END PGP PUBLIC KEY BLOCK-----
+AwEIB4h+BBgWCAAmAhsMFiEEMoWjLVEwdmMs6CUQWijIaue9c6YFAmRQoAMFCQ/Z
+S2YACgkQWijIaue9c6bUfwD9Hw20kGCaZ8rWghz9W3bc645ys1vPQpQW28CD9w3B
+cTMBALsV1xpS2pGwTfn1PUimqESZfTrREmNvOjKSQwe0yicI
+=D4lm
+-----END PGP PUBLIC KEY BLOCK-----
\ No newline at end of file

diff --git a/src/RandomX/src/allocator.cpp b/src/RandomX/src/allocator.cpp
index 6b48a7e70..bcee0f6b6 100644
--- a/src/RandomX/src/allocator.cpp
+++ b/src/RandomX/src/allocator.cpp
@@ -29,7 +29,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <new>
 #include "allocator.hpp"
 #include "intrin_portable.h"
-#include "virtual_memory.hpp"
+#include "virtual_memory.h"
 #include "common.hpp"
 
 namespace randomx {

diff --git a/src/RandomX/src/assembly_generator_x86.cpp b/src/RandomX/src/assembly_generator_x86.cpp
index e7e5258b7..1ce31dd55 100644
--- a/src/RandomX/src/assembly_generator_x86.cpp
+++ b/src/RandomX/src/assembly_generator_x86.cpp
@@ -445,7 +445,7 @@ namespace randomx {
 	}
 
 	void AssemblyGeneratorX86::h_IMUL_RCP(Instruction& instr, int i) {
-		uint64_t divisor = instr.getImm32();
+		const uint32_t divisor = instr.getImm32();
 		if (!isZeroOrPowerOf2(divisor)) {
 			registerUsage[instr.dst] = i;
 			asmCode << "\tmov rax, " << randomx_reciprocal(divisor) << std::endl;

diff --git a/src/RandomX/src/bytecode_machine.cpp b/src/RandomX/src/bytecode_machine.cpp
index 7d8e902d2..1d00d0959 100644
--- a/src/RandomX/src/bytecode_machine.cpp
+++ b/src/RandomX/src/bytecode_machine.cpp
@@ -243,7 +243,7 @@ namespace randomx {
 		}
 
 		if (opcode < ceil_IMUL_RCP) {
-			uint64_t divisor = instr.getImm32();
+			const uint32_t divisor = instr.getImm32();
 			if (!isZeroOrPowerOf2(divisor)) {
 				auto dst = instr.dst % RegistersCount;
 				ibc.type = InstructionType::IMUL_R;

diff --git a/src/RandomX/src/common.hpp b/src/RandomX/src/common.hpp
index a77feb3bf..f4b85342a 100644
--- a/src/RandomX/src/common.hpp
+++ b/src/RandomX/src/common.hpp
@@ -116,12 +116,19 @@ namespace randomx {
 
 #if defined(_M_X64) || defined(__x86_64__)
 	#define RANDOMX_HAVE_COMPILER 1
+	#define RANDOMX_COMPILER_X86
 	class JitCompilerX86;
 	using JitCompiler = JitCompilerX86;
 #elif defined(__aarch64__)
 	#define RANDOMX_HAVE_COMPILER 1
+	#define RANDOMX_COMPILER_A64
 	class JitCompilerA64;
 	using JitCompiler = JitCompilerA64;
+#elif defined(__riscv) && __riscv_xlen == 64
+	#define RANDOMX_HAVE_COMPILER 1
+	#define RANDOMX_COMPILER_RV64
+	class JitCompilerRV64;
+	using JitCompiler = JitCompilerRV64;
 #else
 	#define RANDOMX_HAVE_COMPILER 0
 	class JitCompilerFallback;

diff --git a/src/RandomX/src/configuration.h b/src/RandomX/src/configuration.h
index f74a74a4c..84400ddce 100644
--- a/src/RandomX/src/configuration.h
+++ b/src/RandomX/src/configuration.h
@@ -32,13 +32,13 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define RANDOMX_ARGON_MEMORY       262144
 
 //Number of Argon2d iterations for Cache initialization.
-#define RANDOMX_ARGON_ITERATIONS   5
+#define RANDOMX_ARGON_ITERATIONS   3
 
 //Number of parallel lanes for Cache initialization.
 #define RANDOMX_ARGON_LANES        1
 
 //Argon2d salt
-#define RANDOMX_ARGON_SALT         "RandomXHUSH\x03"
+#define RANDOMX_ARGON_SALT         "RandomX\x03"
 
 //Number of random Cache accesses per Dataset item. Minimum is 2.
 #define RANDOMX_CACHE_ACCESSES     8
@@ -53,13 +53,13 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define RANDOMX_DATASET_EXTRA_SIZE 33554368
 
 //Number of instructions in a RandomX program. Must be divisible by 8.
-#define RANDOMX_PROGRAM_SIZE       512
+#define RANDOMX_PROGRAM_SIZE       256
 
 //Number of iterations during VM execution.
-#define RANDOMX_PROGRAM_ITERATIONS 4096
+#define RANDOMX_PROGRAM_ITERATIONS 2048
 
 //Number of chained VM executions per hash.
-#define RANDOMX_PROGRAM_COUNT      16
+#define RANDOMX_PROGRAM_COUNT      8
 
 //Scratchpad L3 size in bytes. Must be a power of 2.
 #define RANDOMX_SCRATCHPAD_L3      2097152

diff --git a/src/RandomX/src/dataset.cpp b/src/RandomX/src/dataset.cpp
index 675c5abc5..7ebf1bca4 100644
--- a/src/RandomX/src/dataset.cpp
+++ b/src/RandomX/src/dataset.cpp
@@ -42,7 +42,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #include "common.hpp"
 #include "dataset.hpp"
-#include "virtual_memory.hpp"
+#include "virtual_memory.h"
 #include "superscalar.hpp"
 #include "blake2_generator.hpp"
 #include "reciprocal.h"

diff --git a/src/RandomX/src/intrin_portable.h b/src/RandomX/src/intrin_portable.h
index 8c09ae885..50020c3e2 100644
--- a/src/RandomX/src/intrin_portable.h
+++ b/src/RandomX/src/intrin_portable.h
@@ -349,7 +349,7 @@ FORCE_INLINE rx_vec_i128 rx_load_vec_i128(rx_vec_i128 const *p) {
#if defined(NATIVE_LITTLE_ENDIAN)
 	return *p;
#else
-	uint32_t* ptr = (uint32_t*)p;
+	const uint32_t* ptr = (const uint32_t*)p;
 	vec_u c;
 	c.u32[0] = load32(ptr + 0);
 	c.u32[1] = load32(ptr + 1);
@@ -375,8 +375,8 @@ FORCE_INLINE void rx_store_vec_i128(rx_vec_i128 *p, rx_vec_i128 b) {
 
 FORCE_INLINE rx_vec_f128 rx_cvt_packed_int_vec_f128(const void* addr) {
 	vec_u x;
-	x.d64[0] = (double)unsigned32ToSigned2sCompl(load32((uint8_t*)addr + 0));
-	x.d64[1] = (double)unsigned32ToSigned2sCompl(load32((uint8_t*)addr + 4));
+	x.d64[0] = (double)unsigned32ToSigned2sCompl(load32((const uint8_t*)addr + 0));
+	x.d64[1] = (double)unsigned32ToSigned2sCompl(load32((const uint8_t*)addr + 4));
 	return (rx_vec_f128)x.d;
 }
 
@@ -684,7 +684,7 @@ FORCE_INLINE rx_vec_i128 rx_load_vec_i128(rx_vec_i128 const* p) {
#if defined(NATIVE_LITTLE_ENDIAN)
 	return *p;
#else
-	uint32_t* ptr = (uint32_t*)p;
+	const uint32_t* ptr = (const uint32_t*)p;
 	rx_vec_i128 c;
 	c.u32[0] = load32(ptr + 0);
 	c.u32[1] = load32(ptr + 1);
@@ -708,8 +708,8 @@ FORCE_INLINE void rx_store_vec_i128(rx_vec_i128 *p, rx_vec_i128 b) {
 
 FORCE_INLINE rx_vec_f128 rx_cvt_packed_int_vec_f128(const void* addr) {
 	rx_vec_f128 x;
-	x.lo = (double)unsigned32ToSigned2sCompl(load32((uint8_t*)addr + 0));
-	x.hi = (double)unsigned32ToSigned2sCompl(load32((uint8_t*)addr + 4));
+	x.lo = (double)unsigned32ToSigned2sCompl(load32((const uint8_t*)addr + 0));
+	x.hi = (double)unsigned32ToSigned2sCompl(load32((const uint8_t*)addr + 4));
 	return x;
 }

diff --git a/src/RandomX/src/jit_compiler.hpp b/src/RandomX/src/jit_compiler.hpp
index 17fdad4e3..5b76fa5f9 100644
--- a/src/RandomX/src/jit_compiler.hpp
+++ b/src/RandomX/src/jit_compiler.hpp
@@ -28,10 +28,48 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #pragma once
 
-#if defined(_M_X64) || defined(__x86_64__)
+#include "common.hpp"
+
+namespace randomx {
+
+	struct CodeBuffer {
+		uint8_t* code;
+		int32_t codePos;
+		int32_t rcpCount;
+
+		void emit(const uint8_t* src, int32_t len) {
+			memcpy(&code[codePos], src, len);
+			codePos += len;
+		}
+
+		template<typename T>
+		void emit(T src) {
+			memcpy(&code[codePos], &src, sizeof(src));
+			codePos += sizeof(src);
+		}
+
+		void emitAt(int32_t codePos, const uint8_t* src, int32_t len) {
+			memcpy(&code[codePos], src, len);
+		}
+
+		template<typename T>
+		void emitAt(int32_t codePos, T src) {
+			memcpy(&code[codePos], &src, sizeof(src));
+		}
+	};
+
+	struct CompilerState : public CodeBuffer {
+		int32_t instructionOffsets[RANDOMX_PROGRAM_SIZE];
+		int registerUsage[RegistersCount];
+	};
+}
+
+#if defined(RANDOMX_COMPILER_X86)
 #include "jit_compiler_x86.hpp"
-#elif defined(__aarch64__)
+#elif defined(RANDOMX_COMPILER_A64)
 #include "jit_compiler_a64.hpp"
+#elif defined(RANDOMX_COMPILER_RV64)
+#include "jit_compiler_rv64.hpp"
 #else
 #include "jit_compiler_fallback.hpp"
 #endif

diff --git a/src/RandomX/src/jit_compiler_a64.cpp b/src/RandomX/src/jit_compiler_a64.cpp
index fc4634868..5be8f6e42 100644
--- a/src/RandomX/src/jit_compiler_a64.cpp
+++ b/src/RandomX/src/jit_compiler_a64.cpp
@@ -31,7 +31,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "superscalar.hpp"
 #include "program.hpp"
 #include "reciprocal.h"
-#include "virtual_memory.hpp"
+#include "virtual_memory.h"
 
 namespace ARMV8A {
 
@@ -130,8 +130,8 @@ void JitCompilerA64::generateProgram(Program& program, ProgramConfiguration& config)
 	// and w16, w10, ScratchpadL3Mask64
 	emit32(0x121A0000 | 16 | (10 << 5) | ((Log2(RANDOMX_SCRATCHPAD_L3) - 7) << 10), code, codePos);
 
-	// and w17, w18, ScratchpadL3Mask64
-	emit32(0x121A0000 | 17 | (18 << 5) | ((Log2(RANDOMX_SCRATCHPAD_L3) - 7) << 10), code, codePos);
+	// and w17, w20, ScratchpadL3Mask64
+	emit32(0x121A0000 | 17 | (20 << 5) | ((Log2(RANDOMX_SCRATCHPAD_L3) - 7) << 10), code, codePos);
 
 	codePos = PrologueSize;
 	literalPos = ImulRcpLiteralsEnd;
@@ -149,16 +149,16 @@ void JitCompilerA64::generateProgram(Program& program, ProgramConfiguration& config)
 	}
 
 	// Update spMix2
-	// eor w18, config.readReg2, config.readReg3
-	emit32(ARMV8A::EOR32 | 18 | (IntRegMap[config.readReg2] << 5) | (IntRegMap[config.readReg3] << 16), code, codePos);
+	// eor w20, config.readReg2, config.readReg3
+	emit32(ARMV8A::EOR32 | 20 | (IntRegMap[config.readReg2] << 5) | (IntRegMap[config.readReg3] << 16), code, codePos);
 
 	// Jump back to the main loop
 	const uint32_t offset = (((uint8_t*)randomx_program_aarch64_vm_instructions_end) - ((uint8_t*)randomx_program_aarch64)) - codePos;
 	emit32(ARMV8A::B | (offset / 4), code, codePos);
 
-	// and w18, w18, CacheLineAlignMask
+	// and w20, w20, CacheLineAlignMask
 	codePos = (((uint8_t*)randomx_program_aarch64_cacheline_align_mask1) - ((uint8_t*)randomx_program_aarch64));
-	emit32(0x121A0000 | 18 | (18 << 5) | ((Log2(RANDOMX_DATASET_BASE_SIZE) - 7) << 10), code, codePos);
+	emit32(0x121A0000 | 20 | (20 << 5) | ((Log2(RANDOMX_DATASET_BASE_SIZE) - 7) << 10), code, codePos);
 
 	// and w10, w10, CacheLineAlignMask
 	codePos = (((uint8_t*)randomx_program_aarch64_cacheline_align_mask2) - ((uint8_t*)randomx_program_aarch64));
@@ -181,8 +181,8 @@ void JitCompilerA64::generateProgramLight(Program& program, ProgramConfiguration& config)
 	// and w16, w10, ScratchpadL3Mask64
 	emit32(0x121A0000 | 16 | (10 << 5) | ((Log2(RANDOMX_SCRATCHPAD_L3) - 7) << 10), code, codePos);
 
-	// and w17, w18, ScratchpadL3Mask64
-	emit32(0x121A0000 | 17 | (18 << 5) | ((Log2(RANDOMX_SCRATCHPAD_L3) - 7) << 10), code, codePos);
+	// and w17, w20, ScratchpadL3Mask64
+	emit32(0x121A0000 | 17 | (20 << 5) | ((Log2(RANDOMX_SCRATCHPAD_L3) - 7) << 10), code, codePos);
 
 	codePos = PrologueSize;
 	literalPos = ImulRcpLiteralsEnd;
@@ -200,8 +200,8 @@ void JitCompilerA64::generateProgramLight(Program& program, ProgramConfiguration& config)
 	}
 
 	// Update spMix2
-	// eor w18, config.readReg2, config.readReg3
-	emit32(ARMV8A::EOR32 | 18 | (IntRegMap[config.readReg2] << 5) | (IntRegMap[config.readReg3] << 16), code, codePos);
+	// eor w20, config.readReg2, config.readReg3
+	emit32(ARMV8A::EOR32 | 20 | (IntRegMap[config.readReg2] << 5) | (IntRegMap[config.readReg3] << 16), code, codePos);
 
 	// Jump back to the main loop
 	const uint32_t offset = (((uint8_t*)randomx_program_aarch64_vm_instructions_end_light) - ((uint8_t*)randomx_program_aarch64)) - codePos;
@@ -434,7 +434,7 @@ void JitCompilerA64::emitAddImmediate(uint32_t dst, uint32_t src, uint32_t imm,
 	}
 	else
 	{
-		constexpr uint32_t tmp_reg = 18;
+		constexpr uint32_t tmp_reg = 20;
 		emitMovImmediate(tmp_reg, imm, code, k);
 
 		// add dst, src, tmp_reg
@@ -483,7 +483,7 @@ void JitCompilerA64::emitMemLoadFP(uint32_t src, Instruction& instr, uint8_t* code,
 	uint32_t k = codePos;
 	uint32_t imm = instr.getImm32();
 
-	constexpr uint32_t tmp_reg = 18;
+	constexpr uint32_t tmp_reg = 19;
 
 	imm &= instr.getModMem() ? (RANDOMX_SCRATCHPAD_L1 - 1) : (RANDOMX_SCRATCHPAD_L2 - 1);
 	emitAddImmediate(tmp_reg, src, imm, code, k);
@@ -537,7 +537,7 @@ void JitCompilerA64::h_IADD_M(Instruction& instr, uint32_t& codePos)
 	const uint32_t src = IntRegMap[instr.src];
 	const uint32_t dst = IntRegMap[instr.dst];
 
-	constexpr uint32_t tmp_reg = 18;
+	constexpr uint32_t tmp_reg = 20;
 	emitMemLoad(dst, src, instr, code, k);
 
 	// add dst, dst, tmp_reg
@@ -575,7 +575,7 @@ void JitCompilerA64::h_ISUB_M(Instruction& instr, uint32_t& codePos)
 	const uint32_t src = IntRegMap[instr.src];
 	const uint32_t dst = IntRegMap[instr.dst];
 
-	constexpr uint32_t tmp_reg = 18;
+	constexpr uint32_t tmp_reg = 20;
 	emitMemLoad(dst, src, instr, code, k);
 
 	// sub dst, dst, tmp_reg
@@ -594,7 +594,7 @@ void JitCompilerA64::h_IMUL_R(Instruction& instr, uint32_t& codePos)
 
 	if (src == dst)
 	{
-		src = 18;
+		src = 20;
 		emitMovImmediate(src, instr.getImm32(), code, k);
 	}
 
@@ -612,7 +612,7 @@ void JitCompilerA64::h_IMUL_M(Instruction& instr, uint32_t& codePos)
 	const uint32_t src = IntRegMap[instr.src];
 	const uint32_t dst = IntRegMap[instr.dst];
 
-	constexpr uint32_t tmp_reg = 18;
+	constexpr uint32_t tmp_reg = 20;
 	emitMemLoad(dst, src, instr, code, k);
 
 	// sub dst, dst, tmp_reg
@@ -643,7 +643,7 @@ void JitCompilerA64::h_IMULH_M(Instruction& instr, uint32_t& codePos)
 	const uint32_t src = IntRegMap[instr.src];
 	const uint32_t dst = IntRegMap[instr.dst];
 
-	constexpr uint32_t tmp_reg = 18;
+	constexpr uint32_t tmp_reg = 20;
 	emitMemLoad(dst, src, instr, code, k);
 
 	// umulh dst, dst, tmp_reg
@@ -674,7 +674,7 @@ void JitCompilerA64::h_ISMULH_M(Instruction& instr, uint32_t& codePos)
 	const uint32_t src = IntRegMap[instr.src];
 	const uint32_t dst = IntRegMap[instr.dst];
 
-	constexpr uint32_t tmp_reg = 18;
+	constexpr uint32_t tmp_reg = 20;
 	emitMemLoad(dst, src, instr, code, k);
 
 	// smulh dst, dst, tmp_reg
@@ -686,34 +686,24 @@ void JitCompilerA64::h_ISMULH_M(Instruction& instr, uint32_t& codePos)
 
 void JitCompilerA64::h_IMUL_RCP(Instruction& instr, uint32_t& codePos)
 {
-	const uint64_t divisor = instr.getImm32();
+	const uint32_t divisor = instr.getImm32();
 	if (isZeroOrPowerOf2(divisor))
 		return;
 
 	uint32_t k = codePos;
 
-	constexpr uint32_t tmp_reg = 18;
+	constexpr uint32_t tmp_reg = 20;
 	const uint32_t dst = IntRegMap[instr.dst];
 
-	constexpr uint64_t N = 1ULL << 63;
-	const uint64_t q = N / divisor;
-	const uint64_t r = N % divisor;
-#ifdef __GNUC__
-	const uint64_t shift = 64 - __builtin_clzll(divisor);
-#else
-	uint64_t shift = 32;
-	for (uint64_t k = 1U << 31; (k & divisor) == 0; k >>= 1)
-		--shift;
-#endif
-
 	const uint32_t literal_id = (ImulRcpLiteralsEnd - literalPos) / sizeof(uint64_t);
 
 	literalPos -= sizeof(uint64_t);
-	*(uint64_t*)(code + literalPos) = (q << shift) + ((r << shift) / divisor);
 
-	if (literal_id < 13)
+	const uint64_t reciprocal = randomx_reciprocal_fast(divisor);
+	memcpy(code + literalPos, &reciprocal, sizeof(reciprocal));
+
+	if (literal_id < 12)
 	{
-		static constexpr uint32_t literal_regs[13] = { 30 << 16, 29 << 16, 28 << 16, 27 << 16, 26 << 16, 25 << 16, 24 << 16, 23 << 16, 22 << 16, 21 << 16, 20 << 16, 11 << 16, 0 };
+		static constexpr uint32_t literal_regs[12] = { 30 << 16, 29 << 16, 28 << 16, 27 << 16, 26 << 16, 25 << 16, 24 << 16, 23 << 16, 22 << 16, 21 << 16, 11 << 16, 0 };
 
 		// mul dst, dst, literal_reg
 		emit32(ARMV8A::MUL | dst | (dst << 5) | literal_regs[literal_id], code, k);
@@ -751,7 +741,7 @@ void JitCompilerA64::h_IXOR_R(Instruction& instr, uint32_t& codePos)
 
 	if (src == dst)
 	{
-		src = 18;
+		src = 20;
 		emitMovImmediate(src, instr.getImm32(), code, k);
 	}
 
@@ -769,7 +759,7 @@ void JitCompilerA64::h_IXOR_M(Instruction& instr, uint32_t& codePos)
 	const uint32_t src = IntRegMap[instr.src];
 	const uint32_t dst = IntRegMap[instr.dst];
 
-	constexpr uint32_t tmp_reg = 18;
+	constexpr uint32_t tmp_reg = 20;
 	emitMemLoad(dst, src, instr, code, k);
 
 	// eor dst, dst, tmp_reg
@@ -807,7 +797,7 @@ void JitCompilerA64::h_IROL_R(Instruction& instr, uint32_t& codePos)
 
 	if (src != dst)
 	{
-		constexpr uint32_t tmp_reg = 18;
+		constexpr uint32_t tmp_reg = 20;
 
 		// sub tmp_reg, xzr, src
 		emit32(ARMV8A::SUB | tmp_reg | (31 << 5) | (src << 16), code, k);
@@ -835,7 +825,7 @@ void JitCompilerA64::h_ISWAP_R(Instruction& instr, uint32_t& codePos)
 
 	uint32_t k = codePos;
 
-	constexpr uint32_t tmp_reg = 18;
+	constexpr uint32_t tmp_reg = 20;
 	emit32(ARMV8A::MOV_REG | tmp_reg | (dst << 16), code, k);
 	emit32(ARMV8A::MOV_REG | dst | (src << 16), code, k);
 	emit32(ARMV8A::MOV_REG | src | (tmp_reg << 16), code, k);
@@ -984,7 +974,7 @@ void JitCompilerA64::h_CFROUND(Instruction& instr, uint32_t& codePos)
 
 	const uint32_t src = IntRegMap[instr.src];
 
-	constexpr uint32_t tmp_reg = 18;
+	constexpr uint32_t tmp_reg = 20;
 	constexpr uint32_t fpcr_tmp_reg = 8;
 
 	// ror tmp_reg, src, imm
@@ -1008,7 +998,7 @@ void JitCompilerA64::h_ISTORE(Instruction& instr, uint32_t& codePos)
 	const uint32_t src = IntRegMap[instr.src];
 	const uint32_t dst = IntRegMap[instr.dst];
 
-	constexpr uint32_t tmp_reg = 18;
+	constexpr uint32_t tmp_reg = 20;
 
 	uint32_t imm = instr.getImm32();

diff --git a/src/RandomX/src/jit_compiler_a64.hpp b/src/RandomX/src/jit_compiler_a64.hpp
index a02824ffb..f8484c083 100644
--- a/src/RandomX/src/jit_compiler_a64.hpp
+++ b/src/RandomX/src/jit_compiler_a64.hpp
@@ -81,7 +81,7 @@ namespace randomx {
 
 		static void emit64(uint64_t val, uint8_t* code, uint32_t& codePos)
 		{
-			*(uint64_t*)(code + codePos) = val;
+			memcpy(code + codePos, &val, sizeof(val));
 			codePos += sizeof(val);
 		}

diff --git a/src/RandomX/src/jit_compiler_a64_static.S b/src/RandomX/src/jit_compiler_a64_static.S
index 4886fcf3c..bc146133a 100644
--- a/src/RandomX/src/jit_compiler_a64_static.S
+++ b/src/RandomX/src/jit_compiler_a64_static.S
@@ -74,9 +74,9 @@
 # x15 -> "r7"
 # x16 -> spAddr0
 # x17 -> spAddr1
-# x18 -> temporary
+# x18 -> unused (platform register, don't touch it)
 # x19 -> temporary
-# x20 -> literal for IMUL_RCP
+# x20 -> temporary
 # x21 -> literal for IMUL_RCP
 # x22 -> literal for IMUL_RCP
 # x23 -> literal for IMUL_RCP
@@ -111,7 +111,7 @@ DECL(randomx_program_aarch64):
 	# Save callee-saved registers
 	sub	sp, sp, 192
 	stp	x16, x17, [sp]
-	stp	x18, x19, [sp, 16]
+	str	x19, [sp, 16]
 	stp	x20, x21, [sp, 32]
 	stp	x22, x23, [sp, 48]
 	stp	x24, x25, [sp, 64]
@@ -166,7 +166,6 @@ DECL(randomx_program_aarch64):
 	# Read literals
 	ldr	x0, literal_x0
 	ldr	x11, literal_x11
-	ldr	x20, literal_x20
 	ldr	x21, literal_x21
 	ldr	x22, literal_x22
 	ldr	x23, literal_x23
@@ -198,11 +197,11 @@ DECL(randomx_program_aarch64):
 DECL(randomx_program_aarch64_main_loop):
 	# spAddr0 = spMix1 & ScratchpadL3Mask64;
 	# spAddr1 = (spMix1 >> 32) & ScratchpadL3Mask64;
-	lsr	x18, x10, 32
+	lsr	x20, x10, 32
 
 	# Actual mask will be inserted by JIT compiler
 	and	w16, w10, 1
-	and	w17, w18, 1
+	and	w17, w20, 1
 
 	# x16 = scratchpad + spAddr0
 	# x17 = scratchpad + spAddr1
 	add	x16, x16, x2
 	add	x17, x17, x2
 
 	# xor integer registers with scratchpad data (spAddr0)
-	ldp	x18, x19, [x16]
-	eor	x4, x4, x18
+	ldp	x20, x19, [x16]
+	eor	x4, x4, x20
 	eor	x5, x5, x19
-	ldp	x18, x19, [x16, 16]
-	eor	x6, x6, x18
+	ldp	x20, x19, [x16, 16]
+	eor	x6, x6, x20
 	eor	x7, x7, x19
-	ldp	x18, x19, [x16, 32]
-	eor	x12, x12, x18
+	ldp	x20, x19, [x16, 32]
+	eor	x12, x12, x20
 	eor	x13, x13, x19
-	ldp	x18, x19, [x16, 48]
-	eor	x14, x14, x18
+	ldp	x20, x19, [x16, 48]
+	eor	x14, x14, x20
 	eor	x15, x15, x19
 
 	# Load group F registers (spAddr1)
-	ldpsw	x18, x19, [x17]
-	ins	v16.d[0], x18
+	ldpsw	x20, x19, [x17]
+	ins	v16.d[0], x20
 	ins	v16.d[1], x19
-	ldpsw	x18, x19, [x17, 8]
-	ins	v17.d[0], x18
+	ldpsw	x20, x19, [x17, 8]
+	ins	v17.d[0], x20
 	ins	v17.d[1], x19
-	ldpsw	x18, x19, [x17, 16]
-	ins	v18.d[0], x18
+	ldpsw	x20, x19, [x17, 16]
+	ins	v18.d[0], x20
 	ins	v18.d[1], x19
-	ldpsw	x18, x19, [x17, 24]
-	ins	v19.d[0], x18
+	ldpsw	x20, x19, [x17, 24]
+	ins	v19.d[0], x20
 	ins	v19.d[1], x19
 	scvtf	v16.2d, v16.2d
 	scvtf	v17.2d, v17.2d
 	scvtf	v18.2d, v18.2d
 	scvtf	v19.2d, v19.2d
 
 	# Load group E registers (spAddr1)
-	ldpsw	x18, x19, [x17, 32]
-	ins	v20.d[0], x18
+	ldpsw	x20, x19, [x17, 32]
+	ins	v20.d[0], x20
 	ins	v20.d[1], x19
-	ldpsw	x18, x19, [x17, 40]
-	ins	v21.d[0], x18
+	ldpsw	x20, x19, [x17, 40]
+	ins	v21.d[0], x20
 	ins	v21.d[1], x19
-	ldpsw	x18, x19, [x17, 48]
-	ins	v22.d[0], x18
+	ldpsw	x20, x19, [x17, 48]
+	ins	v22.d[0], x20
 	ins	v22.d[1], x19
-	ldpsw	x18, x19, [x17, 56]
-	ins	v23.d[0], x18
+	ldpsw	x20, x19, [x17, 56]
+	ins	v23.d[0], x20
 	ins	v23.d[1], x19
 	scvtf	v20.2d, v20.2d
 	scvtf	v21.2d, v21.2d
@@ -276,7 +275,6 @@ DECL(randomx_program_aarch64_vm_instructions):
 
 literal_x0: .fill 1,8,0
 literal_x11: .fill 1,8,0
-literal_x20: .fill 1,8,0
 literal_x21: .fill 1,8,0
 literal_x22: .fill 1,8,0
 literal_x23: .fill 1,8,0
@@ -312,17 +310,17 @@ DECL(randomx_program_aarch64_vm_instructions_end):
 	lsr	x10, x9, 32
 
 	# mx ^= r[readReg2] ^ r[readReg3];
-	eor	x9, x9, x18
+	eor	x9, x9, x20
 
 	# Calculate dataset pointer for dataset prefetch
-	mov	w18, w9
+	mov	w20, w9
DECL(randomx_program_aarch64_cacheline_align_mask1):
 	# Actual mask will be inserted by JIT compiler
-	and	x18, x18, 1
-	add	x18, x18, x1
+	and	x20, x20, 1
+	add	x20, x20, x1
 
 	# Prefetch dataset data
-	prfm	pldl2strm, [x18]
+	prfm	pldl2strm, [x20]
 
 	# mx <-> ma
 	ror	x9, x9, 32
@@ -335,17 +333,17 @@ DECL(randomx_program_aarch64_cacheline_align_mask2):
 DECL(randomx_program_aarch64_xor_with_dataset_line):
rx_program_xor_with_dataset_line:
 	# xor integer registers with dataset data
-	ldp	x18, x19, [x10]
-	eor	x4, x4, x18
+	ldp	x20, x19, [x10]
+	eor	x4, x4, x20
 	eor	x5, x5, x19
-	ldp	x18, x19, [x10, 16]
-	eor	x6, x6, x18
+	ldp	x20, x19, [x10, 16]
+	eor	x6, x6, x20
 	eor	x7, x7, x19
-	ldp	x18, x19, [x10, 32]
-	eor	x12, x12, x18
+	ldp	x20, x19, [x10, 32]
+	eor	x12, x12, x20
 	eor	x13, x13, x19
-	ldp	x18, x19, [x10, 48]
-	eor	x14, x14, x18
+	ldp	x20, x19, [x10, 48]
+	eor	x14, x14, x20
 	eor	x15, x15, x19
 
DECL(randomx_program_aarch64_update_spMix1):
@@ -388,7 +386,7 @@ DECL(randomx_program_aarch64_update_spMix1):
 
 	# Restore callee-saved registers
 	ldp	x16, x17, [sp]
-	ldp	x18, x19, [sp, 16]
+	ldr	x19, [sp, 16]
 	ldp	x20, x21, [sp, 32]
 	ldp	x22, x23, [sp, 48]
 	ldp	x24, x25, [sp, 64]
@@ -409,7 +407,7 @@ DECL(randomx_program_aarch64_vm_instructions_end_light):
 	stp	x2, x30, [sp, 80]
 
 	# mx ^= r[readReg2] ^ r[readReg3];
-	eor	x9, x9, x18
+	eor	x9, x9, x20
 
 	# mx <-> ma
 	ror	x9, x9, 32
@@ -451,8 +449,8 @@ DECL(randomx_program_aarch64_light_dataset_offset):
 # x3 -> end item
 
DECL(randomx_init_dataset_aarch64):
-	# Save x30 (return address)
-	str	x30, [sp, -16]!
+	# Save x20 (used as temporary, but must be saved to not break ABI) and x30 (return address)
+	stp	x20, x30, [sp, -16]!
 
 	# Load pointer to cache memory
 	ldr	x0, [x0]
@@ -464,8 +462,8 @@ DECL(randomx_init_dataset_aarch64_main_loop):
 	cmp	x2, x3
 	bne	DECL(randomx_init_dataset_aarch64_main_loop)
 
-	# Restore x30 (return address)
-	ldr	x30, [sp], 16
+	# Restore x20 and x30
+	ldp	x20, x30, [sp], 16
 
 	ret

diff --git a/src/RandomX/src/jit_compiler_rv64.cpp b/src/RandomX/src/jit_compiler_rv64.cpp
new file mode 100644
index 000000000..6f0842e5f
--- /dev/null
+++ b/src/RandomX/src/jit_compiler_rv64.cpp
@@ -0,0 +1,1175 @@
+/*
+Copyright (c) 2023 tevador <tevador@gmail.com>
+
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+	* Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	* Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	* Neither the name of the copyright holder nor the
+	  names of its contributors may be used to endorse or promote products
+	  derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#include <stdexcept>
+#include <cstring>
+#include <cassert>
+#include <vector>
+#include "jit_compiler_rv64.hpp"
+#include "jit_compiler_rv64_static.hpp"
+#include "superscalar.hpp"
+#include "program.hpp"
+#include "reciprocal.h"
+#include "virtual_memory.h"
+
+namespace {
+#define HANDLER_ARGS randomx::CompilerState& state, randomx::Instruction isn, int i
+	using InstructionHandler = void(HANDLER_ARGS);
+	extern InstructionHandler* opcodeMap1[256];
+}
+
+namespace rv64 {
+	constexpr uint16_t C_LUI = 0x6001;
+	constexpr uint32_t LUI = 0x00000037;
+	constexpr uint16_t C_ADDI = 0x0001;
+	constexpr uint32_t ADDI = 0x00000013;
+	constexpr uint32_t ADDIW = 0x0000001b;
+	constexpr uint16_t C_ADD = 0x9002;
+	constexpr uint32_t ADD = 0x00000033;
+	constexpr uint32_t SHXADD = 0x20000033; //Zba
+	constexpr uint32_t SLL = 0x00001033;
+	constexpr uint32_t SRL = 0x00005033;
+	constexpr uint32_t SLLI = 0x00001013;
+	constexpr uint32_t C_SLLI = 0x0002;
+	constexpr uint32_t SRLI = 0x00005013;
+	constexpr uint32_t AND = 0x00007033;
+	constexpr uint32_t ANDI = 0x00007013;
+	constexpr uint16_t C_AND = 0x8c61;
+	constexpr uint16_t C_ANDI = 0x8801;
+	constexpr uint32_t OR = 0x00006033;
+	constexpr uint16_t C_OR = 0x8c41;
+	constexpr uint32_t XOR = 0x00004033;
+	constexpr uint16_t C_XOR = 0x8c21;
+	constexpr uint32_t LD = 0x00003003;
+	constexpr uint16_t C_LD = 0x6000;
+	constexpr uint16_t C_LW = 0x4000;
+	constexpr uint32_t SD = 0x00003023;
+	constexpr uint32_t SUB = 0x40000033;
+	constexpr uint16_t C_SUB = 0x8c01;
+	constexpr uint32_t MUL = 0x02000033;
+	constexpr uint32_t MULHU = 0x02003033;
+	constexpr uint32_t MULH = 0x02001033;
+	constexpr uint16_t C_MV = 0x8002;
+	constexpr uint32_t ROR = 0x60005033; //Zbb
+	constexpr uint32_t RORI = 0x60005013; //Zbb
+	constexpr uint32_t ROL = 0x60001033; //Zbb
+	constexpr uint32_t FMV_X_D = 0xe2000053;
+	constexpr uint32_t FMV_D_X = 0xf2000053;
+	constexpr uint32_t FMV_D = 0x22000053;
+	constexpr uint32_t FADD_D = 0x02007053;
+	constexpr uint32_t FSUB_D = 0x0a007053;
+	constexpr uint32_t FMUL_D = 0x12007053;
+	constexpr uint32_t FDIV_D = 0x1a007053;
+	constexpr uint32_t FSQRT_D = 0x5a007053;
+	constexpr uint32_t FCVT_D_W = 0xd2000053;
+	constexpr uint32_t FSRM = 0x00201073;
+	constexpr uint16_t C_BEQZ = 0xc001;
+	constexpr uint32_t BEQ = 0x00000063;
+	constexpr uint16_t C_BNEZ = 0xe001;
+	constexpr uint32_t JAL = 0x0000006f;
+	constexpr uint16_t C_RET = 0x8082;
+}
+
+namespace randomx {
+
+	constexpr size_t MaxRandomXInstrCodeSize = 56;   //FDIV_M requires 56 bytes of rv64 code
+	constexpr size_t MaxSuperscalarInstrSize = 12;   //IXOR_C requires 12 bytes of rv64 code
+	constexpr size_t SuperscalarProgramHeader = 136; //overhead per superscalar program
+	constexpr size_t CodeAlign = 4096;               //align code size to a multiple of 4 KiB
+	constexpr size_t LiteralPoolSize = CodeAlign;
+	constexpr size_t SuperscalarLiteraPoolSize = RANDOMX_CACHE_ACCESSES * CodeAlign;
+	constexpr size_t ReserveCodeSize = CodeAlign;    //prologue, epilogue + reserve
+
+	constexpr size_t RandomXCodeSize = alignSize(LiteralPoolSize + ReserveCodeSize + MaxRandomXInstrCodeSize * RANDOMX_PROGRAM_SIZE, CodeAlign);
+	constexpr size_t SuperscalarSize = alignSize(SuperscalarLiteraPoolSize + ReserveCodeSize + (SuperscalarProgramHeader + MaxSuperscalarInstrSize * SuperscalarMaxSize) * RANDOMX_CACHE_ACCESSES, CodeAlign);
+
+	static_assert(RandomXCodeSize < INT32_MAX / 2, "RandomXCodeSize is too large");
+	static_assert(SuperscalarSize < INT32_MAX / 2, "SuperscalarSize is too large");
+
+	constexpr uint32_t CodeSize = RandomXCodeSize + SuperscalarSize;
+	constexpr uint32_t ExecutableSize = CodeSize - LiteralPoolSize;
+
+	constexpr int32_t LiteralPoolOffset = LiteralPoolSize / 2;
+	constexpr int32_t SuperScalarLiteralPoolOffset = RandomXCodeSize;
+	constexpr int32_t SuperScalarLiteralPoolRefOffset = RandomXCodeSize + (RANDOMX_CACHE_ACCESSES - 1) * LiteralPoolSize + LiteralPoolOffset;
+	constexpr int32_t SuperScalarHashOffset = SuperScalarLiteralPoolOffset + SuperscalarLiteraPoolSize;
+
+	constexpr int maskLog2(uint32_t x, int prev) {
+		return x == 1 ? prev : maskLog2(x >> 1, prev + 1);
+	}
+
+	constexpr int32_t unsigned32ToSigned2sCompl(uint32_t x) {
+		return (-1 == ~0) ? (int32_t)x : (x > INT32_MAX ? (-(int32_t)(UINT32_MAX - x) - 1) : (int32_t)x);
+	}
+
+	constexpr int MaskL1Shift = 32 - maskLog2(RANDOMX_SCRATCHPAD_L1, 0);
+	constexpr int MaskL2Shift = 32 - maskLog2(RANDOMX_SCRATCHPAD_L2, 0);
+	constexpr int MaskL3Shift = 32 - maskLog2(RANDOMX_SCRATCHPAD_L3, 0);
+
+	constexpr int RcpLiteralsOffset = 144;
+
+	constexpr int LiteralPoolReg = 3;  //x3
+	constexpr int SpadReg = 5;         //x5
+	constexpr int DataReg = 6;         //x6
+	constexpr int SuperscalarReg = 7;  //x7
+	constexpr int SshTmp1Reg = 28;     //x28
+	constexpr int SshTmp2Reg = 29;     //x29
+	constexpr int SshPoolReg = 30;     //x30
+	constexpr int SshRcpReg = 31;      //x31
+	constexpr int Tmp1Reg = 8;         //x8
+	constexpr int Tmp2Reg = 9;         //x9
+	constexpr int Tmp1RegF = 24;       //f24
+	constexpr int Tmp2RegF = 25;       //f25
+	constexpr int MaskL1Reg = 10;      //x10
+	constexpr int MaskL2Reg = 11;      //x11
+	constexpr int MaskFscalReg = 12;   //x12
+	constexpr int MaskEclear = 13;     //x13
+	constexpr int MaskEsetLo = 14;     //x14
+	constexpr int MaskEsetHi = 15;     //x15
+	constexpr int MaskL3Reg = 1;       //x1
+	constexpr int ReturnReg = 1;       //x1
+	constexpr int SpAddr0Reg = 26;     //x26
+	constexpr int OffsetXC = -8;       //x8-x15
+	constexpr int OffsetR = 16;        //x16-x23
+	constexpr int OffsetF = 0;         //f0-f7
+	constexpr int OffsetE = 8;         //f8-f15
+	constexpr int OffsetA = 16;        //f16-f23
+	constexpr int OffsetRcp = 28;      //x28-x31
+	constexpr int OffsetRcpF = 22;     //f26-f31
+	constexpr int OffsetSsh = 8;       //x8-x15
+
+	//destination register (bit 7+)
+	constexpr int rvrd(int reg) {
+		return reg << 7;
+	}
+
+	//first source register (bit 15+)
+	constexpr int rvrs1(int reg) {
+		return reg << 15;
+	}
+
+	//second source register (bit 20+)
+	constexpr int rvrs2(int reg) {
+		return reg << 20;
+	}
+
+	//compressed source register (bit 2+)
+	constexpr int rvcrs(int reg) {
+		return reg << 2;
+	}
+
+	//base instruction: {op} x{rd}, x{rs1}, x{rs2}
+	constexpr uint32_t rvi(uint32_t op, int rd, int rs1, int rs2 = 0) {
+		return op | rvrs2(rs2) | rvrs1(rs1) | rvrd(rd);
+	}
+
+	//compressed instruction: op x{rd}, x{rs}
+	constexpr uint16_t rvc(uint16_t op, int rd, int rs) {
+		return op | rvrd(rd) | rvcrs(rs);
+	}
+
+	//compressed instruction: op x{rd}, imm6
+	constexpr uint16_t rvc(uint16_t op, int imm5, int rd, int imm40) {
+		return op | (imm5 << 12) | rvrd(rd) | (imm40 << 2);
+	}
+
+	constexpr int regR(int reg) {
+		return reg + OffsetR;
+	}
+
+	constexpr int regLoA(int reg) {
+		return 2 * reg + OffsetA;
+	}
+
+	constexpr int regHiA(int reg) {
+		return 2 * reg + OffsetA + 1;
+	}
+
+	constexpr int regLoF(int reg) {
+		return 2 * reg + OffsetF;
+	}
+
+	constexpr int regHiF(int reg) {
+		return 2 * reg + OffsetF + 1;
+	}
+
+	constexpr int regLoE(int reg) {
+		return 2 * reg + OffsetE;
+	}
+
+	constexpr int regHiE(int reg) {
+		return 2 * reg + OffsetE + 1;
+	}
+
+	constexpr int regRcp(int reg) {
+		return reg + OffsetRcp;
+	}
+
+	constexpr int regRcpF(int reg) {
+		return reg + OffsetRcpF;
+	}
+
+	constexpr int regSS(int reg) {
+		return reg + OffsetSsh;
+	}
+
+	static const uint8_t* codeLiterals = (uint8_t*)&randomx_riscv64_literals;
+	static const uint8_t* codeLiteralsEnd = (uint8_t*)&randomx_riscv64_literals_end;
+	static const uint8_t* codeDataInit = (uint8_t*)&randomx_riscv64_data_init;
+	static const uint8_t* codeFixDataCall = (uint8_t*)&randomx_riscv64_fix_data_call;
+	static const uint8_t* codePrologue = (uint8_t*)&randomx_riscv64_prologue;
+	static const uint8_t* codeLoopBegin = (uint8_t*)&randomx_riscv64_loop_begin;
+	static const uint8_t* codeDataRead = (uint8_t*)&randomx_riscv64_data_read;
+	static const uint8_t* codeDataReadLight = (uint8_t*)&randomx_riscv64_data_read_light;
+	static const uint8_t* codeFixLoopCall = (uint8_t*)&randomx_riscv64_fix_loop_call;
+	static const uint8_t* codeSpadStore = (uint8_t*)&randomx_riscv64_spad_store;
+	static const uint8_t* codeSpadStoreHardAes = (uint8_t*)&randomx_riscv64_spad_store_hardaes;
+	static const uint8_t* codeSpadStoreSoftAes = (uint8_t*)&randomx_riscv64_spad_store_softaes;
+	static const uint8_t* codeLoopEnd = (uint8_t*)&randomx_riscv64_loop_end;
+	static const uint8_t* codeFixContinueLoop = (uint8_t*)&randomx_riscv64_fix_continue_loop;
+	static const uint8_t* codeEpilogue = (uint8_t*)&randomx_riscv64_epilogue;
+	static const uint8_t* codeSoftAes = (uint8_t*)&randomx_riscv64_softaes;
+	static const uint8_t* codeProgramEnd = (uint8_t*)&randomx_riscv64_program_end;
+	static const uint8_t* codeSshInit = (uint8_t*)&randomx_riscv64_ssh_init;
+	static const uint8_t* codeSshLoad = (uint8_t*)&randomx_riscv64_ssh_load;
+	static const uint8_t* codeSshPrefetch = (uint8_t*)&randomx_riscv64_ssh_prefetch;
+	static const uint8_t* codeSshEnd = (uint8_t*)&randomx_riscv64_ssh_end;
+
+	static const int32_t sizeLiterals = codeLiteralsEnd - codeLiterals;
+	static const int32_t sizeDataInit = codePrologue - codeDataInit;
+	static const int32_t sizePrologue = codeLoopBegin - codePrologue;
+	static const int32_t sizeLoopBegin = codeDataRead - codeLoopBegin;
+	static const int32_t sizeDataRead = codeDataReadLight - codeDataRead;
+	static const int32_t sizeDataReadLight = codeSpadStore - codeDataReadLight;
+	static const int32_t sizeSpadStore = codeSpadStoreHardAes - codeSpadStore;
+	static const int32_t sizeSpadStoreSoftAes = codeLoopEnd - codeSpadStoreSoftAes;
+	static const int32_t sizeLoopEnd = codeEpilogue - codeLoopEnd;
+	static const int32_t sizeEpilogue = codeSoftAes - codeEpilogue;
+	static const int32_t sizeSoftAes = codeProgramEnd - codeSoftAes;
+	static const int32_t sizeSshInit = codeSshLoad - codeSshInit;
+	static const int32_t sizeSshLoad = codeSshPrefetch - codeSshLoad;
+	static const int32_t sizeSshPrefetch = codeSshEnd - codeSshPrefetch;
+
+	static const int32_t offsetFixDataCall = codeFixDataCall - codeDataInit;
+	static const int32_t offsetFixLoopCall = codeFixLoopCall - codeDataReadLight;
+	static const int32_t offsetFixContinueLoop = codeFixContinueLoop - codeLoopEnd;
+ + static const int32_t LoopTopPos = LiteralPoolSize + sizeDataInit + sizePrologue; + static const int32_t RandomXCodePos = LoopTopPos + sizeLoopBegin; + + static void clearCache(CodeBuffer& buf) { +#ifdef __GNUC__ + __builtin___clear_cache((char*)buf.code, (char*)(buf.code + CodeSize)); +#endif + } + + //emits code to calculate: x{dst} = x{src} + {imm32} + //takes 1-3 isns, 2-10 bytes + static void emitImm32(CodeBuffer& buf, int32_t imm, int dst, int src = 0, int tmp = 0) { + + //lower 12 bits + int32_t limm = (imm << 20) >> 20; + //upper 20 bits + int32_t uimm = (imm >> 12) + (limm < 0); + + //If there are no upper bits, the whole thing + //can be done with a single instruction. + if (uimm == 0) { + //addi x{dst}, x{src}, {limm} + buf.emit(rvi(rv64::ADDI, dst, src, limm)); + return; + } + + //dst1 is the register where imm will be materialized + int dst1 = src != dst ? dst : tmp; + assert(dst1 != 0); + //src1 is the register that will be added to the result + int src1 = src != dst ? src : dst1; + + //load upper bits + if (uimm >= -32 && uimm <= 31) { + //c.lui x{dst1}, {uimm} + buf.emit(rvc(rv64::C_LUI, (uimm < 0), dst1, (uimm & 31))); + } + else { + //lui x{dst1}, {uimm} + buf.emit(rv64::LUI | (uimm << 12) | rvrd(dst1)); + } + //load lower bits + if (limm != 0) { + //Note: this must be addiw NOT addi, otherwise the upper 32 bits + //of the 64-bit register will be incorrect. + //addiw x{dst1}, x{dst1}, {limm} + buf.emit(rvi(rv64::ADDIW, dst1, dst1, limm)); + } + //add src + if (src1 != 0) { + //c.add x{dst}, x{src1} + buf.emit(rvc(rv64::C_ADD, dst, src1)); + } + } + + //x9 = &Scratchpad[isn.imm] + //takes 3 isns, 10 bytes + static void genAddressRegImm(CodeBuffer& buf, const Instruction& isn) { + //signed offset 8-byte aligned + int32_t imm = unsigned32ToSigned2sCompl(isn.getImm32()) & ScratchpadL3Mask; + //x9 = x5 + {imm} + emitImm32(buf, imm, Tmp2Reg, SpadReg, Tmp1Reg); + } + + //x9 = &Scratchpad[isn.src + isn.imm] (for reading) + //takes 5 isns, 12 bytes + static void genAddressReg(CodeBuffer& buf, const Instruction& isn) { + int shift, maskReg; + if (isn.getModMem()) { + shift = MaskL1Shift; + maskReg = MaskL1Reg; + } + else { + shift = MaskL2Shift; + maskReg = MaskL2Reg; + } + int32_t imm = unsigned32ToSigned2sCompl(isn.getImm32()); + imm = (imm << shift) >> shift; + //x9 = x{src} + {imm} + emitImm32(buf, imm, Tmp2Reg, regR(isn.src), Tmp1Reg); + //c.and x9, x{maskReg} + buf.emit(rvc(rv64::C_AND, (Tmp2Reg + OffsetXC), (maskReg + OffsetXC))); + //c.add x9, x{spadReg} + buf.emit(rvc(rv64::C_ADD, Tmp2Reg, SpadReg)); + } + + //x8 = Scratchpad[isn] + static void loadFromScratchpad(CodeBuffer& buf, const Instruction& isn) { + if (isn.src != isn.dst) { + //x9 = &Scratchpad[isn.src + isn.imm] + genAddressReg(buf, isn); + } + else { + ///x9 = &Scratchpad[isn.imm] + genAddressRegImm(buf, isn); + } + //c.ld x8, 0(x9) + buf.emit(rvc(rv64::C_LD, Tmp2Reg + OffsetXC, Tmp1Reg + OffsetXC)); + } + + //x9 = &Scratchpad[isn.dst + isn.imm32] (for writing) + //takes 5 isns, 12-16 bytes + static void genAddressRegDst(CodeBuffer& buf, const Instruction& isn) { + if (isn.getModCond() < StoreL3Condition) { + int shift, maskReg; + if (isn.getModMem()) { + shift = MaskL1Shift; + maskReg = MaskL1Reg; + } + else { + shift = MaskL2Shift; + maskReg = MaskL2Reg; + } + int32_t imm = unsigned32ToSigned2sCompl(isn.getImm32()); + imm = (imm << shift) >> shift; + //x9 = x{dst} + {imm} + emitImm32(buf, imm, Tmp2Reg, regR(isn.dst), Tmp1Reg); + //c.and x9, x{maskReg} + buf.emit(rvc(rv64::C_AND, Tmp2Reg + OffsetXC, maskReg 
+ OffsetXC)); + //c.add x9, x5 + buf.emit(rvc(rv64::C_ADD, Tmp2Reg, SpadReg)); + } + else { + int shift = MaskL3Shift; + int32_t imm = unsigned32ToSigned2sCompl(isn.getImm32()); + imm = (imm << shift) >> shift; + //x9 = x{dst} + {imm} + emitImm32(buf, imm, Tmp2Reg, regR(isn.dst), Tmp1Reg); + //and x9, x9, x1 + buf.emit(rvi(rv64::AND, Tmp2Reg, Tmp2Reg, MaskL3Reg)); + //c.add x9, x5 + buf.emit(rvc(rv64::C_ADD, Tmp2Reg, SpadReg)); + } + } + + static void emitRcpLiteral1(CodeBuffer& buf, uint64_t literal) { + //first 238 at positive offsets + if (buf.rcpCount < 238) { + buf.emitAt(LiteralPoolOffset + RcpLiteralsOffset + buf.rcpCount * 8, literal); + buf.rcpCount++; + } + //next 256 at negative offsets + else if (buf.rcpCount < 494) { + buf.emitAt(buf.rcpCount * 8 - (2048 - RcpLiteralsOffset), literal); + buf.rcpCount++; + } + else { + //checked at compile time, but double-check here + throw std::runtime_error("Literal pool overflow"); + } + } + + static void emitRcpLiteral2(CodeBuffer& buf, uint64_t literal, int32_t numLiterals) { + //store the current literal in the pool + int32_t offset = 2040 - buf.rcpCount * 8; + buf.emitAt(SuperScalarLiteralPoolRefOffset + offset, literal); + buf.rcpCount++; + if (buf.rcpCount >= numLiterals) { + return; + } + //load the next literal + offset -= 8; + int32_t imm = offset & 0xfff; + //ld x31, {offset}(x30) + buf.emit(rvi(rv64::LD, SshRcpReg, SshPoolReg, imm)); + if (imm == 0x800) { + //move pool pointer back 4KB + //c.lui x29, 0xfffff + buf.emit(rvc(rv64::C_LUI, 1, SshTmp2Reg, 31)); + //c.add x30, x29 + buf.emit(rvc(rv64::C_ADD, SshPoolReg, SshTmp2Reg)); + } + } + + static void emitJump(CodeBuffer& buf, int dst, int32_t codePos, int32_t targetPos) { + int32_t imm = targetPos - codePos; + int32_t imm20 = (imm < 0) << 11; + int32_t imm1912 = (imm >> 7) & 8160; + int32_t imm11 = (imm >> 11) & 1; + int32_t imm101 = imm & 2046; + //jal x{dst}, {imm} + buf.emitAt(codePos, rvi(rv64::JAL, dst + imm1912, 0, imm20 + imm101 + imm11)); + } + + static void emitInstruction(CompilerState& state, Instruction isn, int i) { + state.instructionOffsets[i] = state.codePos; + opcodeMap1[isn.opcode](state, isn, i); + } + + static void emitProgramPrefix(CompilerState& state, Program& prog, ProgramConfiguration& pcfg) { + state.codePos = RandomXCodePos; + state.rcpCount = 0; + state.emitAt(LiteralPoolOffset + sizeLiterals, pcfg.eMask[0]); + state.emitAt(LiteralPoolOffset + sizeLiterals + 8, pcfg.eMask[1]); + for (unsigned i = 0; i < RegistersCount; ++i) { + state.registerUsage[i] = -1; + } + for (unsigned i = 0; i < prog.getSize(); ++i) { + Instruction instr = prog(i); + instr.src %= RegistersCount; + instr.dst %= RegistersCount; + emitInstruction(state, instr, i); + } + } + + static void emitProgramSuffix(CompilerState& state, ProgramConfiguration& pcfg) { + state.emit(codeSpadStore, sizeSpadStore); + int32_t fixPos = state.codePos; + state.emit(codeLoopEnd, sizeLoopEnd); + //xor x26, x{readReg0}, x{readReg1} + state.emitAt(fixPos, rvi(rv64::XOR, SpAddr0Reg, regR(pcfg.readReg0), regR(pcfg.readReg1))); + fixPos += offsetFixContinueLoop; + //j LoopTop + emitJump(state, 0, fixPos, LoopTopPos); + state.emit(codeEpilogue, sizeEpilogue); + } + + static void generateSuperscalarCode(CodeBuffer& buf, Instruction isn, const std::vector& reciprocalCache) { + switch ((SuperscalarInstructionType)isn.opcode) + { + case randomx::SuperscalarInstructionType::ISUB_R: + //c.sub x{dst}, x{src} + buf.emit(rvc(rv64::C_SUB, regSS(isn.dst) + OffsetXC, regSS(isn.src) + OffsetXC)); + break; + case 
randomx::SuperscalarInstructionType::IXOR_R: + //c.xor x{dst}, x{src} + buf.emit(rvc(rv64::C_XOR, regSS(isn.dst) + OffsetXC, regSS(isn.src) + OffsetXC)); + break; + case randomx::SuperscalarInstructionType::IADD_RS: + { + int shift = isn.getModShift(); + if (shift == 0) { + //c.add x{dst}, x{src} + buf.emit(rvc(rv64::C_ADD, regSS(isn.dst), regSS(isn.src))); + } + else { +#ifdef __riscv_zba + //sh{1,2,3}add x{dst}, x{src}, x{dst} + buf.emit(rv64::SHXADD | rvrs2(regSS(isn.dst)) | rvrs1(regSS(isn.src)) | (shift << 13) | rvrd(regSS(isn.dst))); +#else + //slli x28, x{src}, {shift} + buf.emit(rvi(rv64::SLLI, SshTmp1Reg, regSS(isn.src), shift)); + //c.add x{dst}, x28 + buf.emit(rvc(rv64::C_ADD, regSS(isn.dst), SshTmp1Reg)); +#endif + } + } + break; + case randomx::SuperscalarInstructionType::IMUL_R: + //mul x{dst}, x{dst}, x{src} + buf.emit(rvi(rv64::MUL, regSS(isn.dst), regSS(isn.dst), regSS(isn.src))); + break; + case randomx::SuperscalarInstructionType::IROR_C: + { +#ifdef __riscv_zbb + int32_t imm = isn.getImm32() & 63; + //rori x{dst}, x{dst}, {imm} + buf.emit(rvi(rv64::RORI, regSS(isn.dst), regSS(isn.dst), imm)); +#else + int32_t immr = isn.getImm32() & 63; + int32_t imml = -immr & 63; + int32_t imml5 = imml >> 5; + int32_t imml40 = imml & 31; + //srli x28, x{dst}, {immr} + buf.emit(rvi(rv64::SRLI, SshTmp1Reg, regSS(isn.dst), immr)); + //c.slli x{dst}, {imml} + buf.emit(rvc(rv64::C_SLLI, imml5, regSS(isn.dst), imml40)); + //or x{dst}, x{dst}, x28 + buf.emit(rvi(rv64::OR, regSS(isn.dst), regSS(isn.dst), SshTmp1Reg)); +#endif + } + break; + case randomx::SuperscalarInstructionType::IADD_C7: + case randomx::SuperscalarInstructionType::IADD_C8: + case randomx::SuperscalarInstructionType::IADD_C9: + { + int32_t imm = unsigned32ToSigned2sCompl(isn.getImm32()); + //x{dst} = x{dst} + {imm} + emitImm32(buf, imm, regSS(isn.dst), regSS(isn.dst), SshTmp1Reg); + } + break; + case randomx::SuperscalarInstructionType::IXOR_C7: + case randomx::SuperscalarInstructionType::IXOR_C8: + case randomx::SuperscalarInstructionType::IXOR_C9: + { + int32_t imm = unsigned32ToSigned2sCompl(isn.getImm32()); + //x28 = {imm} + emitImm32(buf, imm, SshTmp1Reg); + //xor x{dst}, x{dst}, x28 + buf.emit(rvi(rv64::XOR, regSS(isn.dst), regSS(isn.dst), SshTmp1Reg)); + } + break; + case randomx::SuperscalarInstructionType::IMULH_R: + //mulhu x{dst}, x{dst}, x{src} + buf.emit(rvi(rv64::MULHU, regSS(isn.dst), regSS(isn.dst), regSS(isn.src))); + break; + case randomx::SuperscalarInstructionType::ISMULH_R: + //mulh x{dst}, x{dst}, x{src} + buf.emit(rvi(rv64::MULH, regSS(isn.dst), regSS(isn.dst), regSS(isn.src))); + break; + case randomx::SuperscalarInstructionType::IMUL_RCP: + //mul x{dst}, x{dst}, x31 + buf.emit(rvi(rv64::MUL, regSS(isn.dst), regSS(isn.dst), SshRcpReg)); + //load the next literal into x31 + emitRcpLiteral2(buf, reciprocalCache[isn.getImm32()], reciprocalCache.size()); + break; + default: + UNREACHABLE; + } + } + + size_t JitCompilerRV64::getCodeSize() { + return CodeSize; + } + + JitCompilerRV64::JitCompilerRV64() { + state.code = (uint8_t*)allocMemoryPages(CodeSize); + if (state.code == nullptr) + throw std::runtime_error("allocMemoryPages"); + state.emitAt(LiteralPoolOffset, codeLiterals, sizeLiterals); + state.emitAt(LiteralPoolSize, codeDataInit, sizeDataInit + sizePrologue + sizeLoopBegin); + entryDataInit = state.code + LiteralPoolSize; + entryProgram = state.code + LiteralPoolSize + sizeDataInit; + //jal x1, SuperscalarHash + emitJump(state, ReturnReg, LiteralPoolSize + offsetFixDataCall, 
SuperScalarHashOffset); + } + + JitCompilerRV64::~JitCompilerRV64() { + freePagedMemory(state.code, CodeSize); + } + + void JitCompilerRV64::enableAll() { + setPagesRWX(entryDataInit, ExecutableSize); + } + + void JitCompilerRV64::enableWriting() { + setPagesRW(entryDataInit, ExecutableSize); + } + + void JitCompilerRV64::enableExecution() { + setPagesRX(entryDataInit, ExecutableSize); + } + + void JitCompilerRV64::generateProgram(Program& prog, ProgramConfiguration& pcfg) { + emitProgramPrefix(state, prog, pcfg); + int32_t fixPos = state.codePos; + state.emit(codeDataRead, sizeDataRead); + //xor x8, x{readReg2}, x{readReg3} + state.emitAt(fixPos, rvi(rv64::XOR, Tmp1Reg, regR(pcfg.readReg2), regR(pcfg.readReg3))); + emitProgramSuffix(state, pcfg); + clearCache(state); + } + + void JitCompilerRV64::generateProgramLight(Program& prog, ProgramConfiguration& pcfg, uint32_t datasetOffset) { + emitProgramPrefix(state, prog, pcfg); + int32_t fixPos = state.codePos; + state.emit(codeDataReadLight, sizeDataReadLight); + //xor x8, x{readReg2}, x{readReg3} + state.emitAt(fixPos, rvi(rv64::XOR, Tmp1Reg, regR(pcfg.readReg2), regR(pcfg.readReg3))); + int32_t imm = datasetOffset / CacheLineSize; + int32_t limm = (imm << 20) >> 20; + int32_t uimm = (imm >> 12) + (limm < 0); + //lui x9, {uimm} + state.emitAt(fixPos + 4, rv64::LUI | (uimm << 12) | rvrd(Tmp2Reg)); + //addi x9, x9, {limm} + state.emitAt(fixPos + 8, rvi(rv64::ADDI, Tmp2Reg, Tmp2Reg, limm)); + fixPos += offsetFixLoopCall; + //jal x1, SuperscalarHash + emitJump(state, ReturnReg, fixPos, SuperScalarHashOffset); + emitProgramSuffix(state, pcfg); + clearCache(state); + } + + void JitCompilerRV64::generateSuperscalarHash(SuperscalarProgram programs[RANDOMX_CACHE_ACCESSES], std::vector<uint64_t>& reciprocalCache) { + state.codePos = SuperScalarHashOffset; + state.rcpCount = 0; + state.emit(codeSshInit, sizeSshInit); + for (unsigned j = 0; j < RANDOMX_CACHE_ACCESSES; ++j) { + SuperscalarProgram& prog = programs[j]; + for (unsigned i = 0; i < prog.getSize(); ++i) { + Instruction instr = prog(i); + generateSuperscalarCode(state, instr, reciprocalCache); + } + state.emit(codeSshLoad, sizeSshLoad); + if (j < RANDOMX_CACHE_ACCESSES - 1) { + int32_t fixPos = state.codePos; + state.emit(codeSshPrefetch, sizeSshPrefetch); + //and x7, x{addrReg}, x7 + state.emitAt(fixPos, rvi(rv64::AND, SuperscalarReg, regSS(prog.getAddressRegister()), SuperscalarReg)); + } + } + state.emit(rvc(rv64::C_RET, 0, 0)); + clearCache(state); + } + + static void v1_IADD_RS(HANDLER_ARGS) { + state.registerUsage[isn.dst] = i; + int shift = isn.getModShift(); + if (shift == 0) { + //c.add x{dst}, x{src} + state.emit(rvc(rv64::C_ADD, regR(isn.dst), regR(isn.src))); + } + else { +#ifdef __riscv_zba + //sh{1,2,3}add x{dst}, x{src}, x{dst} + state.emit(rv64::SHXADD | rvrs2(regR(isn.dst)) | rvrs1(regR(isn.src)) | (shift << 13) | rvrd(regR(isn.dst))); +#else + //slli x8, x{src}, {shift} + state.emit(rvi(rv64::SLLI, Tmp1Reg, regR(isn.src), shift)); + //c.add x{dst}, x8 + state.emit(rvc(rv64::C_ADD, regR(isn.dst), Tmp1Reg)); +#endif + } + if (isn.dst == RegisterNeedsDisplacement) { + int32_t imm = unsigned32ToSigned2sCompl(isn.getImm32()); + //x{dst} = x{dst} + {imm} + emitImm32(state, imm, regR(isn.dst), regR(isn.dst), Tmp1Reg); + } + } + + static void v1_IADD_M(HANDLER_ARGS) { + state.registerUsage[isn.dst] = i; + loadFromScratchpad(state, isn); + //c.add x{dst}, x8 + state.emit(rvc(rv64::C_ADD, regR(isn.dst), Tmp1Reg)); + } + + static void v1_ISUB_R(HANDLER_ARGS) { + state.registerUsage[isn.dst] = i; 
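+ //src == dst: the operand is the immediate; it is negated below so the subtraction can be emitted as an addition by emitImm32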
+ if (isn.src != isn.dst) { + //sub x{dst}, x{dst}, x{src} + state.emit(rvi(rv64::SUB, regR(isn.dst), regR(isn.dst), regR(isn.src))); + } + else { + int32_t imm = unsigned32ToSigned2sCompl(-isn.getImm32()); //convert to add + //x{dst} = x{dst} + {-imm} + emitImm32(state, imm, regR(isn.dst), regR(isn.dst), Tmp1Reg); + } + } + + static void v1_ISUB_M(HANDLER_ARGS) { + state.registerUsage[isn.dst] = i; + loadFromScratchpad(state, isn); + //sub x{dst}, x{dst}, x8 + state.emit(rvi(rv64::SUB, regR(isn.dst), regR(isn.dst), Tmp1Reg)); + } + + static void v1_IMUL_R(HANDLER_ARGS) { + state.registerUsage[isn.dst] = i; + if (isn.src != isn.dst) { + //mul x{dst}, x{dst}, x{src} + state.emit(rvi(rv64::MUL, regR(isn.dst), regR(isn.dst), regR(isn.src))); + } + else { + int32_t imm = unsigned32ToSigned2sCompl(isn.getImm32()); + //x8 = {imm} + emitImm32(state, imm, Tmp1Reg); + //mul x{dst}, x{dst}, x8 + state.emit(rvi(rv64::MUL, regR(isn.dst), regR(isn.dst), Tmp1Reg)); + } + } + + static void v1_IMUL_M(HANDLER_ARGS) { + state.registerUsage[isn.dst] = i; + loadFromScratchpad(state, isn); + //mul x{dst}, x{dst}, x8 + state.emit(rvi(rv64::MUL, regR(isn.dst), regR(isn.dst), Tmp1Reg)); + } + + static void v1_IMULH_R(HANDLER_ARGS) { + state.registerUsage[isn.dst] = i; + //mulhu x{dst}, x{dst}, x{src} + state.emit(rvi(rv64::MULHU, regR(isn.dst), regR(isn.dst), regR(isn.src))); + } + + static void v1_IMULH_M(HANDLER_ARGS) { + state.registerUsage[isn.dst] = i; + loadFromScratchpad(state, isn); + //mulhu x{dst}, x{dst}, x8 + state.emit(rvi(rv64::MULHU, regR(isn.dst), regR(isn.dst), Tmp1Reg)); + } + + static void v1_ISMULH_R(HANDLER_ARGS) { + state.registerUsage[isn.dst] = i; + //mulh x{dst}, x{dst}, x{src} + state.emit(rvi(rv64::MULH, regR(isn.dst), regR(isn.dst), regR(isn.src))); + } + + static void v1_ISMULH_M(HANDLER_ARGS) { + state.registerUsage[isn.dst] = i; + loadFromScratchpad(state, isn); + //mulh x{dst}, x{dst}, x8 + state.emit(rvi(rv64::MULH, regR(isn.dst), regR(isn.dst), Tmp1Reg)); + } + + static void v1_IMUL_RCP(HANDLER_ARGS) { + const uint32_t divisor = isn.getImm32(); + if (!isZeroOrPowerOf2(divisor)) { + state.registerUsage[isn.dst] = i; + if (state.rcpCount < 4) { + //mul x{dst}, x{dst}, x{rcp} + state.emit(rvi(rv64::MUL, regR(isn.dst), regR(isn.dst), regRcp(state.rcpCount))); + } + else if (state.rcpCount < 10) { + //fmv.x.d x8, f{rcp} + state.emit(rvi(rv64::FMV_X_D, Tmp1Reg, regRcpF(state.rcpCount))); + //mul x{dst}, x{dst}, x8 + state.emit(rvi(rv64::MUL, regR(isn.dst), regR(isn.dst), Tmp1Reg)); + } + else { + int32_t offset = RcpLiteralsOffset + state.rcpCount * 8; + //ld x8, {offset}(x3) + state.emit(rvi(rv64::LD, Tmp1Reg, LiteralPoolReg, offset)); + //mul x{dst}, x{dst}, x8 + state.emit(rvi(rv64::MUL, regR(isn.dst), regR(isn.dst), Tmp1Reg)); + } + emitRcpLiteral1(state, randomx_reciprocal_fast(divisor)); + } + } + + static void v1_INEG_R(HANDLER_ARGS) { + state.registerUsage[isn.dst] = i; + //sub x{dst}, x0, x{dst} + state.emit(rvi(rv64::SUB, regR(isn.dst), 0, regR(isn.dst))); + } + + static void v1_IXOR_R(HANDLER_ARGS) { + state.registerUsage[isn.dst] = i; + if (isn.src != isn.dst) { + //xor x{dst}, x{dst}, x{src} + state.emit(rvi(rv64::XOR, regR(isn.dst), regR(isn.dst), regR(isn.src))); + } + else { + int32_t imm = unsigned32ToSigned2sCompl(isn.getImm32()); + //x8 = {imm} + emitImm32(state, imm, Tmp1Reg); + //xor x{dst}, x{dst}, x8 + state.emit(rvi(rv64::XOR, regR(isn.dst), regR(isn.dst), Tmp1Reg)); + } + } + + static void v1_IXOR_M(HANDLER_ARGS) { + state.registerUsage[isn.dst] = i; + 
loadFromScratchpad(state, isn); + //xor x{dst}, x{dst}, x8 + state.emit(rvi(rv64::XOR, regR(isn.dst), regR(isn.dst), Tmp1Reg)); + } + + static void v1_IROR_R(HANDLER_ARGS) { + state.registerUsage[isn.dst] = i; +#ifdef __riscv_zbb + if (isn.src != isn.dst) { + //ror x{dst}, x{dst}, x{src} + state.emit(rvi(rv64::ROR, regR(isn.dst), regR(isn.dst), regR(isn.src))); + } + else { + int32_t imm = isn.getImm32() & 63; + //rori x{dst}, x{dst}, {imm} + state.emit(rvi(rv64::RORI, regR(isn.dst), regR(isn.dst), imm)); + } +#else + if (isn.src != isn.dst) { + //sub x8, x0, x{src} + state.emit(rvi(rv64::SUB, Tmp1Reg, 0, regR(isn.src))); + //srl x9, x{dst}, x{src} + state.emit(rvi(rv64::SRL, Tmp2Reg, regR(isn.dst), regR(isn.src))); + //sll x{dst}, x{dst}, x8 + state.emit(rvi(rv64::SLL, regR(isn.dst), regR(isn.dst), Tmp1Reg)); + //or x{dst}, x{dst}, x9 + state.emit(rvi(rv64::OR, regR(isn.dst), regR(isn.dst), Tmp2Reg)); + } + else { + int32_t immr = isn.getImm32() & 63; + int32_t imml = -immr & 63; + int32_t imml5 = imml >> 5; + int32_t imml40 = imml & 31; + //srli x8, x{dst}, {immr} + state.emit(rvi(rv64::SRLI, Tmp1Reg, regR(isn.dst), immr)); + //c.slli x{dst}, {imml} + state.emit(rvc(rv64::C_SLLI, imml5, regR(isn.dst), imml40)); + //or x{dst}, x{dst}, x8 + state.emit(rvi(rv64::OR, regR(isn.dst), regR(isn.dst), Tmp1Reg)); + } +#endif + } + + static void v1_IROL_R(HANDLER_ARGS) { + state.registerUsage[isn.dst] = i; +#ifdef __riscv_zbb + if (isn.src != isn.dst) { + //rol x{dst}, x{dst}, x{src} + state.emit(rvi(rv64::ROL, regR(isn.dst), regR(isn.dst), regR(isn.src))); + } + else { + int32_t imm = -isn.getImm32() & 63; + //rori x{dst}, x{dst}, {imm} + state.emit(rvi(rv64::RORI, regR(isn.dst), regR(isn.dst), imm)); + } +#else + if (isn.src != isn.dst) { + //sub x8, x0, x{src} + state.emit(rvi(rv64::SUB, Tmp1Reg, 0, regR(isn.src))); + //sll x9, x{dst}, x{src} + state.emit(rvi(rv64::SLL, Tmp2Reg, regR(isn.dst), regR(isn.src))); + //srl x{dst}, x{dst}, x8 + state.emit(rvi(rv64::SRL, regR(isn.dst), regR(isn.dst), Tmp1Reg)); + //or x{dst}, x{dst}, x9 + state.emit(rvi(rv64::OR, regR(isn.dst), regR(isn.dst), Tmp2Reg)); + } + else { + int32_t imml = isn.getImm32() & 63; + int32_t immr = -imml & 63; + int32_t imml5 = imml >> 5; + int32_t imml40 = imml & 31; + //srli x8, x{dst}, {immr} + state.emit(rvi(rv64::SRLI, Tmp1Reg, regR(isn.dst), immr)); + //c.slli x{dst}, {imml} + state.emit(rvc(rv64::C_SLLI, imml5, regR(isn.dst), imml40)); + //or x{dst}, x{dst}, x8 + state.emit(rvi(rv64::OR, regR(isn.dst), regR(isn.dst), Tmp1Reg)); + } +#endif + } + + static void v1_ISWAP_R(HANDLER_ARGS) { + if (isn.src != isn.dst) { + state.registerUsage[isn.dst] = i; + state.registerUsage[isn.src] = i; + //c.mv x8, x{dst} + state.emit(rvc(rv64::C_MV, Tmp1Reg, regR(isn.dst))); + //c.mv x{dst}, x{src} + state.emit(rvc(rv64::C_MV, regR(isn.dst), regR(isn.src))); + //c.mv x{src}, x8 + state.emit(rvc(rv64::C_MV, regR(isn.src), Tmp1Reg)); + } + } + + static void v1_FSWAP_R(HANDLER_ARGS) { + //fmv.d f24, f{dst_lo} + state.emit(rvi(rv64::FMV_D, Tmp1RegF, regLoF(isn.dst), regLoF(isn.dst))); + //fmv.d f{dst_lo}, f{dst_hi} + state.emit(rvi(rv64::FMV_D, regLoF(isn.dst), regHiF(isn.dst), regHiF(isn.dst))); + //fmv.d f{dst_hi}, f24 + state.emit(rvi(rv64::FMV_D, regHiF(isn.dst), Tmp1RegF, Tmp1RegF)); + } + + static void v1_FADD_R(HANDLER_ARGS) { + isn.dst %= RegisterCountFlt; + isn.src %= RegisterCountFlt; + //fadd.d f{dst_lo}, f{dst_lo}, f{src_lo} + state.emit(rvi(rv64::FADD_D, regLoF(isn.dst), regLoF(isn.dst), regLoA(isn.src))); + //fadd.d f{dst_hi}, 
f{dst_hi}, f{src_hi} + state.emit(rvi(rv64::FADD_D, regHiF(isn.dst), regHiF(isn.dst), regHiA(isn.src))); + } + + static void v1_FADD_M(HANDLER_ARGS) { + isn.dst %= RegisterCountFlt; + //x9 = mem + genAddressReg(state, isn); + //c.lw x8, 0(x9) + state.emit(rvc(rv64::C_LW, Tmp2Reg + OffsetXC, Tmp1Reg + OffsetXC)); + //c.lw x9, 4(x9) + state.emit(rvc(rv64::C_LW, Tmp2Reg + OffsetXC, 16 + Tmp2Reg + OffsetXC)); + //fcvt.d.w f24, x8 + state.emit(rvi(rv64::FCVT_D_W, Tmp1RegF, Tmp1Reg)); + //fcvt.d.w f25, x9 + state.emit(rvi(rv64::FCVT_D_W, Tmp2RegF, Tmp2Reg)); + //fadd.d f{dst_lo}, f{dst_lo}, f24 + state.emit(rvi(rv64::FADD_D, regLoF(isn.dst), regLoF(isn.dst), Tmp1RegF)); + //fadd.d f{dst_hi}, f{dst_hi}, f25 + state.emit(rvi(rv64::FADD_D, regHiF(isn.dst), regHiF(isn.dst), Tmp2RegF)); + } + + static void v1_FSUB_R(HANDLER_ARGS) { + isn.dst %= RegisterCountFlt; + isn.src %= RegisterCountFlt; + //fsub.d f{dst_lo}, f{dst_lo}, f{src_lo} + state.emit(rvi(rv64::FSUB_D, regLoF(isn.dst), regLoF(isn.dst), regLoA(isn.src))); + //fsub.d f{dst_hi}, f{dst_hi}, f{src_hi} + state.emit(rvi(rv64::FSUB_D, regHiF(isn.dst), regHiF(isn.dst), regHiA(isn.src))); + } + + static void v1_FSUB_M(HANDLER_ARGS) { + isn.dst %= RegisterCountFlt; + //x9 = mem + genAddressReg(state, isn); + //c.lw x8, 0(x9) + state.emit(rvc(rv64::C_LW, Tmp2Reg + OffsetXC, Tmp1Reg + OffsetXC)); + //c.lw x9, 4(x9) + state.emit(rvc(rv64::C_LW, Tmp2Reg + OffsetXC, 16 + Tmp2Reg + OffsetXC)); + //fcvt.d.w f24, x8 + state.emit(rvi(rv64::FCVT_D_W, Tmp1RegF, Tmp1Reg)); + //fcvt.d.w f25, x9 + state.emit(rvi(rv64::FCVT_D_W, Tmp2RegF, Tmp2Reg)); + //fsub.d f{dst_lo}, f{dst_lo}, f24 + state.emit(rvi(rv64::FSUB_D, regLoF(isn.dst), regLoF(isn.dst), Tmp1RegF)); + //fsub.d f{dst_hi}, f{dst_hi}, f25 + state.emit(rvi(rv64::FSUB_D, regHiF(isn.dst), regHiF(isn.dst), Tmp2RegF)); + } + + static void v1_FSCAL_R(HANDLER_ARGS) { + isn.dst %= RegisterCountFlt; + //fmv.x.d x8, f{dst_lo} + state.emit(rvi(rv64::FMV_X_D, Tmp1Reg, regLoF(isn.dst))); + //fmv.x.d x9, f{dst_hi} + state.emit(rvi(rv64::FMV_X_D, Tmp2Reg, regHiF(isn.dst))); + //c.xor x8, x12 + state.emit(rvc(rv64::C_XOR, Tmp1Reg + OffsetXC, MaskFscalReg + OffsetXC)); + //c.xor x9, x12 + state.emit(rvc(rv64::C_XOR, Tmp2Reg + OffsetXC, MaskFscalReg + OffsetXC)); + //fmv.d.x f{dst_lo}, x8 + state.emit(rvi(rv64::FMV_D_X, regLoF(isn.dst), Tmp1Reg)); + //fmv.d.x f{dst_hi}, x9 + state.emit(rvi(rv64::FMV_D_X, regHiF(isn.dst), Tmp2Reg)); + } + + static void v1_FMUL_R(HANDLER_ARGS) { + isn.dst %= RegisterCountFlt; + isn.src %= RegisterCountFlt; + //fmul.d f{dst_lo}, f{dst_lo}, f{src_lo} + state.emit(rvi(rv64::FMUL_D, regLoE(isn.dst), regLoE(isn.dst), regLoA(isn.src))); + //fmul.d f{dst_hi}, f{dst_hi}, f{src_hi} + state.emit(rvi(rv64::FMUL_D, regHiE(isn.dst), regHiE(isn.dst), regHiA(isn.src))); + } + + static void v1_FDIV_M(HANDLER_ARGS) { + isn.dst %= RegisterCountFlt; + //x9 = mem + genAddressReg(state, isn); + //c.lw x8, 0(x9) + state.emit(rvc(rv64::C_LW, Tmp2Reg + OffsetXC, Tmp1Reg + OffsetXC)); + //c.lw x9, 4(x9) + state.emit(rvc(rv64::C_LW, Tmp2Reg + OffsetXC, 16 + Tmp2Reg + OffsetXC)); + //fcvt.d.w f24, x8 + state.emit(rvi(rv64::FCVT_D_W, Tmp1RegF, Tmp1Reg)); + //fcvt.d.w f25, x9 + state.emit(rvi(rv64::FCVT_D_W, Tmp2RegF, Tmp2Reg)); + //fmv.x.d x8, f24 + state.emit(rvi(rv64::FMV_X_D, Tmp1Reg, Tmp1RegF)); + //fmv.x.d x9, f25 + state.emit(rvi(rv64::FMV_X_D, Tmp2Reg, Tmp2RegF)); + //c.and x8, x13 + state.emit(rvc(rv64::C_AND, Tmp1Reg + OffsetXC, MaskEclear + OffsetXC)); + //c.and x9, x13 + state.emit(rvc(rv64::C_AND, Tmp2Reg + 
OffsetXC, MaskEclear + OffsetXC)); + //c.or x8, x14 + state.emit(rvc(rv64::C_OR, Tmp1Reg + OffsetXC, MaskEsetLo + OffsetXC)); + //c.or x9, x15 + state.emit(rvc(rv64::C_OR, Tmp2Reg + OffsetXC, MaskEsetHi + OffsetXC)); + //fmv.d.x f24, x8 + state.emit(rvi(rv64::FMV_D_X, Tmp1RegF, Tmp1Reg)); + //fmv.d.x f25, x9 + state.emit(rvi(rv64::FMV_D_X, Tmp2RegF, Tmp2Reg)); + //fdiv.d f{dst_lo}, f{dst_lo}, f24 + state.emit(rvi(rv64::FDIV_D, regLoE(isn.dst), regLoE(isn.dst), Tmp1RegF)); + //fdiv.d f{dst_hi}, f{dst_hi}, f25 + state.emit(rvi(rv64::FDIV_D, regHiE(isn.dst), regHiE(isn.dst), Tmp2RegF)); + } + + static void v1_FSQRT_R(HANDLER_ARGS) { + isn.dst %= RegisterCountFlt; + //fsqrt.d f{dst_lo}, f{dst_lo} + state.emit(rvi(rv64::FSQRT_D, regLoE(isn.dst), regLoE(isn.dst))); + //fsqrt.d f{dst_hi}, f{dst_hi} + state.emit(rvi(rv64::FSQRT_D, regHiE(isn.dst), regHiE(isn.dst))); + } + + static void v1_CBRANCH(HANDLER_ARGS) { + int reg = isn.dst; + int target = state.registerUsage[reg] + 1; + int shift = isn.getModCond() + ConditionOffset; + int32_t imm = unsigned32ToSigned2sCompl(isn.getImm32()); + imm |= (1UL << shift); + if (ConditionOffset > 0 || shift > 0) + imm &= ~(1UL << (shift - 1)); + //x8 = branchMask + emitImm32(state, (int32_t)ConditionMask << shift, Tmp1Reg); + //x{dst} += {imm} + emitImm32(state, imm, regR(isn.dst), regR(isn.dst), Tmp2Reg); + //and x8, x8, x{dst} + state.emit(rvi(rv64::AND, Tmp1Reg, Tmp1Reg, regR(isn.dst))); + int32_t targetPos = state.instructionOffsets[target]; + int offset = targetPos - state.codePos; + if (offset >= -256) { //C.BEQZ only has a range of 256B + //c.beqz x8, {offset} + int imm8 = 1; //sign bit is always 1 + int imm21 = offset & 6; //offset[2:1] + int imm5 = (offset >> 5) & 1; //offset[5] + int imm43 = offset & 24; //offset[4:3] + int imm76 = (offset >> 3) & 24; //offset[7:6] + state.emit(rvc(rv64::C_BEQZ, imm8, imm43 + (Tmp1Reg + OffsetXC), imm76 + imm21 + imm5)); + } + else if (offset >= -4096) { //BEQ only has a range of 4KB + //beq x8, x0, offset + int imm12 = 1 << 11; //sign bit is always 1 + int imm105 = offset & 2016; //offset[10:5] + int imm41 = offset & 30; //offset[4:1] + int imm11 = (offset >> 11) & 1; //offset[11] + state.emit(rvi(rv64::BEQ, imm41 + imm11, Tmp1Reg, imm12 + imm105)); + } + else { + //c.bnez x8, +6 + state.emit(rvc(rv64::C_BNEZ, Tmp1Reg + OffsetXC, 6)); + //j targetPos + emitJump(state, 0, state.codePos, targetPos); + state.codePos += 4; + } + //mark all registers as used + for (unsigned j = 0; j < RegistersCount; ++j) { + state.registerUsage[j] = i; + } + } + + static void v1_CFROUND(HANDLER_ARGS) { + int32_t imm = (isn.getImm32() - 2) & 63; //-2 to avoid a later left shift to multiply by 4 + if (imm != 0) { +#ifdef __riscv_zbb + //rori x8, x{src}, {imm} + state.emit(rvi(rv64::RORI, Tmp1Reg, regR(isn.src), imm)); +#else + int32_t imml = -imm & 63; + //srli x8, x{src}, {imm} + state.emit(rvi(rv64::SRLI, Tmp1Reg, regR(isn.src), imm)); + //slli x9, x{src}, {imml} + state.emit(rvi(rv64::SLLI, Tmp2Reg, regR(isn.src), imml)); + //c.or x8, x9 + state.emit(rvc(rv64::C_OR, Tmp1Reg + OffsetXC, Tmp2Reg + OffsetXC)); +#endif + //c.andi x8, 12 + state.emit(rvc(rv64::C_ANDI, Tmp1Reg + OffsetXC, 12)); + } + else { + //andi x8, x{src}, 12 + state.emit(rvi(rv64::ANDI, Tmp1Reg, regR(isn.src), 12)); + } + //c.add x8, x3 + state.emit(rvc(rv64::C_ADD, Tmp1Reg, LiteralPoolReg)); + //c.lw x8, 64(x8) + state.emit(rvc(rv64::C_LW, Tmp1Reg + OffsetXC, 8 + Tmp1Reg + OffsetXC)); + //fsrm x8 + state.emit(rvi(rv64::FSRM, 0, Tmp1Reg, 0)); + } + + static void 
v1_ISTORE(HANDLER_ARGS) { + genAddressRegDst(state, isn); + //sd x{src}, 0(x9) + state.emit(rvi(rv64::SD, 0, Tmp2Reg, regR(isn.src))); + } + + static void v1_NOP(HANDLER_ARGS) { + } +} + +#include "instruction_weights.hpp" + +namespace { + +#define INST_HANDLE1(x) REPN(&randomx::v1_##x, WT(x)) +#define INST_HANDLE2(x) REPN(&randomx::v2_##x, WT(x)) + + InstructionHandler* opcodeMap1[256] = { + INST_HANDLE1(IADD_RS) + INST_HANDLE1(IADD_M) + INST_HANDLE1(ISUB_R) + INST_HANDLE1(ISUB_M) + INST_HANDLE1(IMUL_R) + INST_HANDLE1(IMUL_M) + INST_HANDLE1(IMULH_R) + INST_HANDLE1(IMULH_M) + INST_HANDLE1(ISMULH_R) + INST_HANDLE1(ISMULH_M) + INST_HANDLE1(IMUL_RCP) + INST_HANDLE1(INEG_R) + INST_HANDLE1(IXOR_R) + INST_HANDLE1(IXOR_M) + INST_HANDLE1(IROR_R) + INST_HANDLE1(IROL_R) + INST_HANDLE1(ISWAP_R) + INST_HANDLE1(FSWAP_R) + INST_HANDLE1(FADD_R) + INST_HANDLE1(FADD_M) + INST_HANDLE1(FSUB_R) + INST_HANDLE1(FSUB_M) + INST_HANDLE1(FSCAL_R) + INST_HANDLE1(FMUL_R) + INST_HANDLE1(FDIV_M) + INST_HANDLE1(FSQRT_R) + INST_HANDLE1(CBRANCH) + INST_HANDLE1(CFROUND) + INST_HANDLE1(ISTORE) + INST_HANDLE1(NOP) + }; +} \ No newline at end of file diff --git a/src/RandomX/src/jit_compiler_rv64.hpp b/src/RandomX/src/jit_compiler_rv64.hpp new file mode 100644 index 000000000..aaae57e36 --- /dev/null +++ b/src/RandomX/src/jit_compiler_rv64.hpp @@ -0,0 +1,69 @@ +/* +Copyright (c) 2023 tevador + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the copyright holder nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*/ + +#pragma once + +#include <cstdint> +#include <cstring> +#include <vector> +#include "jit_compiler.hpp" + +namespace randomx { + + class Program; + struct ProgramConfiguration; + class SuperscalarProgram; + class Instruction; + + class JitCompilerRV64 { + public: + JitCompilerRV64(); + ~JitCompilerRV64(); + void generateProgram(Program&, ProgramConfiguration&); + void generateProgramLight(Program&, ProgramConfiguration&, uint32_t); + void generateSuperscalarHash(SuperscalarProgram programs[RANDOMX_CACHE_ACCESSES], std::vector<uint64_t>&); + void generateDatasetInitCode() {} + ProgramFunc* getProgramFunc() { + return (ProgramFunc*)entryProgram; + } + DatasetInitFunc* getDatasetInitFunc() { + return (DatasetInitFunc*)entryDataInit; + } + uint8_t* getCode() { + return state.code; + } + size_t getCodeSize(); + void enableWriting(); + void enableExecution(); + void enableAll(); + private: + CompilerState state; + void* entryDataInit; + void* entryProgram; + }; +} diff --git a/src/RandomX/src/jit_compiler_rv64_static.S b/src/RandomX/src/jit_compiler_rv64_static.S new file mode 100644 index 000000000..240bbf5f4 --- /dev/null +++ b/src/RandomX/src/jit_compiler_rv64_static.S @@ -0,0 +1,1235 @@ +/* +Copyright (c) 2023 tevador + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the copyright holder nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*/ + +#define DECL(x) x + +.text +.option rvc + +#include "configuration.h" + +.global DECL(randomx_riscv64_literals) +.global DECL(randomx_riscv64_literals_end) +.global DECL(randomx_riscv64_data_init) +.global DECL(randomx_riscv64_fix_data_call) +.global DECL(randomx_riscv64_prologue) +.global DECL(randomx_riscv64_loop_begin) +.global DECL(randomx_riscv64_data_read) +.global DECL(randomx_riscv64_data_read_light) +.global DECL(randomx_riscv64_fix_loop_call) +.global DECL(randomx_riscv64_spad_store) +.global DECL(randomx_riscv64_spad_store_hardaes) +.global DECL(randomx_riscv64_spad_store_softaes) +.global DECL(randomx_riscv64_loop_end) +.global DECL(randomx_riscv64_fix_continue_loop) +.global DECL(randomx_riscv64_epilogue) +.global DECL(randomx_riscv64_softaes) +.global DECL(randomx_riscv64_program_end) +.global DECL(randomx_riscv64_ssh_init) +.global DECL(randomx_riscv64_ssh_load) +.global DECL(randomx_riscv64_ssh_prefetch) +.global DECL(randomx_riscv64_ssh_end) + +/* The literal pool can fit at most 494 IMUL_RCP literals */ +#if RANDOMX_PROGRAM_SIZE > 494 + #error RANDOMX_PROGRAM_SIZE larger than 494 is not supported. +#endif + +#define RANDOMX_CACHE_MASK (RANDOMX_ARGON_MEMORY*16-1) + +/* shared literal pool: 4 KB */ + /* space for 256 IMUL_RCP literals -2048 */ + /* filled by JIT compiler */ +DECL(randomx_riscv64_literals): +literal_pool: + /* SuperscalarHash constants +0 */ + .dword 6364136223846793005 + .dword 9298411001130361340 + .dword 12065312585734608966 + .dword 9306329213124626780 + .dword 5281919268842080866 + .dword 10536153434571861004 + .dword 3398623926847679864 + .dword 9549104520008361294 + /* CFROUND lookup table +64 */ + .word 0x00000000 /* RTN */ + .word 0x00000002 /* RDN */ + .word 0x00000003 /* RUP */ + .word 0x00000001 /* RTZ */ + /* mask literals +80,+84,+88,+92,+96,+104 */ + .word (RANDOMX_SCRATCHPAD_L1-8) + .word (RANDOMX_SCRATCHPAD_L2-8) + .word (RANDOMX_SCRATCHPAD_L3-64) + .word (RANDOMX_DATASET_BASE_SIZE-64) + .dword 0x80f0000000000000 + .dword 0x00ffffffffffffff +DECL(randomx_riscv64_literals_end): + /* E reg. 
set masks, +112,+120 */ + .dword 0 /* filled by JIT compiler */ + .dword 0 /* filled by JIT compiler */ + /* soft AES table addresses, +128,+136 */ + .dword 0 /* filled by JIT compiler */ + .dword 0 /* filled by JIT compiler */ + /* space for 238 IMUL_RCP literals, +144 */ + .fill 238,8,0 /* filled by JIT compiler */ + +/* ================================= */ +/* Dataset init function entry point */ +/* ================================= */ + +/* Register allocation: + ---------------------- + x0 -> zero + x1 -> temp/return address + x2 -> stack pointer (sp) + x3 -> literal pool pointer + x5 -> dataset pointer + x6 -> cache pointer + x7 -> temp/itemNumber + x8-x15 -> SuperscalarHash registers + x16 -> itemNumber + x17 -> endItem + x28-x31 -> temp + + Stack layout: + ------------------------ + sp+ + 0 -> return address + 8 -> saved x3 + 16 -> saved x8-x9 + 32 -> caller stack +*/ +DECL(randomx_riscv64_data_init): + addi sp, sp, -32 + /* dataset ptr */ + mv x5, x11 + /* cache->memory */ + ld x6, 0(x10) + /* callee saved registers */ + sd x1, 0(sp) + sd x3, 8(sp) + /* literal pool */ + lla x3, literal_pool + sd x8, 16(sp) + sd x9, 24(sp) + /* startItem */ + mv x16, x12 + /* endItem */ + mv x17, x13 +init_item: + mv x7, x16 +DECL(randomx_riscv64_fix_data_call): + jal superscalar_hash /* JIT compiler will adjust the offset */ + sd x8, 0(x5) + sd x9, 8(x5) + sd x10, 16(x5) + sd x11, 24(x5) + sd x12, 32(x5) + sd x13, 40(x5) + sd x14, 48(x5) + sd x15, 56(x5) + addi x5, x5, 64 + addi x16, x16, 1 + bltu x16, x17, init_item + ld x1, 0(sp) + ld x3, 8(sp) + ld x8, 16(sp) + ld x9, 24(sp) + addi sp, sp, 32 + ret + +/* ====================================== */ +/* Program execution function entry point */ +/* ====================================== */ + +/* Register allocation: + ---------------------- + x0 -> zero + x1 -> temp/scratchpad L3 mask + x2 -> stack pointer (sp) + x3 -> literal pool pointer + x5 -> scratchpad pointer + x6 -> dataset/cache pointer + x7 -> temp/next dataset access + x8 -> temp + x9 -> temp + x10 -> scratchpad L1 mask (0x0000000000003ff8) + x11 -> scratchpad L2 mask (0x000000000003fff8) + x12 -> FSCAL_R mask (0x80f0000000000000) + x13 -> E reg. clear mask (0x00ffffffffffffff) + x14 -> E reg. set mask (0x3*00000000******) + x15 -> E reg. set mask (0x3*00000000******) + x16-x23 -> VM registers "r0"-"r7" + x24 -> iteration counter "ic" + x25 -> VM registers "mx", "ma" + x26 -> spAddr0 + x27 -> spAddr1 + x28-x31 -> temp/literals for IMUL_RCP (4x) + + (Note: We avoid using x4 because it breaks debugging with gdb.) 
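+ (x4 is the thread pointer, tp, in the standard RISC-V calling convention.)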
+ + f0-f7 -> VM registers "f0"-"f3" + f8-f15 -> VM registers "e0"-"e3" + f16-f23 -> VM registers "a0"-"a3" + f24-f25 -> temp + f26-f31 -> literals for IMUL_RCP (6x) + + Stack layout: + ------------------------ + sp+ + 0 -> return address + 8 -> register file ptr + 16 -> saved x3-x4 + 32 -> saved x8-x9 + 48 -> saved x18-x27 + 128 -> saved f8-f9 + 144 -> saved f18-f27 + 224 -> caller stack +*/ + +DECL(randomx_riscv64_prologue): + addi sp, sp, -224 + /* scratchpad pointer */ + mv x5, x12 + /* register file pointer */ + sd x10, 8(sp) + /* callee saved registers */ + sd x3, 16(sp) + sd x8, 32(sp) + sd x9, 40(sp) + sd x18, 48(sp) + sd x19, 56(sp) + sd x20, 64(sp) + sd x21, 72(sp) + sd x22, 80(sp) + sd x23, 88(sp) + sd x24, 96(sp) + sd x25, 104(sp) + sd x26, 112(sp) + sd x27, 120(sp) + fsd f8, 128(sp) + fsd f9, 136(sp) + fsd f18, 144(sp) + fsd f19, 152(sp) + fsd f20, 160(sp) + fsd f21, 168(sp) + fsd f22, 176(sp) + fsd f23, 184(sp) + fsd f24, 192(sp) + fsd f25, 200(sp) + fsd f26, 208(sp) + fsd f27, 216(sp) + /* iteration counter */ + mv x24, x13 + /* return address */ + sd x1, 0(sp) + /* literal pool */ + lla x3, literal_pool + /* load (ma, mx) */ + ld x25, 0(x11) + /* dataset ptr */ + ld x6, 8(x11) + /* load dataset mask */ + lwu x1, 92(x3) + /* zero registers r0-r3, load a0-a1 */ + li x16, 0 + fld f16, 192(x10) + li x17, 0 + fld f17, 200(x10) + srli x7, x25, 32 /* x7 = ma */ + li x18, 0 + fld f18, 208(x10) + mv x27, x7 /* x27 = ma */ + li x19, 0 + fld f19, 216(x10) + /* set dataset read address */ + and x7, x7, x1 + add x7, x7, x6 + /* zero registers r4-r7, load a2-a3 */ + li x20, 0 + fld f20, 224(x10) + li x21, 0 + fld f21, 232(x10) + li x22, 0 + fld f22, 240(x10) + li x23, 0 + fld f23, 248(x10) + /* load L3 mask */ + lwu x1, 88(x3) + /* load scratchpad masks */ + lwu x10, 80(x3) + lwu x11, 84(x3) + /* set spAddr0, spAddr1 */ + and x26, x25, x1 + and x27, x27, x1 + add x26, x26, x5 + add x27, x27, x5 + /* align L3 mask */ + addi x1, x1, 56 + /* FSCAL, E reg. 
masks */ + ld x12, 96(x3) + ld x13, 104(x3) + ld x14, 112(x3) + ld x15, 120(x3) + /* IMUL_RCP literals */ + fld f26, 176(x3) + fld f27, 184(x3) + fld f28, 192(x3) + fld f29, 200(x3) + fld f30, 208(x3) + fld f31, 216(x3) + +.balign 4 +DECL(randomx_riscv64_loop_begin): +loop_begin: + /* mix integer registers */ + ld x8, 0(x26) + ld x9, 8(x26) + ld x30, 16(x26) + ld x31, 24(x26) + xor x16, x16, x8 + ld x8, 32(x26) + xor x17, x17, x9 + ld x9, 40(x26) + xor x18, x18, x30 + ld x30, 48(x26) + xor x19, x19, x31 + ld x31, 56(x26) + xor x20, x20, x8 + lw x8, 0(x27) + xor x21, x21, x9 + lw x9, 4(x27) + xor x22, x22, x30 + lw x30, 8(x27) + xor x23, x23, x31 + lw x31, 12(x27) + /* load F registers */ + fcvt.d.w f0, x8 + lw x8, 16(x27) + fcvt.d.w f1, x9 + lw x9, 20(x27) + fcvt.d.w f2, x30 + lw x30, 24(x27) + fcvt.d.w f3, x31 + lw x31, 28(x27) + fcvt.d.w f4, x8 + lw x8, 32(x27) + fcvt.d.w f5, x9 + lw x9, 36(x27) + fcvt.d.w f6, x30 + lw x30, 40(x27) + fcvt.d.w f7, x31 + lw x31, 44(x27) + /* load E registers */ + fcvt.d.w f8, x8 + lw x8, 48(x27) + fcvt.d.w f9, x9 + lw x9, 52(x27) + fcvt.d.w f10, x30 + lw x30, 56(x27) + fcvt.d.w f11, x31 + lw x31, 60(x27) + fcvt.d.w f12, x8 + fmv.x.d x8, f8 + fcvt.d.w f13, x9 + fmv.x.d x9, f9 + fcvt.d.w f14, x30 + fmv.x.d x30, f10 + fcvt.d.w f15, x31 + fmv.x.d x31, f11 + and x8, x8, x13 + and x9, x9, x13 + or x8, x8, x14 + or x9, x9, x15 + and x30, x30, x13 + and x31, x31, x13 + or x30, x30, x14 + or x31, x31, x15 + fmv.d.x f8, x8 + fmv.d.x f9, x9 + fmv.d.x f10, x30 + fmv.d.x f11, x31 + fmv.x.d x8, f12 + fmv.x.d x9, f13 + fmv.x.d x30, f14 + fmv.x.d x31, f15 + and x8, x8, x13 + and x9, x9, x13 + or x8, x8, x14 + or x9, x9, x15 + fmv.d.x f12, x8 + fmv.d.x f13, x9 + and x30, x30, x13 + and x31, x31, x13 + or x30, x30, x14 + or x31, x31, x15 + fmv.d.x f14, x30 + fmv.d.x f15, x31 + /* reload clobbered IMUL_RCP regs */ + ld x28, 144(x3) + ld x29, 152(x3) + ld x30, 160(x3) + ld x31, 168(x3) + +DECL(randomx_riscv64_data_read): + xor x8, x20, x22 /* JIT compiler will adjust the registers */ + /* load dataset mask */ + lwu x1, 92(x3) + /* zero-extend x8 */ +#ifdef __riscv_zba + zext.w x8, x8 +#else + slli x8, x8, 32 + srli x8, x8, 32 +#endif + /* update "mx" */ + xor x25, x25, x8 + /* read dataset and update registers */ + ld x8, 0(x7) + ld x9, 8(x7) + ld x30, 16(x7) + ld x31, 24(x7) + xor x16, x16, x8 + ld x8, 32(x7) + xor x17, x17, x9 + ld x9, 40(x7) + xor x18, x18, x30 + ld x30, 48(x7) + xor x19, x19, x31 + ld x31, 56(x7) + xor x20, x20, x8 + /* calculate the next dataset address */ + and x7, x25, x1 + xor x21, x21, x9 + add x7, x7, x6 + xor x22, x22, x30 + /* prefetch - doesn't seem to have any effect */ + /* ld x0, 0(x7) */ + xor x23, x23, x31 + /* swap mx <-> ma */ +#ifdef __riscv_zbb + rori x25, x25, 32 +#else + srli x9, x25, 32 + slli x25, x25, 32 + or x25, x25, x9 +#endif + +DECL(randomx_riscv64_data_read_light): + xor x8, x20, x22 /* JIT compiler will adjust the registers */ + /* load dataset offset */ + lui x9, 0x02000 /* JIT compiler will adjust the immediate */ + addi x9, x9, -64 + /* load dataset mask */ + lwu x1, 92(x3) + /* swap mx <-> ma */ +#ifdef __riscv_zbb + rori x25, x25, 32 +#else + srli x31, x25, 32 + slli x25, x25, 32 + or x25, x25, x31 +#endif + slli x8, x8, 32 + /* update "mx" */ + xor x25, x25, x8 + /* the next dataset item */ + and x7, x25, x1 + srli x7, x7, 6 + add x7, x7, x9 +DECL(randomx_riscv64_fix_loop_call): + jal superscalar_hash /* JIT compiler will adjust the offset */ + xor x16, x16, x8 + xor x17, x17, x9 + xor x18, x18, x10 + xor x19, x19, x11 
+ xor x20, x20, x12 + xor x21, x21, x13 + xor x22, x22, x14 + xor x23, x23, x15 + /* restore clobbered registers */ + lwu x10, 80(x3) + lwu x11, 84(x3) + ld x12, 96(x3) + ld x13, 104(x3) + ld x14, 112(x3) + ld x15, 120(x3) + +DECL(randomx_riscv64_spad_store): + /* store integer registers */ + sd x16, 0(x27) + sd x17, 8(x27) + sd x18, 16(x27) + sd x19, 24(x27) + sd x20, 32(x27) + sd x21, 40(x27) + sd x22, 48(x27) + sd x23, 56(x27) + /* XOR and store f0,e0 */ + fmv.x.d x8, f0 + fmv.x.d x9, f8 + fmv.x.d x30, f1 + fmv.x.d x31, f9 + xor x8, x8, x9 + xor x30, x30, x31 + sd x8, 0(x26) + fmv.d.x f0, x8 + sd x30, 8(x26) + fmv.d.x f1, x30 + /* XOR and store f1,e1 */ + fmv.x.d x8, f2 + fmv.x.d x9, f10 + fmv.x.d x30, f3 + fmv.x.d x31, f11 + xor x8, x8, x9 + xor x30, x30, x31 + sd x8, 16(x26) + fmv.d.x f2, x8 + sd x30, 24(x26) + fmv.d.x f3, x30 + /* XOR and store f2,e2 */ + fmv.x.d x8, f4 + fmv.x.d x9, f12 + fmv.x.d x30, f5 + fmv.x.d x31, f13 + xor x8, x8, x9 + xor x30, x30, x31 + sd x8, 32(x26) + fmv.d.x f4, x8 + sd x30, 40(x26) + fmv.d.x f5, x30 + /* XOR and store f3,e3 */ + fmv.x.d x8, f6 + fmv.x.d x9, f14 + fmv.x.d x30, f7 + fmv.x.d x31, f15 + xor x8, x8, x9 + xor x30, x30, x31 + sd x8, 48(x26) + fmv.d.x f6, x8 + sd x30, 56(x26) + fmv.d.x f7, x30 + +DECL(randomx_riscv64_spad_store_hardaes): + nop /* not implemented */ + +DECL(randomx_riscv64_spad_store_softaes): + /* store integer registers */ + sd x16, 0(x27) + sd x17, 8(x27) + sd x18, 16(x27) + sd x19, 24(x27) + sd x20, 32(x27) + sd x21, 40(x27) + sd x22, 48(x27) + sd x23, 56(x27) + /* process f0 with 4 AES rounds */ + fmv.x.d x8, f8 + fmv.x.d x10, f9 + fmv.x.d x30, f0 + fmv.x.d x31, f1 + jal softaes_enc + fmv.x.d x8, f10 + fmv.x.d x10, f11 + jal softaes_enc + fmv.x.d x8, f12 + fmv.x.d x10, f13 + jal softaes_enc + fmv.x.d x8, f14 + fmv.x.d x10, f15 + jal softaes_enc + sd x30, 0(x26) + fmv.d.x f0, x30 + sd x31, 8(x26) + fmv.d.x f1, x31 + /* process f1 with 4 AES rounds */ + fmv.x.d x8, f8 + fmv.x.d x10, f9 + fmv.x.d x30, f2 + fmv.x.d x31, f3 + jal softaes_dec + fmv.x.d x8, f10 + fmv.x.d x10, f11 + jal softaes_dec + fmv.x.d x8, f12 + fmv.x.d x10, f13 + jal softaes_dec + fmv.x.d x8, f14 + fmv.x.d x10, f15 + jal softaes_dec + sd x30, 16(x26) + fmv.d.x f2, x30 + sd x31, 24(x26) + fmv.d.x f3, x31 + /* process f2 with 4 AES rounds */ + fmv.x.d x8, f8 + fmv.x.d x10, f9 + fmv.x.d x30, f4 + fmv.x.d x31, f5 + jal softaes_enc + fmv.x.d x8, f10 + fmv.x.d x10, f11 + jal softaes_enc + fmv.x.d x8, f12 + fmv.x.d x10, f13 + jal softaes_enc + fmv.x.d x8, f14 + fmv.x.d x10, f15 + jal softaes_enc + sd x30, 32(x26) + fmv.d.x f4, x30 + sd x31, 40(x26) + fmv.d.x f5, x31 + /* process f3 with 4 AES rounds */ + fmv.x.d x8, f8 + fmv.x.d x10, f9 + fmv.x.d x30, f6 + fmv.x.d x31, f7 + jal softaes_dec + fmv.x.d x8, f10 + fmv.x.d x10, f11 + jal softaes_dec + fmv.x.d x8, f12 + fmv.x.d x10, f13 + jal softaes_dec + fmv.x.d x8, f14 + fmv.x.d x10, f15 + jal softaes_dec + sd x30, 48(x26) + fmv.d.x f6, x30 + sd x31, 56(x26) + fmv.d.x f7, x31 + /* restore clobbered registers */ + lwu x10, 80(x3) + lwu x11, 84(x3) + ld x12, 96(x3) + ld x13, 104(x3) + ld x14, 112(x3) + ld x15, 120(x3) + +DECL(randomx_riscv64_loop_end): + xor x26, x16, x18 /* JIT compiler will adjust the registers */ + /* load L3 mask */ + lwu x1, 88(x3) + addi x24, x24, -1 + srli x27, x26, 32 + /* set spAddr0, spAddr1 */ + and x26, x26, x1 + and x27, x27, x1 + add x26, x26, x5 + add x27, x27, x5 + /* align L3 mask */ + addi x1, x1, 56 + /* conditional branch doesn't have sufficient range */ + j condition_check 
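+ /* continue_loop below is patched by the JIT with a jal back to loop_begin: */
+ /* jal reaches +/-1 MiB, while a conditional branch only reaches +/-4 KiB. */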
+DECL(randomx_riscv64_fix_continue_loop): +continue_loop: + .word 0 /* JIT compiler will write a jump to loop_begin */ +condition_check: + bnez x24, continue_loop + +DECL(randomx_riscv64_epilogue): + /* restore callee saved registers */ + ld x10, 8(sp) + ld x1, 0(sp) + ld x3, 16(sp) + ld x8, 32(sp) + ld x9, 40(sp) + ld x24, 96(sp) + ld x25, 104(sp) + ld x26, 112(sp) + ld x27, 120(sp) + fld f18, 144(sp) + fld f19, 152(sp) + fld f20, 160(sp) + fld f21, 168(sp) + fld f22, 176(sp) + fld f23, 184(sp) + fld f24, 192(sp) + fld f25, 200(sp) + fld f26, 208(sp) + fld f27, 216(sp) + /* save VM registers */ + sd x16, 0(x10) + sd x17, 8(x10) + sd x18, 16(x10) + sd x19, 24(x10) + sd x20, 32(x10) + sd x21, 40(x10) + sd x22, 48(x10) + sd x23, 56(x10) + fsd f0, 64(x10) + fsd f1, 72(x10) + fsd f2, 80(x10) + fsd f3, 88(x10) + fsd f4, 96(x10) + fsd f5, 104(x10) + fsd f6, 112(x10) + fsd f7, 120(x10) + fsd f8, 128(x10) + fsd f9, 136(x10) + fsd f10, 144(x10) + fsd f11, 152(x10) + fsd f12, 160(x10) + fsd f13, 168(x10) + fsd f14, 176(x10) + fsd f15, 184(x10) + /* restore callee saved registers */ + ld x18, 48(sp) + ld x19, 56(sp) + ld x20, 64(sp) + ld x21, 72(sp) + ld x22, 80(sp) + ld x23, 88(sp) + fld f8, 128(sp) + fld f9, 136(sp) + /* restore stack pointer */ + addi sp, sp, 224 + /* return */ + ret + +/* + Soft AES subroutines + in: + x3 = literal pool + x8, x10 = round key + x30, x31 = plaintext + out: + x30, x31 = ciphertext + clobbers: + x8-x11 (limbs) + x12-x13 (LUTs) + x14-x15 (temp) +*/ +DECL(randomx_riscv64_softaes): +softaes_enc: + /* enc. lookup table */ + ld x13, 128(x3) + + /* load the round key into x8, x9, x10, x11 */ + srli x9, x8, 32 + srli x11, x10, 32 +#ifdef __riscv_zba + zext.w x8, x8 + zext.w x10, x10 +#else + slli x8, x8, 32 + slli x10, x10, 32 + srli x8, x8, 32 + srli x10, x10, 32 +#endif + + /* byte 0 */ + andi x14, x30, 255 + srli x30, x30, 8 + addi x12, x13, -2048 +#ifdef __riscv_zba + sh2add x14, x14, x13 +#else + slli x14, x14, 2 + add x14, x14, x13 +#endif + lwu x14, -2048(x14) + + /* byte 1 */ + andi x15, x30, 255 + srli x30, x30, 8 +#ifdef __riscv_zba + sh2add x15, x15, x12 +#else + slli x15, x15, 2 + add x15, x15, x12 +#endif + lwu x15, 1024(x15) + xor x8, x8, x14 + + /* byte 2 */ + andi x14, x30, 255 + srli x30, x30, 8 +#ifdef __riscv_zba + sh2add x14, x14, x13 +#else + slli x14, x14, 2 + add x14, x14, x13 +#endif + lwu x14, 0(x14) + xor x11, x11, x15 + + /* byte 3 */ + andi x15, x30, 255 + srli x30, x30, 8 +#ifdef __riscv_zba + sh2add x15, x15, x13 +#else + slli x15, x15, 2 + add x15, x15, x13 +#endif + lwu x15, 1024(x15) + xor x10, x10, x14 + + /* byte 4 */ + andi x14, x30, 255 + srli x30, x30, 8 +#ifdef __riscv_zba + sh2add x14, x14, x12 +#else + slli x14, x14, 2 + add x14, x14, x12 +#endif + lwu x14, 0(x14) + xor x9, x9, x15 + + /* byte 5 */ + andi x15, x30, 255 + srli x30, x30, 8 +#ifdef __riscv_zba + sh2add x15, x15, x12 +#else + slli x15, x15, 2 + add x15, x15, x12 +#endif + lwu x15, 1024(x15) + xor x9, x9, x14 + + /* byte 6 */ + andi x14, x30, 255 + srli x30, x30, 8 +#ifdef __riscv_zba + sh2add x14, x14, x13 +#else + slli x14, x14, 2 + add x14, x14, x13 +#endif + lwu x14, 0(x14) + xor x8, x8, x15 + + /* byte 7 */ + andi x15, x30, 255 +#ifdef __riscv_zba + sh2add x15, x15, x13 +#else + slli x15, x15, 2 + add x15, x15, x13 +#endif + lwu x15, 1024(x15) + xor x11, x11, x14 + + /* byte 8 */ + andi x14, x31, 255 + srli x31, x31, 8 +#ifdef __riscv_zba + sh2add x14, x14, x12 +#else + slli x14, x14, 2 + add x14, x14, x12 +#endif + lwu x14, 0(x14) + xor x10, x10, x15 + + /* byte 9 
*/ + andi x15, x31, 255 + srli x31, x31, 8 +#ifdef __riscv_zba + sh2add x15, x15, x12 +#else + slli x15, x15, 2 + add x15, x15, x12 +#endif + lwu x15, 1024(x15) + xor x10, x10, x14 + + /* byte 10 */ + andi x14, x31, 255 + srli x31, x31, 8 +#ifdef __riscv_zba + sh2add x14, x14, x13 +#else + slli x14, x14, 2 + add x14, x14, x13 +#endif + lwu x14, 0(x14) + xor x9, x9, x15 + + /* byte 11 */ + andi x15, x31, 255 + srli x31, x31, 8 +#ifdef __riscv_zba + sh2add x15, x15, x13 +#else + slli x15, x15, 2 + add x15, x15, x13 +#endif + lwu x15, 1024(x15) + xor x8, x8, x14 + + /* byte 12 */ + andi x14, x31, 255 + srli x31, x31, 8 +#ifdef __riscv_zba + sh2add x14, x14, x12 +#else + slli x14, x14, 2 + add x14, x14, x12 +#endif + lwu x14, 0(x14) + xor x11, x11, x15 + + /* byte 13 */ + andi x15, x31, 255 + srli x31, x31, 8 +#ifdef __riscv_zba + sh2add x15, x15, x12 +#else + slli x15, x15, 2 + add x15, x15, x12 +#endif + lwu x15, 1024(x15) + xor x11, x11, x14 + + /* byte 14 */ + andi x14, x31, 255 + srli x31, x31, 8 +#ifdef __riscv_zba + sh2add x14, x14, x13 +#else + slli x14, x14, 2 + add x14, x14, x13 +#endif + lwu x14, 0(x14) + xor x10, x10, x15 + + /* byte 15 */ + andi x15, x31, 255 +#ifdef __riscv_zba + sh2add x15, x15, x13 +#else + slli x15, x15, 2 + add x15, x15, x13 +#endif + lwu x15, 1024(x15) + xor x9, x9, x14 + + slli x11, x11, 32 + slli x9, x9, 32 + or x30, x8, x9 + or x31, x10, x11 + xor x30, x30, x15 + + ret + +softaes_dec: + /* dec. lookup table */ + ld x13, 136(x3) + + /* load the round key into x8, x9, x10, x11 */ + srli x9, x8, 32 + srli x11, x10, 32 +#ifdef __riscv_zba + zext.w x8, x8 + zext.w x10, x10 +#else + slli x8, x8, 32 + slli x10, x10, 32 + srli x8, x8, 32 + srli x10, x10, 32 +#endif + + /* byte 0 */ + andi x14, x30, 255 + srli x30, x30, 8 + addi x12, x13, -2048 +#ifdef __riscv_zba + sh2add x14, x14, x13 +#else + slli x14, x14, 2 + add x14, x14, x13 +#endif + lwu x14, -2048(x14) + + /* byte 1 */ + andi x15, x30, 255 + srli x30, x30, 8 +#ifdef __riscv_zba + sh2add x15, x15, x12 +#else + slli x15, x15, 2 + add x15, x15, x12 +#endif + lwu x15, 1024(x15) + xor x8, x8, x14 + + /* byte 2 */ + andi x14, x30, 255 + srli x30, x30, 8 +#ifdef __riscv_zba + sh2add x14, x14, x13 +#else + slli x14, x14, 2 + add x14, x14, x13 +#endif + lwu x14, 0(x14) + xor x9, x9, x15 + + /* byte 3 */ + andi x15, x30, 255 + srli x30, x30, 8 +#ifdef __riscv_zba + sh2add x15, x15, x13 +#else + slli x15, x15, 2 + add x15, x15, x13 +#endif + lwu x15, 1024(x15) + xor x10, x10, x14 + + /* byte 4 */ + andi x14, x30, 255 + srli x30, x30, 8 +#ifdef __riscv_zba + sh2add x14, x14, x12 +#else + slli x14, x14, 2 + add x14, x14, x12 +#endif + lwu x14, 0(x14) + xor x11, x11, x15 + + /* byte 5 */ + andi x15, x30, 255 + srli x30, x30, 8 +#ifdef __riscv_zba + sh2add x15, x15, x12 +#else + slli x15, x15, 2 + add x15, x15, x12 +#endif + lwu x15, 1024(x15) + xor x9, x9, x14 + + /* byte 6 */ + andi x14, x30, 255 + srli x30, x30, 8 +#ifdef __riscv_zba + sh2add x14, x14, x13 +#else + slli x14, x14, 2 + add x14, x14, x13 +#endif + lwu x14, 0(x14) + xor x10, x10, x15 + + /* byte 7 */ + andi x15, x30, 255 +#ifdef __riscv_zba + sh2add x15, x15, x13 +#else + slli x15, x15, 2 + add x15, x15, x13 +#endif + lwu x15, 1024(x15) + xor x11, x11, x14 + + /* byte 8 */ + andi x14, x31, 255 + srli x31, x31, 8 +#ifdef __riscv_zba + sh2add x14, x14, x12 +#else + slli x14, x14, 2 + add x14, x14, x12 +#endif + lwu x14, 0(x14) + xor x8, x8, x15 + + /* byte 9 */ + andi x15, x31, 255 + srli x31, x31, 8 +#ifdef __riscv_zba + sh2add x15, x15, x12 +#else + slli 
x15, x15, 2 + add x15, x15, x12 +#endif + lwu x15, 1024(x15) + xor x10, x10, x14 + + /* byte 10 */ + andi x14, x31, 255 + srli x31, x31, 8 +#ifdef __riscv_zba + sh2add x14, x14, x13 +#else + slli x14, x14, 2 + add x14, x14, x13 +#endif + lwu x14, 0(x14) + xor x11, x11, x15 + + /* byte 11 */ + andi x15, x31, 255 + srli x31, x31, 8 +#ifdef __riscv_zba + sh2add x15, x15, x13 +#else + slli x15, x15, 2 + add x15, x15, x13 +#endif + lwu x15, 1024(x15) + xor x8, x8, x14 + + /* byte 12 */ + andi x14, x31, 255 + srli x31, x31, 8 +#ifdef __riscv_zba + sh2add x14, x14, x12 +#else + slli x14, x14, 2 + add x14, x14, x12 +#endif + lwu x14, 0(x14) + xor x9, x9, x15 + + /* byte 13 */ + andi x15, x31, 255 + srli x31, x31, 8 +#ifdef __riscv_zba + sh2add x15, x15, x12 +#else + slli x15, x15, 2 + add x15, x15, x12 +#endif + lwu x15, 1024(x15) + xor x11, x11, x14 + + /* byte 14 */ + andi x14, x31, 255 + srli x31, x31, 8 +#ifdef __riscv_zba + sh2add x14, x14, x13 +#else + slli x14, x14, 2 + add x14, x14, x13 +#endif + lwu x14, 0(x14) + xor x8, x8, x15 + + /* byte 15 */ + andi x15, x31, 255 +#ifdef __riscv_zba + sh2add x15, x15, x13 +#else + slli x15, x15, 2 + add x15, x15, x13 +#endif + lwu x15, 1024(x15) + xor x9, x9, x14 + + slli x11, x11, 32 + slli x9, x9, 32 + or x30, x8, x9 + or x31, x10, x11 + xor x31, x31, x15 + + ret + +DECL(randomx_riscv64_program_end): + nop + + +/* literal pool for SuperscalarHash */ + /* space for remaining IMUL_RCP literals */ +ssh_literal_pool: + /* space for 256 IMUL_RCP literals */ + .fill 256,8,0 + +/* + SuperscalarHash subroutine + in: + x3 = literal pool + x6 = cache + x7 = itemNumber + out: + x8-x15 = 64-byte hash + clobbers: + x7, x28-x31 +*/ +DECL(randomx_riscv64_ssh_init): +superscalar_hash: + ld x30, 0(x3) /* superscalarMul0 */ + addi x8, x7, 1 + ld x9, 8(x3) + li x31, RANDOMX_CACHE_MASK + ld x10, 16(x3) + ld x11, 24(x3) + mul x8, x8, x30 + ld x12, 32(x3) + ld x13, 40(x3) + lla x30, ssh_literal_pool + ld x14, 48(x3) + and x7, x7, x31 + ld x15, 56(x3) + slli x7, x7, 6 + xor x9, x9, x8 + add x7, x7, x6 + xor x10, x10, x8 + /* load the first IMUL_RCP literal */ + ld x31, 2040(x30) + xor x11, x11, x8 + xor x12, x12, x8 + xor x13, x13, x8 + xor x14, x14, x8 + xor x15, x15, x8 + +DECL(randomx_riscv64_ssh_load): + ld x28, 0(x7) + ld x29, 8(x7) + xor x8, x8, x28 + ld x28, 16(x7) + xor x9, x9, x29 + ld x29, 24(x7) + xor x10, x10, x28 + ld x28, 32(x7) + xor x11, x11, x29 + ld x29, 40(x7) + xor x12, x12, x28 + ld x28, 48(x7) + xor x13, x13, x29 + ld x29, 56(x7) + xor x14, x14, x28 + li x7, RANDOMX_CACHE_MASK + xor x15, x15, x29 + +DECL(randomx_riscv64_ssh_prefetch): + and x7, x8, x7 /* JIT compiler will adjust the register */ + slli x7, x7, 6 + add x7, x7, x6 + /* prefetch - doesn't seem to have any effect */ + /* ld x0, 0(x7) */ + +DECL(randomx_riscv64_ssh_end): + nop diff --git a/src/RandomX/src/jit_compiler_rv64_static.hpp b/src/RandomX/src/jit_compiler_rv64_static.hpp new file mode 100644 index 000000000..656623c74 --- /dev/null +++ b/src/RandomX/src/jit_compiler_rv64_static.hpp @@ -0,0 +1,53 @@ +/* +Copyright (c) 2023 tevador + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the copyright holder nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#pragma once + +extern "C" { + void randomx_riscv64_literals(); + void randomx_riscv64_literals_end(); + void randomx_riscv64_data_init(); + void randomx_riscv64_fix_data_call(); + void randomx_riscv64_prologue(); + void randomx_riscv64_loop_begin(); + void randomx_riscv64_data_read(); + void randomx_riscv64_data_read_light(); + void randomx_riscv64_fix_loop_call(); + void randomx_riscv64_spad_store(); + void randomx_riscv64_spad_store_hardaes(); + void randomx_riscv64_spad_store_softaes(); + void randomx_riscv64_loop_end(); + void randomx_riscv64_fix_continue_loop(); + void randomx_riscv64_epilogue(); + void randomx_riscv64_softaes(); + void randomx_riscv64_program_end(); + void randomx_riscv64_ssh_init(); + void randomx_riscv64_ssh_load(); + void randomx_riscv64_ssh_prefetch(); + void randomx_riscv64_ssh_end(); +} diff --git a/src/RandomX/src/jit_compiler_x86.cpp b/src/RandomX/src/jit_compiler_x86.cpp index 5587e6afb..785ce5f59 100644 --- a/src/RandomX/src/jit_compiler_x86.cpp +++ b/src/RandomX/src/jit_compiler_x86.cpp @@ -34,7 +34,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "superscalar.hpp" #include "program.hpp" #include "reciprocal.h" -#include "virtual_memory.hpp" +#include "virtual_memory.h" namespace randomx { /* @@ -618,7 +618,7 @@ namespace randomx { } void JitCompilerX86::h_IMUL_RCP(Instruction& instr, int i) { - uint64_t divisor = instr.getImm32(); + const uint32_t divisor = instr.getImm32(); if (!isZeroOrPowerOf2(divisor)) { registerUsage[instr.dst] = i; emit(MOV_RAX_I); diff --git a/src/RandomX/src/randomx.cpp b/src/RandomX/src/randomx.cpp index 7daaa46df..a08968e6a 100644 --- a/src/RandomX/src/randomx.cpp +++ b/src/RandomX/src/randomx.cpp @@ -36,7 +36,13 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "cpu.hpp" #include #include + +#if defined(__SSE__) || defined(__SSE2__) || (defined(_M_IX86_FP) && (_M_IX86_FP > 0)) +#define USE_CSR_INTRINSICS +#include +#else #include +#endif extern "C" { @@ -356,8 +362,14 @@ extern "C" { assert(machine != nullptr); assert(inputSize == 0 || input != nullptr); assert(output != nullptr); + +#ifdef USE_CSR_INTRINSICS + const unsigned int fpstate = _mm_getcsr(); +#else fenv_t fpstate; fegetenv(&fpstate); +#endif + alignas(16) uint64_t tempHash[8]; int blakeResult = blake2b(tempHash, sizeof(tempHash), input, inputSize, nullptr, 0); assert(blakeResult == 0); @@ -370,7 +382,12 @@ extern "C" { } machine->run(&tempHash); machine->getFinalResult(output, RANDOMX_HASH_SIZE); + +#ifdef USE_CSR_INTRINSICS + _mm_setcsr(fpstate); +#else fesetenv(&fpstate); +#endif } void randomx_calculate_hash_first(randomx_vm* machine, const void* input, size_t inputSize) { @@ -400,4 +417,15 @@ extern "C" { machine->run(machine->tempHash); machine->getFinalResult(output, RANDOMX_HASH_SIZE); } + + void randomx_calculate_commitment(const void* input, size_t inputSize, const void* hash_in, void* com_out) { + assert(inputSize == 0 || input != nullptr); + assert(hash_in != nullptr); + assert(com_out != nullptr); + blake2b_state state; + blake2b_init(&state, RANDOMX_HASH_SIZE); + blake2b_update(&state, input, inputSize); + blake2b_update(&state, hash_in, RANDOMX_HASH_SIZE); + blake2b_final(&state, com_out, RANDOMX_HASH_SIZE); + } } diff --git a/src/RandomX/src/randomx.h b/src/RandomX/src/randomx.h index 64d18068b..313bcd2e0 100644 --- a/src/RandomX/src/randomx.h +++ b/src/RandomX/src/randomx.h @@ -260,6 +260,17 @@ RANDOMX_EXPORT void randomx_calculate_hash_first(randomx_vm* machine, const void RANDOMX_EXPORT void randomx_calculate_hash_next(randomx_vm* machine, const void* nextInput, size_t nextInputSize, void* output); RANDOMX_EXPORT void randomx_calculate_hash_last(randomx_vm* machine, void* output); +/** + * Calculate a RandomX commitment from a RandomX hash and its input. + * + * @param input is a pointer to memory that was hashed. Must not be NULL. + * @param inputSize is the number of bytes in the input. + * @param hash_in is the output from randomx_calculate_hash* (RANDOMX_HASH_SIZE bytes). + * @param com_out is a pointer to memory where the commitment will be stored. Must not + * be NULL and at least RANDOMX_HASH_SIZE bytes must be available for writing. +*/ +RANDOMX_EXPORT void randomx_calculate_commitment(const void* input, size_t inputSize, const void* hash_in, void* com_out); + #if defined(__cplusplus) } #endif diff --git a/src/RandomX/src/reciprocal.c b/src/RandomX/src/reciprocal.c index 22620f53a..074d1846b 100644 --- a/src/RandomX/src/reciprocal.c +++ b/src/RandomX/src/reciprocal.c @@ -44,36 +44,28 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
 ret
*/

-uint64_t randomx_reciprocal(uint64_t divisor) {
+uint64_t randomx_reciprocal(uint32_t divisor) {

 assert(divisor != 0);

 const uint64_t p2exp63 = 1ULL << 63;
+ const uint64_t q = p2exp63 / divisor;
+ const uint64_t r = p2exp63 % divisor;
+
+#ifdef __GNUC__
+ const uint32_t shift = 64 - __builtin_clzll(divisor);
+#else
+ uint32_t shift = 32;
+ for (uint32_t k = 1U << 31; (k & divisor) == 0; k >>= 1)
+ --shift;
+#endif

- uint64_t quotient = p2exp63 / divisor, remainder = p2exp63 % divisor;
-
- unsigned bsr = 0; //highest set bit in divisor
-
- for (uint64_t bit = divisor; bit > 0; bit >>= 1)
- bsr++;
-
- for (unsigned shift = 0; shift < bsr; shift++) {
- if (remainder >= divisor - remainder) {
- quotient = quotient * 2 + 1;
- remainder = remainder * 2 - divisor;
- }
- else {
- quotient = quotient * 2;
- remainder = remainder * 2;
- }
- }
-
- return quotient;
+ return (q << shift) + ((r << shift) / divisor);
}

#if !RANDOMX_HAVE_FAST_RECIPROCAL

-uint64_t randomx_reciprocal_fast(uint64_t divisor) {
+uint64_t randomx_reciprocal_fast(uint32_t divisor) {
 return randomx_reciprocal(divisor);
}
diff --git a/src/RandomX/src/reciprocal.h b/src/RandomX/src/reciprocal.h
index 8858df2b8..90bd9b6be 100644
--- a/src/RandomX/src/reciprocal.h
+++ b/src/RandomX/src/reciprocal.h
@@ -40,8 +40,8 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
extern "C" {
#endif

-uint64_t randomx_reciprocal(uint64_t);
-uint64_t randomx_reciprocal_fast(uint64_t);
+uint64_t randomx_reciprocal(uint32_t);
+uint64_t randomx_reciprocal_fast(uint32_t);

#if defined(__cplusplus)
}
diff --git a/src/RandomX/src/tests/benchmark.cpp b/src/RandomX/src/tests/benchmark.cpp
index 36b0259b6..148521a51 100644
--- a/src/RandomX/src/tests/benchmark.cpp
+++ b/src/RandomX/src/tests/benchmark.cpp
@@ -96,6 +96,7 @@ void printUsage(const char* executable) {
 std::cout << " --avx2 use optimized Argon2 for AVX2 CPUs" << std::endl;
 std::cout << " --auto select the best options for the current CPU" << std::endl;
 std::cout << " --noBatch calculate hashes one by one (default: batch)" << std::endl;
+ std::cout << " --commit calculate commitments instead of hashes (default: hashes)" << std::endl;
}

struct MemoryException : public std::exception {
@@ -113,7 +114,7 @@ struct DatasetAllocException : public MemoryException {

using MineFunc = void(randomx_vm * vm, std::atomic<uint32_t> & atomicNonce, AtomicHash & result, uint32_t noncesCount, int thread, int cpuid);

-template<bool batch>
+template<bool batch, bool commit>
void mine(randomx_vm* vm, std::atomic<uint32_t>& atomicNonce, AtomicHash& result, uint32_t noncesCount, int thread, int cpuid = -1) {
 if (cpuid >= 0) {
 int rc = set_thread_affinity(cpuid);
@@ -138,6 +139,9 @@ void mine(randomx_vm* vm, std::atomic<uint32_t>& atomicNonce, AtomicHash& result
 }
 store32(noncePtr, nonce);
 (batch ?
randomx_calculate_hash_next : randomx_calculate_hash)(vm, blockTemplate, sizeof(blockTemplate), &hash);
+ if (commit) {
+ randomx_calculate_commitment(blockTemplate, sizeof(blockTemplate), &hash, &hash);
+ }
 result.xorWith(hash);
 if (!batch) {
 nonce = atomicNonce.fetch_add(1);
@@ -146,7 +150,7 @@ void mine(randomx_vm* vm, std::atomic<uint32_t>& atomicNonce, AtomicHash& result
}

int main(int argc, char** argv) {
- bool softAes, miningMode, verificationMode, help, largePages, jit, secure;
+ bool softAes, miningMode, verificationMode, help, largePages, jit, secure, commit;
 bool ssse3, avx2, autoFlags, noBatch;
 int noncesCount, threadCount, initThreadCount;
 uint64_t threadAffinity;
@@ -172,10 +176,11 @@ int main(int argc, char** argv) {
 readOption("--avx2", argc, argv, avx2);
 readOption("--auto", argc, argv, autoFlags);
 readOption("--noBatch", argc, argv, noBatch);
+ readOption("--commit", argc, argv, commit);

 store32(&seed, seedValue);

- std::cout << "RandomX benchmark v1.1.11" << std::endl;
+ std::cout << "RandomX benchmark v1.2.1" << std::endl;

 if (help) {
 printUsage(argv[0]);
@@ -280,11 +285,24 @@ int main(int argc, char** argv) {

 MineFunc* func;

 if (noBatch) {
- func = &mine<false>;
+ if (commit) {
+ std::cout << " - hash commitments" << std::endl;
+ func = &mine<false, true>;
+ }
+ else {
+ func = &mine<false, false>;
+ }
 }
 else {
- func = &mine<true>;
- std::cout << " - batch mode" << std::endl;
+ if (commit) {
+ //TODO: support batch mode with commitments
+ std::cout << " - hash commitments" << std::endl;
+ func = &mine<false, true>;
+ }
+ else {
+ std::cout << " - batch mode" << std::endl;
+ func = &mine<true, false>;
+ }
 }

 std::cout << "Initializing";
@@ -376,7 +394,7 @@ int main(int argc, char** argv) {
 randomx_release_cache(cache);
 std::cout << "Calculated result: ";
 result.print(std::cout);
- if (noncesCount == 1000 && seedValue == 0)
+ if (noncesCount == 1000 && seedValue == 0 && !commit)
 std::cout << "Reference result: 10b649a3f15c7c7f88277812f2e74b337a0f20ce909af09199cccb960771cfa1" << std::endl;
 if (!miningMode) {
 std::cout << "Performance: " << 1000 * elapsed / noncesCount << " ms per hash" << std::endl;
diff --git a/src/RandomX/src/tests/perf-simulation.cpp b/src/RandomX/src/tests/perf-simulation.cpp
index 1068a40ef..27f34d8c4 100644
--- a/src/RandomX/src/tests/perf-simulation.cpp
+++ b/src/RandomX/src/tests/perf-simulation.cpp
@@ -477,7 +477,7 @@ int analyze(randomx::Program& p) {
 }

 if (opcode < randomx::ceil_IMUL_RCP) {
- uint64_t divisor = instr.getImm32();
+ const uint32_t divisor = instr.getImm32();
 if (!randomx::isZeroOrPowerOf2(divisor)) {
 instr.dst = instr.dst % randomx::RegistersCount;
 instr.opcode |= DST_INT;
diff --git a/src/RandomX/src/tests/riscv64_zba.s b/src/RandomX/src/tests/riscv64_zba.s
new file mode 100644
index 000000000..e1947e7a6
--- /dev/null
+++ b/src/RandomX/src/tests/riscv64_zba.s
@@ -0,0 +1,9 @@
+/* RISC-V - test if the Zba extension is present */
+
+.text
+.global main
+
+main:
+ sh1add x6, x6, x7
+ li x10, 0
+ ret
diff --git a/src/RandomX/src/tests/riscv64_zbb.s b/src/RandomX/src/tests/riscv64_zbb.s
new file mode 100644
index 000000000..d922043f0
--- /dev/null
+++ b/src/RandomX/src/tests/riscv64_zbb.s
@@ -0,0 +1,9 @@
+/* RISC-V - test if the Zbb extension is present */
+
+.text
+.global main
+
+main:
+ ror x6, x6, x7
+ li x10, 0
+ ret
diff --git a/src/RandomX/src/tests/tests.cpp b/src/RandomX/src/tests/tests.cpp
index 412585b1d..5e1b41a38 100644
--- a/src/RandomX/src/tests/tests.cpp
+++ b/src/RandomX/src/tests/tests.cpp
@@ -34,6 +34,14 @@ void calcStringHash(const char(&key)[K], const char(&input)[H], void* output) {
 randomx_calculate_hash(vm, input, H - 1, output);
}

+template<size_t K, size_t H>
+void calcStringCommitment(const char(&key)[K], const char(&input)[H], void* output) {
+ initCache(key);
+ assert(vm != nullptr);
+ randomx_calculate_hash(vm, input, H - 1, output);
+ randomx_calculate_commitment(input, H - 1, output, output);
+}
+
template<size_t K, size_t H>
void calcHexHash(const char(&key)[K], const char(&hex)[H], void* output) {
 initCache(key);
@@ -1082,6 +1090,22 @@ int main() {
 assert(rx_get_rounding_mode() == RoundToNearest);
 });

+ if (RANDOMX_HAVE_COMPILER) {
+ randomx_destroy_vm(vm);
+ vm = nullptr;
+#ifdef RANDOMX_FORCE_SECURE
+ vm = randomx_create_vm(RANDOMX_FLAG_DEFAULT | RANDOMX_FLAG_SECURE, cache, nullptr);
+#else
+ vm = randomx_create_vm(RANDOMX_FLAG_DEFAULT, cache, nullptr);
+#endif
+ }
+
+ runTest("Commitment test", stringsEqual(RANDOMX_ARGON_SALT, "RandomX\x03"), []() {
+ char hash[RANDOMX_HASH_SIZE];
+ calcStringCommitment("test key 000", "This is a test", &hash);
+ assert(equalsHex(hash, "d53ccf348b75291b7be76f0a7ac8208bbced734b912f6fca60539ab6f86be919"));
+ });
+
 randomx_destroy_vm(vm);
 vm = nullptr;

diff --git a/src/RandomX/src/virtual_memory.cpp b/src/RandomX/src/virtual_memory.c
similarity index 54%
rename from src/RandomX/src/virtual_memory.cpp
rename to src/RandomX/src/virtual_memory.c
index 248d3a2c4..d2cdcda0f 100644
--- a/src/RandomX/src/virtual_memory.cpp
+++ b/src/RandomX/src/virtual_memory.c
@@ -26,28 +26,24 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

-#include "virtual_memory.hpp"
-
-#include <stdexcept>
-
#if defined(_WIN32) || defined(__CYGWIN__)
#include <windows.h>
#else
+#define _GNU_SOURCE 1 /* needed for MAP_ANONYMOUS on older platforms */
#ifdef __APPLE__
#include <mach/vm_statistics.h>
#include <TargetConditionals.h>
#include <AvailabilityMacros.h>
# if TARGET_OS_OSX
-# if TARGET_CPU_ARM64
-# define USE_PTHREAD_JIT_WP 1
-# else
-# undef USE_PTHREAD_JIT_WP
-# endif
+# define USE_PTHREAD_JIT_WP 1
# include <pthread.h>
+# include <sys/utsname.h>
+# include <stdio.h>
# endif
#endif
#include <sys/types.h>
#include <sys/mman.h>
+#include <errno.h>
#ifndef MAP_ANONYMOUS
#define MAP_ANONYMOUS MAP_ANON
#endif
@@ -57,27 +53,50 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define PAGE_EXECUTE_READWRITE (PROT_READ | PROT_WRITE | PROT_EXEC)
#endif

-#if defined(_WIN32) || defined(__CYGWIN__)
-std::string getErrorMessage(const char* function) {
- LPSTR messageBuffer = nullptr;
- size_t size = FormatMessageA(FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS,
- NULL, GetLastError(), MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), (LPSTR)&messageBuffer, 0, NULL);
- std::string message(messageBuffer, size);
- LocalFree(messageBuffer);
- return std::string(function) + std::string(": ") + message;
+#include "virtual_memory.h"
+
+#if defined(USE_PTHREAD_JIT_WP) && defined(MAC_OS_VERSION_11_0) \
+ && MAC_OS_X_VERSION_MAX_ALLOWED >= MAC_OS_VERSION_11_0
+static int MacOSchecked, MacOSver;
+/* This function is used implicitly by clang's __builtin_available() checker.
+ * When cross-compiling, the library containing this function doesn't exist,
+ * and linking will fail because the symbol is unresolved. The function here
+ * is a quick and dirty hack to get close enough to identify MacOSX 11.0.
+ */ +static int32_t __isOSVersionAtLeast(int32_t major, int32_t minor, int32_t subminor) { + if (!MacOSchecked) { + struct utsname ut; + int mmaj, mmin; + uname(&ut); + sscanf(ut.release, "%d.%d", &mmaj, &mmin); + // The utsname release version is 9 greater than the canonical OS version + mmaj -= 9; + MacOSver = (mmaj << 8) | mmin; + MacOSchecked = 1; + } + return MacOSver >= ((major << 8) | minor); } +#endif + -void setPrivilege(const char* pszPrivilege, BOOL bEnable) { +#if defined(_WIN32) || defined(__CYGWIN__) +#define Fail(func) do {*errfunc = func; return GetLastError();} while(0) +int setPrivilege(const char* pszPrivilege, BOOL bEnable, char **errfunc) { HANDLE hToken; TOKEN_PRIVILEGES tp; BOOL status; - DWORD error; + DWORD error = 0; + + *errfunc = NULL; if (!OpenProcessToken(GetCurrentProcess(), TOKEN_ADJUST_PRIVILEGES | TOKEN_QUERY, &hToken)) - throw std::runtime_error(getErrorMessage("OpenProcessToken")); + Fail("OpenProcessToken"); - if (!LookupPrivilegeValue(NULL, pszPrivilege, &tp.Privileges[0].Luid)) - throw std::runtime_error(getErrorMessage("LookupPrivilegeValue")); + if (!LookupPrivilegeValue(NULL, pszPrivilege, &tp.Privileges[0].Luid)) { + *errfunc = "LookupPrivilegeValue"; + error = GetLastError(); + goto out; + } tp.PrivilegeCount = 1; @@ -89,20 +108,28 @@ void setPrivilege(const char* pszPrivilege, BOOL bEnable) { status = AdjustTokenPrivileges(hToken, FALSE, &tp, 0, (PTOKEN_PRIVILEGES)NULL, 0); error = GetLastError(); - if (!status || (error != ERROR_SUCCESS)) - throw std::runtime_error(getErrorMessage("AdjustTokenPrivileges")); + if (!status || (error != ERROR_SUCCESS)) { + *errfunc = "AdjustTokenPrivileges"; + goto out; + } - if (!CloseHandle(hToken)) - throw std::runtime_error(getErrorMessage("CloseHandle")); +out: + if (!CloseHandle(hToken)) { + if (*errfunc == NULL) { + *errfunc = "CloseHandle"; + error = GetLastError(); + } + } + return error; } +#else +#define Fail(func) do {*errfunc = func; return errno;} while(0) #endif -void* allocMemoryPages(std::size_t bytes) { +void* allocMemoryPages(size_t bytes) { void* mem; #if defined(_WIN32) || defined(__CYGWIN__) - mem = VirtualAlloc(nullptr, bytes, MEM_COMMIT, PAGE_READWRITE); - if (mem == nullptr) - throw std::runtime_error(getErrorMessage("allocMemoryPages - VirtualAlloc")); + mem = VirtualAlloc(NULL, bytes, MEM_COMMIT, PAGE_READWRITE); #else #if defined(__NetBSD__) #define RESERVED_FLAGS PROT_MPROTECT(PROT_EXEC) @@ -116,89 +143,95 @@ void* allocMemoryPages(std::size_t bytes) { #define MEXTRA 0 #define PEXTRA 0 #endif - mem = mmap(nullptr, bytes, PAGE_READWRITE | RESERVED_FLAGS | PEXTRA, MAP_ANONYMOUS | MAP_PRIVATE | MEXTRA, -1, 0); + mem = mmap(NULL, bytes, PAGE_READWRITE | RESERVED_FLAGS | PEXTRA, MAP_ANONYMOUS | MAP_PRIVATE | MEXTRA, -1, 0); if (mem == MAP_FAILED) - throw std::runtime_error("allocMemoryPages - mmap failed"); + mem = NULL; #if defined(USE_PTHREAD_JIT_WP) && defined(MAC_OS_VERSION_11_0) \ && MAC_OS_X_VERSION_MAX_ALLOWED >= MAC_OS_VERSION_11_0 if (__builtin_available(macOS 11.0, *)) { - pthread_jit_write_protect_np(false); + pthread_jit_write_protect_np(0); } #endif #endif return mem; } -static inline void pageProtect(void* ptr, std::size_t bytes, int rules) { +static inline int pageProtect(void* ptr, size_t bytes, int rules, char **errfunc) { #if defined(_WIN32) || defined(__CYGWIN__) DWORD oldp; if (!VirtualProtect(ptr, bytes, (DWORD)rules, &oldp)) { - throw std::runtime_error(getErrorMessage("VirtualProtect")); + Fail("VirtualProtect"); } #else if (-1 == mprotect(ptr, bytes, rules)) - throw 
std::runtime_error("mprotect failed"); + Fail("mprotect"); #endif + return 0; } -void setPagesRW(void* ptr, std::size_t bytes) { +void setPagesRW(void* ptr, size_t bytes) { + char *errfunc; #if defined(USE_PTHREAD_JIT_WP) && defined(MAC_OS_VERSION_11_0) \ && MAC_OS_X_VERSION_MAX_ALLOWED >= MAC_OS_VERSION_11_0 if (__builtin_available(macOS 11.0, *)) { - pthread_jit_write_protect_np(false); + pthread_jit_write_protect_np(0); } else { - pageProtect(ptr, bytes, PAGE_READWRITE); + pageProtect(ptr, bytes, PAGE_READWRITE, &errfunc); } #else - pageProtect(ptr, bytes, PAGE_READWRITE); + pageProtect(ptr, bytes, PAGE_READWRITE, &errfunc); #endif } -void setPagesRX(void* ptr, std::size_t bytes) { +void setPagesRX(void* ptr, size_t bytes) { + char *errfunc; #if defined(USE_PTHREAD_JIT_WP) && defined(MAC_OS_VERSION_11_0) \ && MAC_OS_X_VERSION_MAX_ALLOWED >= MAC_OS_VERSION_11_0 if (__builtin_available(macOS 11.0, *)) { - pthread_jit_write_protect_np(true); + pthread_jit_write_protect_np(1); + __builtin___clear_cache((char*)ptr, ((char*)ptr) + bytes); } else { - pageProtect(ptr, bytes, PAGE_EXECUTE_READ); + pageProtect(ptr, bytes, PAGE_EXECUTE_READ, &errfunc); } #else - pageProtect(ptr, bytes, PAGE_EXECUTE_READ); + pageProtect(ptr, bytes, PAGE_EXECUTE_READ, &errfunc); #endif } -void setPagesRWX(void* ptr, std::size_t bytes) { - pageProtect(ptr, bytes, PAGE_EXECUTE_READWRITE); +void setPagesRWX(void* ptr, size_t bytes) { + char *errfunc; + pageProtect(ptr, bytes, PAGE_EXECUTE_READWRITE, &errfunc); } -void* allocLargePagesMemory(std::size_t bytes) { +void* allocLargePagesMemory(size_t bytes) { void* mem; + char *errfunc; #if defined(_WIN32) || defined(__CYGWIN__) - setPrivilege("SeLockMemoryPrivilege", 1); - auto pageMinimum = GetLargePageMinimum(); - if (pageMinimum > 0) - mem = VirtualAlloc(NULL, alignSize(bytes, pageMinimum), MEM_COMMIT | MEM_RESERVE | MEM_LARGE_PAGES, PAGE_READWRITE); - else - throw std::runtime_error("allocLargePagesMemory - Large pages are not supported"); - if (mem == nullptr) - throw std::runtime_error(getErrorMessage("allocLargePagesMemory - VirtualAlloc")); + if (setPrivilege("SeLockMemoryPrivilege", 1, &errfunc)) + return NULL; + size_t pageMinimum = GetLargePageMinimum(); + if (!pageMinimum) { + errfunc = "No large pages"; + return NULL; + } + mem = VirtualAlloc(NULL, alignSize(bytes, pageMinimum), MEM_COMMIT | MEM_RESERVE | MEM_LARGE_PAGES, PAGE_READWRITE); #else #ifdef __APPLE__ - mem = mmap(nullptr, bytes, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, VM_FLAGS_SUPERPAGE_SIZE_2MB, 0); + mem = mmap(NULL, bytes, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, VM_FLAGS_SUPERPAGE_SIZE_2MB, 0); #elif defined(__FreeBSD__) - mem = mmap(nullptr, bytes, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS | MAP_ALIGNED_SUPER, -1, 0); + mem = mmap(NULL, bytes, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS | MAP_ALIGNED_SUPER, -1, 0); #elif defined(__OpenBSD__) || defined(__NetBSD__) mem = MAP_FAILED; // OpenBSD does not support huge pages #else - mem = mmap(nullptr, bytes, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB | MAP_POPULATE, -1, 0); + mem = mmap(NULL, bytes, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB | MAP_POPULATE, -1, 0); #endif if (mem == MAP_FAILED) - throw std::runtime_error("allocLargePagesMemory - mmap failed"); + mem = NULL; #endif return mem; } -void freePagedMemory(void* ptr, std::size_t bytes) { +void freePagedMemory(void* ptr, size_t bytes) { #if defined(_WIN32) || defined(__CYGWIN__) VirtualFree(ptr, 0, 
MEM_RELEASE);
#else
diff --git a/src/RandomX/src/virtual_memory.hpp b/src/RandomX/src/virtual_memory.h
similarity index 80%
rename from src/RandomX/src/virtual_memory.hpp
rename to src/RandomX/src/virtual_memory.h
index 9e8bc29ab..5e8e31d53 100644
--- a/src/RandomX/src/virtual_memory.hpp
+++ b/src/RandomX/src/virtual_memory.h
@@ -28,15 +28,21 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

#pragma once

-#include <cstddef>
+#ifdef __cplusplus
+extern "C" {
+#endif

-constexpr std::size_t alignSize(std::size_t pos, std::size_t align) {
- return ((pos - 1) / align + 1) * align;
-}
+#include <stddef.h>
+
+#define alignSize(pos, align) (((pos - 1) / align + 1) * align)

-void* allocMemoryPages(std::size_t);
-void setPagesRW(void*, std::size_t);
-void setPagesRX(void*, std::size_t);
-void setPagesRWX(void*, std::size_t);
-void* allocLargePagesMemory(std::size_t);
-void freePagedMemory(void*, std::size_t);
+void* allocMemoryPages(size_t);
+void setPagesRW(void*, size_t);
+void setPagesRX(void*, size_t);
+void setPagesRWX(void*, size_t);
+void* allocLargePagesMemory(size_t);
+void freePagedMemory(void*, size_t);
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/src/RandomX/vcxproj/randomx-dll.vcxproj b/src/RandomX/vcxproj/randomx-dll.vcxproj
index 8b8ea8c08..4eaae9bed 100644
--- a/src/RandomX/vcxproj/randomx-dll.vcxproj
+++ b/src/RandomX/vcxproj/randomx-dll.vcxproj
@@ -43,7 +43,7 @@
-    <ClInclude Include="..\src\virtual_memory.hpp" />
+    <ClInclude Include="..\src\virtual_memory.h" />
@@ -74,7 +74,7 @@
-    <ClCompile Include="..\src\virtual_memory.cpp" />
+    <ClCompile Include="..\src\virtual_memory.c" />
diff --git a/src/RandomX/vcxproj/randomx-dll.vcxproj.filters b/src/RandomX/vcxproj/randomx-dll.vcxproj.filters
index 68e1b8559..5b51f9f72 100644
--- a/src/RandomX/vcxproj/randomx-dll.vcxproj.filters
+++ b/src/RandomX/vcxproj/randomx-dll.vcxproj.filters
@@ -87,7 +87,7 @@
      <Filter>Header Files</Filter>
-    <ClInclude Include="..\src\virtual_memory.hpp">
+    <ClInclude Include="..\src\virtual_memory.h">
      <Filter>Header Files</Filter>
@@ -151,7 +151,7 @@
      <Filter>Source Files</Filter>
-    <ClCompile Include="..\src\virtual_memory.cpp">
+    <ClCompile Include="..\src\virtual_memory.c">
      <Filter>Source Files</Filter>
diff --git a/src/RandomX/vcxproj/randomx.vcxproj b/src/RandomX/vcxproj/randomx.vcxproj
index e0625c88b..cefdc8fb3 100644
--- a/src/RandomX/vcxproj/randomx.vcxproj
+++ b/src/RandomX/vcxproj/randomx.vcxproj
@@ -156,7 +156,7 @@ SET ERRORLEVEL = 0
-    <ClCompile Include="..\src\virtual_memory.cpp" />
+    <ClCompile Include="..\src\virtual_memory.c" />
@@ -198,7 +198,7 @@ SET ERRORLEVEL = 0
-    <ClInclude Include="..\src\virtual_memory.hpp" />
+    <ClInclude Include="..\src\virtual_memory.h" />
diff --git a/src/RandomX/vcxproj/randomx.vcxproj.filters b/src/RandomX/vcxproj/randomx.vcxproj.filters
index eb4462a59..7f055b5b8 100644
--- a/src/RandomX/vcxproj/randomx.vcxproj.filters
+++ b/src/RandomX/vcxproj/randomx.vcxproj.filters
@@ -72,7 +72,7 @@
      <Filter>Source Files</Filter>
-    <ClCompile Include="..\src\virtual_memory.cpp">
+    <ClCompile Include="..\src\virtual_memory.c">
      <Filter>Source Files</Filter>
@@ -164,7 +164,7 @@
      <Filter>Header Files</Filter>
-    <ClInclude Include="..\src\virtual_memory.hpp">
+    <ClInclude Include="..\src\virtual_memory.h">
      <Filter>Header Files</Filter>
From 1f5fb24985becdbb3c1b2dbdb59a3fd4b3a8ee35 Mon Sep 17 00:00:00 2001
From: Duke
Date: Tue, 7 Nov 2023 09:32:45 -0500
Subject: [PATCH 08/15] Revert to our custom RandomX config options

---
 src/RandomX/src/configuration.h | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/RandomX/src/configuration.h b/src/RandomX/src/configuration.h
index 84400ddce..f74a74a4c 100644
--- a/src/RandomX/src/configuration.h
+++ b/src/RandomX/src/configuration.h
@@ -32,13 +32,13 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define RANDOMX_ARGON_MEMORY 262144

//Number of Argon2d iterations for Cache initialization.
-#define RANDOMX_ARGON_ITERATIONS 3
+#define RANDOMX_ARGON_ITERATIONS 5

//Number of parallel lanes for Cache initialization.
#define RANDOMX_ARGON_LANES 1

//Argon2d salt
-#define RANDOMX_ARGON_SALT "RandomX\x03"
+#define RANDOMX_ARGON_SALT "RandomXHUSH\x03"

//Number of random Cache accesses per Dataset item. Minimum is 2.
#define RANDOMX_CACHE_ACCESSES 8
@@ -53,13 +53,13 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define RANDOMX_DATASET_EXTRA_SIZE 33554368

//Number of instructions in a RandomX program. Must be divisible by 8.
-#define RANDOMX_PROGRAM_SIZE 256
+#define RANDOMX_PROGRAM_SIZE 512

//Number of iterations during VM execution.
-#define RANDOMX_PROGRAM_ITERATIONS 2048
+#define RANDOMX_PROGRAM_ITERATIONS 4096

//Number of chained VM executions per hash.
-#define RANDOMX_PROGRAM_COUNT 8
+#define RANDOMX_PROGRAM_COUNT 16

//Scratchpad L3 size in bytes. Must be a power of 2.
#define RANDOMX_SCRATCHPAD_L3 2097152
From 0ed63ecdad40b812a312cd2582427ac2ab0abb9f Mon Sep 17 00:00:00 2001
From: duke
Date: Sun, 12 Nov 2023 04:27:28 +0000
Subject: [PATCH 09/15] Update 'doc/release-process.md'

---
 doc/release-process.md | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/doc/release-process.md b/doc/release-process.md
index 08cedae03..c25792fbf 100644
--- a/doc/release-process.md
+++ b/doc/release-process.md
@@ -124,3 +124,9 @@ Install deps on Linux:
Use `./util/build-mac.sh` to compile on Apple/Mac systems, use `./util/build-win.sh` to build on Windows and `./util/build-arm.sh` to build on ARMv8 systems. Use `./util/build-debian-package.sh aarch64` to build a Debian package for aarch64.
+
+## Optional things
+
+### Updating RandomX
+
+If you need to update the source code of our in-tree copy of RandomX, see issue https://git.hush.is/hush/hush3/issues/337#issuecomment-5114 for details. Currently we use RandomX v1.2.1 from the official repo at https://github.com/tevador/RandomX/releases/tag/v1.2.1
\ No newline at end of file
From 034002f2e9b00441d72d0feae4bf43796137fccd Mon Sep 17 00:00:00 2001
From: duke
Date: Sun, 12 Nov 2023 05:23:34 +0000
Subject: [PATCH 10/15] Update 'doc/relnotes/README.md'

---
 doc/relnotes/README.md | 26 ++++++++++++++++++++++++++
 1 file changed, 26 insertions(+)

diff --git a/doc/relnotes/README.md b/doc/relnotes/README.md
index 294f3d299..13e96074d 100644
--- a/doc/relnotes/README.md
+++ b/doc/relnotes/README.md
@@ -10,6 +10,32 @@ and no longer on Github, since they banned Duke Leto
and also because they censor many people around the world and work
with evil organizations.

+# Hush 3.10.0 ""
+
+This is a MANDATORY release for Hush and nodes must upgrade by block height X, which will happen
+on approximately Y. This is an OPTIONAL release for DragonX but it is highly recommended for
+exchanges, solo miners and mining pools to update to this release.
+
+
+ * Hush and all Hush Smart Chains now use less RAM https://git.hush.is/hush/hush3/issues/283
+ * Hush full nodes will use ~2GB less RAM
+ * DragonX full nodes will use ~30MB less RAM
+ * Antispam defenses
+ * Hush and all Hush Smart Chains now make it harder and more expensive for an attacker to send shielded spam. This raises the cost in CPU and transaction fees for Denial-of-Service attacks.
+ * New RPC `z_getstats` which reports data about the number of shielded inputs (zins) and shielded outputs (zouts) in transactions.
+ * Fix a bug where `hush-cli stop` would not stop the node during the "Building Witnesses" rescan phase
+ * Fix bugs where `abortrescan` couldn't be used while the node is starting up (RPC warmup) and where it could not abort the rescan if it was in the "Building Witnesses" phase
+ * Upgraded curl to 8.4.0
+ * DragonX specific changes:
+ * Updated to latest RandomX v1.2.1 which includes mining optimizations
+ * Fix RandomX mining memory leak
+ * This avoids an out-of-memory crash when miners change the number of mining threads
+ * Fixing this bug led to a 10% hashrate increase vs the previous release
+ * Fixed quoting bugs with dragonx-cli script
+ * For instance, many RPCs such as `dragonx z_sendmany ...` would not previously work because the arguments to the RPC were not quoted correctly.
+
+
# Hush 3.9.4 "Maniacal Manticore"

```
From 51aa8e1afd6849985ce5a0cfb90b411cf966422e Mon Sep 17 00:00:00 2001
From: duke
Date: Sun, 12 Nov 2023 05:29:55 +0000
Subject: [PATCH 11/15] Update 'doc/relnotes/README.md'

---
 doc/relnotes/README.md | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/doc/relnotes/README.md b/doc/relnotes/README.md
index 13e96074d..a8b3d4b04 100644
--- a/doc/relnotes/README.md
+++ b/doc/relnotes/README.md
@@ -21,13 +21,14 @@ exchanges, solo miners and mining pools to update to this release.
 * Hush full nodes will use ~2GB less RAM
 * DragonX full nodes will use ~30MB less RAM
 * Antispam defenses
- * Hush and all Hush Smart Chains now make it harder and more expensive for an attacker to send shielded spam. This raises the cost in CPU and transaction fees for Denial-of-Service attacks.
+ * Hush and all Hush Smart Chains now make it harder and more expensive for an attacker to send shielded spam. This raises the cost in CPU https://git.hush.is/hush/hush3/commit/14d3ae17851615a69c33cb7eed623b904b140e3d and transaction fees https://git.hush.is/hush/hush3/commit/2308db22eec78d0a10bde0f674243b2700d59e4a for Denial-of-Service attacks.
 * New RPC `z_getstats` which reports data about the number of shielded inputs (zins) and shielded outputs (zouts) in transactions.
 * Fix a bug where `hush-cli stop` would not stop the node during the "Building Witnesses" rescan phase
 * Fix bugs where `abortrescan` couldn't be used while the node is starting up (RPC warmup) and where it could not abort the rescan if it was in the "Building Witnesses" phase
+ * Fix bug in `z_mergetoaddress` where docs said you could use `ANY_ZADDR` but you couldn't https://git.hush.is/hush/hush3/commit/7eb9d75b94469c3fc8c028f29b35be9ac764a10c
 * Upgraded curl to 8.4.0
 * DragonX specific changes:
- * Updated to latest RandomX v1.2.1 which includes mining optimizations
+ * Updated to latest RandomX v1.2.1 which includes mining optimizations https://git.hush.is/hush/hush3/commit/6029b3d571009991ae9c4aea0397f4d00be6a817 https://git.hush.is/hush/hush3/issues/337
 * Fix RandomX mining memory leak
 * This avoids an out-of-memory crash when miners change the number of mining threads
 * Fixing this bug led to a 10% hashrate increase vs the previous release
 * Fixed quoting bugs with dragonx-cli script
 * For instance, many RPCs such as `dragonx z_sendmany ...` would not previously work because the arguments to the RPC were not quoted correctly.
From 2d638e5fccbde9c3d3981411304b94989db1027e Mon Sep 17 00:00:00 2001
From: duke
Date: Sun, 12 Nov 2023 05:45:21 +0000
Subject: [PATCH 12/15] Update 'doc/relnotes/README.md'

---
 doc/relnotes/README.md | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/doc/relnotes/README.md b/doc/relnotes/README.md
index a8b3d4b04..8c5f45b7d 100644
--- a/doc/relnotes/README.md
+++ b/doc/relnotes/README.md
@@ -22,15 +22,17 @@ exchanges, solo miners and mining pools to update to this release.
 * DragonX full nodes will use ~30MB less RAM
 * Antispam defenses
 * Hush and all Hush Smart Chains now make it harder and more expensive for an attacker to send shielded spam. This raises the cost in CPU https://git.hush.is/hush/hush3/commit/14d3ae17851615a69c33cb7eed623b904b140e3d and transaction fees https://git.hush.is/hush/hush3/commit/2308db22eec78d0a10bde0f674243b2700d59e4a for Denial-of-Service attacks.
- * New RPC `z_getstats` which reports data about the number of shielded inputs (zins) and shielded outputs (zouts) in transactions.
- * Fix a bug where `hush-cli stop` would not stop the node during the "Building Witnesses" rescan phase
- * Fix bugs where `abortrescan` couldn't be used while the node is starting up (RPC warmup) and where it could not abort the rescan if it was in the "Building Witnesses" phase
+ * New RPC `z_getstats` which reports data about the number of shielded inputs (zins) and shielded outputs (zouts) in transactions. https://git.hush.is/hush/hush3/commit/96ae2d61ca5a392cb476da4c7f6ab1f638839a7f
+ * Fix a bug where `hush-cli stop` would not stop the node during the "Building Witnesses" rescan phase https://git.hush.is/hush/hush3/issues/330
+ * Fix bugs where `abortrescan` couldn't be used while the node is starting up (RPC warmup) and where it could not abort the rescan if it was in the "Building Witnesses" phase https://git.hush.is/hush/hush3/issues/331
 * Fix bug in `z_mergetoaddress` where docs said you could use `ANY_ZADDR` but you couldn't https://git.hush.is/hush/hush3/commit/7eb9d75b94469c3fc8c028f29b35be9ac764a10c
- * Upgraded curl to 8.4.0
+ * Upgraded curl to 8.4.0 https://git.hush.is/hush/hush3/issues/325
+ * This fixes CVE-2023-38545 which affects very few or potentially no Hush/DragonX users. It could only affect people who compile Hush full node software (not those who use binaries or packages) and who use a malicious SOCKS5 proxy for all network traffic via the operating system.
 * DragonX specific changes:
 * Updated to latest RandomX v1.2.1 which includes mining optimizations https://git.hush.is/hush/hush3/commit/6029b3d571009991ae9c4aea0397f4d00be6a817 https://git.hush.is/hush/hush3/issues/337
- * Fix RandomX mining memory leak
- * This avoids an out-of-memory crash when miners change the number of mining threads
+ * Fix RandomX mining memory leak and crash https://git.hush.is/hush/hush3/issues/324
+ * This fixes the bug where stopping mining, making a transaction and then starting mining again would crash the node
+ * This also avoids an out-of-memory crash when miners change the number of mining threads
 * Fixing this bug led to a 10% hashrate increase vs the previous release
 * Fixed quoting bugs with dragonx-cli script
 * For instance, many RPCs such as `dragonx z_sendmany ...` would not previously work because the arguments to the RPC were not quoted correctly.
From 4570277b5262f6f5b65d51e6020df46a65e10f4e Mon Sep 17 00:00:00 2001
From: Duke
Date: Sun, 12 Nov 2023 14:30:52 -0500
Subject: [PATCH 13/15] Bump protocol version

---
 src/version.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/version.h b/src/version.h
index 2f44a9d9d..504e38f4a 100644
--- a/src/version.h
+++ b/src/version.h
@@ -21,7 +21,7 @@
 #define HUSH_VERSION_H

 // network protocol versioning
-static const int PROTOCOL_VERSION = 1987425;
+static const int PROTOCOL_VERSION = 1987426;
 //! initial proto version, to be increased after version/verack negotiation
 static const int INIT_PROTO_VERSION = 209;
 //! In this version, 'getheaders' was introduced.
From 06876b1cc9c51c13c510f174107622312b0567d3 Mon Sep 17 00:00:00 2001
From: Duke
Date: Mon, 13 Nov 2023 08:40:04 -0500
Subject: [PATCH 14/15] Bump version to 3.10.0

---
 configure.ac | 4 ++--
 src/clientversion.h | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/configure.ac b/configure.ac
index 728795f41..a972c327e 100644
--- a/configure.ac
+++ b/configure.ac
@@ -2,8 +2,8 @@ dnl require autoconf 2.60 (AS_ECHO/AS_ECHO_N)
 AC_PREREQ([2.60])
 define(_CLIENT_VERSION_MAJOR, 3) dnl Must be kept in sync with src/clientversion.h , ugh!
-define(_CLIENT_VERSION_MINOR, 9)
-define(_CLIENT_VERSION_REVISION, 4)
+define(_CLIENT_VERSION_MINOR, 10)
+define(_CLIENT_VERSION_REVISION, 0)
 define(_CLIENT_VERSION_BUILD, 50)
 define(_ZC_BUILD_VAL, m4_if(m4_eval(_CLIENT_VERSION_BUILD < 25), 1, m4_incr(_CLIENT_VERSION_BUILD), m4_eval(_CLIENT_VERSION_BUILD < 50), 1, m4_eval(_CLIENT_VERSION_BUILD - 24), m4_eval(_CLIENT_VERSION_BUILD == 50), 1, , m4_eval(_CLIENT_VERSION_BUILD - 50)))
 define(_CLIENT_VERSION_SUFFIX, m4_if(m4_eval(_CLIENT_VERSION_BUILD < 25), 1, _CLIENT_VERSION_REVISION-beta$1, m4_eval(_CLIENT_VERSION_BUILD < 50), 1, _CLIENT_VERSION_REVISION-rc$1, m4_eval(_CLIENT_VERSION_BUILD == 50), 1, _CLIENT_VERSION_REVISION, _CLIENT_VERSION_REVISION-$1)))
diff --git a/src/clientversion.h b/src/clientversion.h
index 235661630..ff530f64c 100644
--- a/src/clientversion.h
+++ b/src/clientversion.h
@@ -29,8 +29,8 @@
 //! These need to be macros, as clientversion.cpp's and bitcoin*-res.rc's voodoo requires it
 // Must be kept in sync with configure.ac , ugh!
 #define CLIENT_VERSION_MAJOR 3
-#define CLIENT_VERSION_MINOR 9
-#define CLIENT_VERSION_REVISION 4
+#define CLIENT_VERSION_MINOR 10
+#define CLIENT_VERSION_REVISION 0
 #define CLIENT_VERSION_BUILD 50
 //!
Set to true for release, false for prerelease or test build From 94a48329addd2d1d028e6d183976a0ec1dbab24d Mon Sep 17 00:00:00 2001 From: Duke Date: Mon, 13 Nov 2023 10:22:04 -0500 Subject: [PATCH 15/15] Return help output if no address is given to z_listreceivedaddress --- src/wallet/rpcwallet.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/wallet/rpcwallet.cpp b/src/wallet/rpcwallet.cpp index fb4f24eb3..1442aaa92 100644 --- a/src/wallet/rpcwallet.cpp +++ b/src/wallet/rpcwallet.cpp @@ -3304,7 +3304,7 @@ UniValue z_listreceivedaddress(const UniValue& params, bool fHelp,const CPubKey& if (!EnsureWalletIsAvailable(fHelp)) return NullUniValue; - if (fHelp || params.size() > 5 || params.size() == 3) + if (fHelp || params.size() > 5 || params.size() < 1) throw runtime_error( "z_listreceivedaddress\n" "\nReturns received outputs.\n" @@ -3375,7 +3375,7 @@ UniValue z_listreceivedaddress(const UniValue& params, bool fHelp,const CPubKey& UniValue ret(UniValue::VARR); - //param values` + //param values int64_t nMinConfirms = 0; int64_t nFilterType = 0; int64_t nFilter = 999999;
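
A note for reviewers of the RandomX v1.2.1 update in this series: below is a minimal sketch of how a caller can exercise the new `randomx_calculate_commitment()` API added above. The setup and teardown calls (`randomx_get_flags`, `randomx_alloc_cache`, `randomx_init_cache`, `randomx_create_vm`, `randomx_destroy_vm`, `randomx_release_cache`) are the standard public randomx.h API; the key and input strings are illustrative placeholders, not values used anywhere in Hush or DragonX.

```
/* Sketch: computing a commitment with the new randomx.h API.
 * The key and input strings below are hypothetical placeholders. */
#include <assert.h>
#include <stdio.h>
#include "randomx.h"

int main(void) {
	const char key[]   = "example key";    /* hypothetical cache key */
	const char input[] = "example input";  /* hypothetical hashed data */
	char hash[RANDOMX_HASH_SIZE];
	char commitment[RANDOMX_HASH_SIZE];

	randomx_flags flags = randomx_get_flags();
	randomx_cache *cache = randomx_alloc_cache(flags);
	assert(cache != NULL);
	randomx_init_cache(cache, key, sizeof(key) - 1);
	randomx_vm *vm = randomx_create_vm(flags, cache, NULL);
	assert(vm != NULL);

	/* Hash first, then bind the hash to the input that produced it. */
	randomx_calculate_hash(vm, input, sizeof(input) - 1, hash);
	randomx_calculate_commitment(input, sizeof(input) - 1, hash, commitment);

	for (int i = 0; i < RANDOMX_HASH_SIZE; ++i)
		printf("%02x", (unsigned char)commitment[i]);
	printf("\n");

	randomx_destroy_vm(vm);
	randomx_release_cache(cache);
	return 0;
}
```

As the implementation in randomx.cpp shows, the commitment is simply blake2b over the input concatenated with the hash, so verifiers can recompute it without a VM once they already have the hash.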
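The rewritten `randomx_reciprocal()` in reciprocal.c computes floor(2^(63+n) / divisor), where n is the bit length of the divisor, replacing the old bit-by-bit long division. A quick self-contained sanity check of that identity is sketched below; it assumes a GCC/Clang-style `unsigned __int128` and `__builtin_clz`, and deliberately skips power-of-two divisors, which callers already guard against with `isZeroOrPowerOf2()`.

```
/* Sanity-check sketch: randomx_reciprocal(d) == floor(2^(63+n) / d),
 * where n is the bit length of d. Assumes unsigned __int128 (GCC/Clang).
 * Powers of two are excluded: for those the result would not fit in
 * 64 bits, which is why the JIT guards with isZeroOrPowerOf2(). */
#include <assert.h>
#include <stdint.h>

static uint64_t reciprocal(uint32_t divisor) { /* same math as reciprocal.c */
	const uint64_t p2exp63 = 1ULL << 63;
	const uint64_t q = p2exp63 / divisor;
	const uint64_t r = p2exp63 % divisor;
	const uint32_t shift = 64 - __builtin_clzll(divisor);
	return (q << shift) + ((r << shift) / divisor);
}

int main(void) {
	/* d = 3, 10, 17, ... never hits a power of two (2^m mod 7 is never 3). */
	for (uint32_t d = 3; d < 100000; d += 7) {
		uint32_t n = 32 - __builtin_clz(d); /* bit length of d */
		uint64_t expected = (uint64_t)(((unsigned __int128)1 << (63 + n)) / d);
		assert(reciprocal(d) == expected);
	}
	return 0;
}
```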