From 6029b3d571009991ae9c4aea0397f4d00be6a817 Mon Sep 17 00:00:00 2001 From: Duke Date: Tue, 7 Nov 2023 09:09:48 -0500 Subject: [PATCH] Update to RandomX v1.2.1 Commit 102f8acf90a7649ada410de5499a7ec62e49e1da --- src/RandomX/CMakeLists.txt | 40 +- src/RandomX/README.md | 5 +- src/RandomX/doc/tevador.asc | 18 +- src/RandomX/src/allocator.cpp | 2 +- src/RandomX/src/assembly_generator_x86.cpp | 2 +- src/RandomX/src/bytecode_machine.cpp | 2 +- src/RandomX/src/common.hpp | 7 + src/RandomX/src/configuration.h | 10 +- src/RandomX/src/dataset.cpp | 2 +- src/RandomX/src/intrin_portable.h | 12 +- src/RandomX/src/jit_compiler.hpp | 42 +- src/RandomX/src/jit_compiler_a64.cpp | 74 +- src/RandomX/src/jit_compiler_a64.hpp | 2 +- src/RandomX/src/jit_compiler_a64_static.S | 98 +- src/RandomX/src/jit_compiler_rv64.cpp | 1175 ++++++++++++++++ src/RandomX/src/jit_compiler_rv64.hpp | 69 + src/RandomX/src/jit_compiler_rv64_static.S | 1235 +++++++++++++++++ src/RandomX/src/jit_compiler_rv64_static.hpp | 53 + src/RandomX/src/jit_compiler_x86.cpp | 4 +- src/RandomX/src/randomx.cpp | 28 + src/RandomX/src/randomx.h | 11 + src/RandomX/src/reciprocal.c | 34 +- src/RandomX/src/reciprocal.h | 4 +- src/RandomX/src/tests/benchmark.cpp | 32 +- src/RandomX/src/tests/perf-simulation.cpp | 2 +- src/RandomX/src/tests/riscv64_zba.s | 9 + src/RandomX/src/tests/riscv64_zbb.s | 9 + src/RandomX/src/tests/tests.cpp | 24 + .../{virtual_memory.cpp => virtual_memory.c} | 153 +- .../{virtual_memory.hpp => virtual_memory.h} | 26 +- src/RandomX/vcxproj/randomx-dll.vcxproj | 4 +- .../vcxproj/randomx-dll.vcxproj.filters | 4 +- src/RandomX/vcxproj/randomx.vcxproj | 4 +- src/RandomX/vcxproj/randomx.vcxproj.filters | 4 +- 34 files changed, 2966 insertions(+), 234 deletions(-) create mode 100644 src/RandomX/src/jit_compiler_rv64.cpp create mode 100644 src/RandomX/src/jit_compiler_rv64.hpp create mode 100644 src/RandomX/src/jit_compiler_rv64_static.S create mode 100644 src/RandomX/src/jit_compiler_rv64_static.hpp create 
mode 100644 src/RandomX/src/tests/riscv64_zba.s create mode 100644 src/RandomX/src/tests/riscv64_zbb.s rename src/RandomX/src/{virtual_memory.cpp => virtual_memory.c} (54%) rename src/RandomX/src/{virtual_memory.hpp => virtual_memory.h} (80%) diff --git a/src/RandomX/CMakeLists.txt b/src/RandomX/CMakeLists.txt index f41f606b9..ebbdff2b6 100644 --- a/src/RandomX/CMakeLists.txt +++ b/src/RandomX/CMakeLists.txt @@ -39,7 +39,7 @@ src/bytecode_machine.cpp src/cpu.cpp src/dataset.cpp src/soft_aes.cpp -src/virtual_memory.cpp +src/virtual_memory.c src/vm_interpreted.cpp src/allocator.cpp src/assembly_generator_x86.cpp @@ -96,7 +96,7 @@ function(add_flag flag) endfunction() # x86-64 -if(ARCH_ID STREQUAL "x86_64" OR ARCH_ID STREQUAL "x86-64" OR ARCH_ID STREQUAL "amd64") +if ((CMAKE_SIZEOF_VOID_P EQUAL 8) AND (ARCH_ID STREQUAL "x86_64" OR ARCH_ID STREQUAL "x86-64" OR ARCH_ID STREQUAL "amd64")) list(APPEND randomx_sources src/jit_compiler_x86.cpp) @@ -173,6 +173,42 @@ if(ARM_ID STREQUAL "aarch64" OR ARM_ID STREQUAL "arm64" OR ARM_ID STREQUAL "armv endif() endif() +# RISC-V +if(ARCH_ID STREQUAL "riscv64") + list(APPEND randomx_sources + src/jit_compiler_rv64_static.S + src/jit_compiler_rv64.cpp) + # cheat because cmake and ccache hate each other + set_property(SOURCE src/jit_compiler_rv64_static.S PROPERTY LANGUAGE C) + set_property(SOURCE src/jit_compiler_rv64_static.S PROPERTY XCODE_EXPLICIT_FILE_TYPE sourcecode.asm) + + # default build uses the RV64GC baseline + set(RVARCH "rv64gc") + + # for native builds, enable Zba and Zbb if supported by the CPU + if(ARCH STREQUAL "native") + enable_language(ASM) + try_run(RANDOMX_ZBA_RUN_FAIL + RANDOMX_ZBA_COMPILE_OK + ${CMAKE_CURRENT_BINARY_DIR}/ + ${CMAKE_CURRENT_SOURCE_DIR}/src/tests/riscv64_zba.s + COMPILE_DEFINITIONS "-march=rv64gc_zba") + if (RANDOMX_ZBA_COMPILE_OK AND NOT RANDOMX_ZBA_RUN_FAIL) + set(RVARCH "${RVARCH}_zba") + endif() + try_run(RANDOMX_ZBB_RUN_FAIL + RANDOMX_ZBB_COMPILE_OK + ${CMAKE_CURRENT_BINARY_DIR}/ + 
${CMAKE_CURRENT_SOURCE_DIR}/src/tests/riscv64_zbb.s + COMPILE_DEFINITIONS "-march=rv64gc_zbb") + if (RANDOMX_ZBB_COMPILE_OK AND NOT RANDOMX_ZBB_RUN_FAIL) + set(RVARCH "${RVARCH}_zbb") + endif() + endif() + + add_flag("-march=${RVARCH}") +endif() + set(RANDOMX_INCLUDE "${CMAKE_CURRENT_SOURCE_DIR}/src" CACHE STRING "RandomX Include path") add_library(randomx ${randomx_sources}) diff --git a/src/RandomX/README.md b/src/RandomX/README.md index 4c1dabb65..2c9bdd318 100644 --- a/src/RandomX/README.md +++ b/src/RandomX/README.md @@ -37,7 +37,7 @@ RandomX is written in C++11 and builds a static library with a C API provided by ### Linux -Build dependencies: `cmake` (minimum 2.8.7) and `gcc` (minimum version 4.8, but version 7+ is recommended). +Build dependencies: `cmake` (minimum 3.5) and `gcc` (minimum version 4.8, but version 7+ is recommended). To build optimized binaries for your machine, run: ``` @@ -82,7 +82,7 @@ Intel Core i7-8550U|16G DDR4-2400|Windows 10|hw|200 (4T)|1700 (4T)|350 (8T)| Intel Core i3-3220|4G DDR3-1333|Ubuntu 16.04|soft|42 (4T)|510 (4T)|150 (4T)| Raspberry Pi 3|1G LPDDR2|Ubuntu 16.04|soft|3.5 (4T)|-|20 (4T)| -Note that RandomX currently includes a JIT compiler for x86-64 and ARM64. Other architectures have to use the portable interpreter, which is much slower. +Note that RandomX currently includes a JIT compiler for x86-64, ARM64 and RISCV64. Other architectures have to use the portable interpreter, which is much slower. ### GPU performance @@ -129,6 +129,7 @@ The reference implementation has been validated on the following platforms: * ARMv7+VFPv3 (32-bit, little-endian) * ARMv8 (64-bit, little-endian) * PPC64 (64-bit, big-endian) +* RISCV64 (64-bit, little-endian) ### Can FPGAs mine RandomX? 
diff --git a/src/RandomX/doc/tevador.asc b/src/RandomX/doc/tevador.asc index b998f1ef2..8bada54bb 100644 --- a/src/RandomX/doc/tevador.asc +++ b/src/RandomX/doc/tevador.asc @@ -1,13 +1,13 @@ -----BEGIN PGP PUBLIC KEY BLOCK----- mDMEXd+PeBYJKwYBBAHaRw8BAQdAZ0nqJ+nRYoScG2QLX62pl+WO1+Mkv6Yyt2Kb -ntGUuLq0G3RldmFkb3IgPHRldmFkb3JAZ21haWwuY29tPoiWBBMWCAA+FiEEMoWj -LVEwdmMs6CUQWijIaue9c6YFAl3fj3gCGwMFCQWnqDgFCwkIBwIGFQoJCAsCBBYC -AwECHgECF4AACgkQWijIaue9c6YBFQD+N1XTUqSCZp9jB/yTHQ9ahSaIUMtmuvdT -So2s+quudP4A/R5wLwukpfGN9UZ4cfpmKCJ9jO1HJ2udmlGMsJbQpDAIuDgEXd+P +ntGUuLq0G3RldmFkb3IgPHRldmFkb3JAZ21haWwuY29tPoiWBBMWCAA+AhsDBQsJ +CAcCBhUKCQgLAgQWAgMBAh4BAheAFiEEMoWjLVEwdmMs6CUQWijIaue9c6YFAmRP +r8MFCQ/ZS2YACgkQWijIaue9c6bR5gEA0tnQ4Al+yOLoRUBQitAV8FU4FLy8Xx8U +IyyivjJ0UhIA/2jwJfMXmJdMKtar8xfIA5mZLLofkEP6hug4knhitpkBuDgEXd+P eBIKKwYBBAGXVQEFAQEHQBNbQuPcDojMCkRb5B5u7Ld/AFLClOh+6ElL+u61rIY/ -AwEIB4h+BBgWCAAmFiEEMoWjLVEwdmMs6CUQWijIaue9c6YFAl3fj3gCGwwFCQWn -qDgACgkQWijIaue9c6YJvgD+IY1Q9mCM1P1iZIoXuafRihXJ7UgVXpQqW2yoaUT3 -bfQA/RkisI2eElYoOjdwPszPP6VfL5+SViwDmDuJG2P5llgE -=V4vd ------END PGP PUBLIC KEY BLOCK----- +AwEIB4h+BBgWCAAmAhsMFiEEMoWjLVEwdmMs6CUQWijIaue9c6YFAmRQoAMFCQ/Z +S2YACgkQWijIaue9c6bUfwD9Hw20kGCaZ8rWghz9W3bc645ys1vPQpQW28CD9w3B +cTMBALsV1xpS2pGwTfn1PUimqESZfTrREmNvOjKSQwe0yicI +=D4lm +-----END PGP PUBLIC KEY BLOCK----- \ No newline at end of file diff --git a/src/RandomX/src/allocator.cpp b/src/RandomX/src/allocator.cpp index 6b48a7e70..bcee0f6b6 100644 --- a/src/RandomX/src/allocator.cpp +++ b/src/RandomX/src/allocator.cpp @@ -29,7 +29,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include #include "allocator.hpp" #include "intrin_portable.h" -#include "virtual_memory.hpp" +#include "virtual_memory.h" #include "common.hpp" namespace randomx { diff --git a/src/RandomX/src/assembly_generator_x86.cpp b/src/RandomX/src/assembly_generator_x86.cpp index e7e5258b7..1ce31dd55 100644 --- a/src/RandomX/src/assembly_generator_x86.cpp +++ b/src/RandomX/src/assembly_generator_x86.cpp @@ -445,7 +445,7 @@ namespace randomx { } void AssemblyGeneratorX86::h_IMUL_RCP(Instruction& instr, int i) { - uint64_t divisor = instr.getImm32(); + const uint32_t divisor = instr.getImm32(); if (!isZeroOrPowerOf2(divisor)) { registerUsage[instr.dst] = i; asmCode << "\tmov rax, " << randomx_reciprocal(divisor) << std::endl; diff --git a/src/RandomX/src/bytecode_machine.cpp b/src/RandomX/src/bytecode_machine.cpp index 7d8e902d2..1d00d0959 100644 --- a/src/RandomX/src/bytecode_machine.cpp +++ b/src/RandomX/src/bytecode_machine.cpp @@ -243,7 +243,7 @@ namespace randomx { } if (opcode < ceil_IMUL_RCP) { - uint64_t divisor = instr.getImm32(); + const uint32_t divisor = instr.getImm32(); if (!isZeroOrPowerOf2(divisor)) { auto dst = instr.dst % RegistersCount; ibc.type = InstructionType::IMUL_R; diff --git a/src/RandomX/src/common.hpp b/src/RandomX/src/common.hpp index a77feb3bf..f4b85342a 100644 --- a/src/RandomX/src/common.hpp +++ b/src/RandomX/src/common.hpp @@ -116,12 +116,19 @@ namespace randomx { #if defined(_M_X64) || defined(__x86_64__) #define RANDOMX_HAVE_COMPILER 1 + #define RANDOMX_COMPILER_X86 class JitCompilerX86; using JitCompiler = JitCompilerX86; #elif defined(__aarch64__) #define RANDOMX_HAVE_COMPILER 1 + #define RANDOMX_COMPILER_A64 class JitCompilerA64; using JitCompiler = JitCompilerA64; +#elif defined(__riscv) && __riscv_xlen == 64 + #define RANDOMX_HAVE_COMPILER 1 + #define RANDOMX_COMPILER_RV64 + class JitCompilerRV64; + using JitCompiler = JitCompilerRV64; #else #define RANDOMX_HAVE_COMPILER 0 class JitCompilerFallback; diff --git 
a/src/RandomX/src/configuration.h b/src/RandomX/src/configuration.h index f74a74a4c..84400ddce 100644 --- a/src/RandomX/src/configuration.h +++ b/src/RandomX/src/configuration.h @@ -32,13 +32,13 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define RANDOMX_ARGON_MEMORY 262144 //Number of Argon2d iterations for Cache initialization. -#define RANDOMX_ARGON_ITERATIONS 5 +#define RANDOMX_ARGON_ITERATIONS 3 //Number of parallel lanes for Cache initialization. #define RANDOMX_ARGON_LANES 1 //Argon2d salt -#define RANDOMX_ARGON_SALT "RandomXHUSH\x03" +#define RANDOMX_ARGON_SALT "RandomX\x03" //Number of random Cache accesses per Dataset item. Minimum is 2. #define RANDOMX_CACHE_ACCESSES 8 @@ -53,13 +53,13 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define RANDOMX_DATASET_EXTRA_SIZE 33554368 //Number of instructions in a RandomX program. Must be divisible by 8. -#define RANDOMX_PROGRAM_SIZE 512 +#define RANDOMX_PROGRAM_SIZE 256 //Number of iterations during VM execution. -#define RANDOMX_PROGRAM_ITERATIONS 4096 +#define RANDOMX_PROGRAM_ITERATIONS 2048 //Number of chained VM executions per hash. -#define RANDOMX_PROGRAM_COUNT 16 +#define RANDOMX_PROGRAM_COUNT 8 //Scratchpad L3 size in bytes. Must be a power of 2. #define RANDOMX_SCRATCHPAD_L3 2097152 diff --git a/src/RandomX/src/dataset.cpp b/src/RandomX/src/dataset.cpp index 675c5abc5..7ebf1bca4 100644 --- a/src/RandomX/src/dataset.cpp +++ b/src/RandomX/src/dataset.cpp @@ -42,7 +42,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.hpp" #include "dataset.hpp" -#include "virtual_memory.hpp" +#include "virtual_memory.h" #include "superscalar.hpp" #include "blake2_generator.hpp" #include "reciprocal.h" diff --git a/src/RandomX/src/intrin_portable.h b/src/RandomX/src/intrin_portable.h index 8c09ae885..50020c3e2 100644 --- a/src/RandomX/src/intrin_portable.h +++ b/src/RandomX/src/intrin_portable.h @@ -349,7 +349,7 @@ FORCE_INLINE rx_vec_i128 rx_load_vec_i128(rx_vec_i128 const *p) { #if defined(NATIVE_LITTLE_ENDIAN) return *p; #else - uint32_t* ptr = (uint32_t*)p; + const uint32_t* ptr = (const uint32_t*)p; vec_u c; c.u32[0] = load32(ptr + 0); c.u32[1] = load32(ptr + 1); @@ -375,8 +375,8 @@ FORCE_INLINE void rx_store_vec_i128(rx_vec_i128 *p, rx_vec_i128 b) { FORCE_INLINE rx_vec_f128 rx_cvt_packed_int_vec_f128(const void* addr) { vec_u x; - x.d64[0] = (double)unsigned32ToSigned2sCompl(load32((uint8_t*)addr + 0)); - x.d64[1] = (double)unsigned32ToSigned2sCompl(load32((uint8_t*)addr + 4)); + x.d64[0] = (double)unsigned32ToSigned2sCompl(load32((const uint8_t*)addr + 0)); + x.d64[1] = (double)unsigned32ToSigned2sCompl(load32((const uint8_t*)addr + 4)); return (rx_vec_f128)x.d; } @@ -684,7 +684,7 @@ FORCE_INLINE rx_vec_i128 rx_load_vec_i128(rx_vec_i128 const* p) { #if defined(NATIVE_LITTLE_ENDIAN) return *p; #else - uint32_t* ptr = (uint32_t*)p; + const uint32_t* ptr = (const uint32_t*)p; rx_vec_i128 c; c.u32[0] = load32(ptr + 0); c.u32[1] = load32(ptr + 1); @@ -708,8 +708,8 @@ FORCE_INLINE void rx_store_vec_i128(rx_vec_i128 *p, rx_vec_i128 b) { FORCE_INLINE rx_vec_f128 rx_cvt_packed_int_vec_f128(const void* addr) { rx_vec_f128 x; - x.lo = (double)unsigned32ToSigned2sCompl(load32((uint8_t*)addr + 0)); - x.hi = (double)unsigned32ToSigned2sCompl(load32((uint8_t*)addr + 4)); + x.lo = (double)unsigned32ToSigned2sCompl(load32((const uint8_t*)addr + 0)); + x.hi = (double)unsigned32ToSigned2sCompl(load32((const uint8_t*)addr + 4)); return x; } diff --git a/src/RandomX/src/jit_compiler.hpp 
b/src/RandomX/src/jit_compiler.hpp index 17fdad4e3..5b76fa5f9 100644 --- a/src/RandomX/src/jit_compiler.hpp +++ b/src/RandomX/src/jit_compiler.hpp @@ -28,10 +28,48 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #pragma once -#if defined(_M_X64) || defined(__x86_64__) +#include "common.hpp" + +namespace randomx { + + struct CodeBuffer { + uint8_t* code; + int32_t codePos; + int32_t rcpCount; + + void emit(const uint8_t* src, int32_t len) { + memcpy(&code[codePos], src, len); + codePos += len; + } + + template + void emit(T src) { + memcpy(&code[codePos], &src, sizeof(src)); + codePos += sizeof(src); + } + + void emitAt(int32_t codePos, const uint8_t* src, int32_t len) { + memcpy(&code[codePos], src, len); + } + + template + void emitAt(int32_t codePos, T src) { + memcpy(&code[codePos], &src, sizeof(src)); + } + }; + + struct CompilerState : public CodeBuffer { + int32_t instructionOffsets[RANDOMX_PROGRAM_SIZE]; + int registerUsage[RegistersCount]; + }; +} + +#if defined(RANDOMX_COMPILER_X86) #include "jit_compiler_x86.hpp" -#elif defined(__aarch64__) +#elif defined(RANDOMX_COMPILER_A64) #include "jit_compiler_a64.hpp" +#elif defined(RANDOMX_COMPILER_RV64) +#include "jit_compiler_rv64.hpp" #else #include "jit_compiler_fallback.hpp" #endif diff --git a/src/RandomX/src/jit_compiler_a64.cpp b/src/RandomX/src/jit_compiler_a64.cpp index fc4634868..5be8f6e42 100644 --- a/src/RandomX/src/jit_compiler_a64.cpp +++ b/src/RandomX/src/jit_compiler_a64.cpp @@ -31,7 +31,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "superscalar.hpp" #include "program.hpp" #include "reciprocal.h" -#include "virtual_memory.hpp" +#include "virtual_memory.h" namespace ARMV8A { @@ -130,8 +130,8 @@ void JitCompilerA64::generateProgram(Program& program, ProgramConfiguration& con // and w16, w10, ScratchpadL3Mask64 emit32(0x121A0000 | 16 | (10 << 5) | ((Log2(RANDOMX_SCRATCHPAD_L3) - 7) << 10), code, codePos); - // and w17, w18, ScratchpadL3Mask64 - emit32(0x121A0000 | 17 | (18 << 5) | ((Log2(RANDOMX_SCRATCHPAD_L3) - 7) << 10), code, codePos); + // and w17, w20, ScratchpadL3Mask64 + emit32(0x121A0000 | 17 | (20 << 5) | ((Log2(RANDOMX_SCRATCHPAD_L3) - 7) << 10), code, codePos); codePos = PrologueSize; literalPos = ImulRcpLiteralsEnd; @@ -149,16 +149,16 @@ void JitCompilerA64::generateProgram(Program& program, ProgramConfiguration& con } // Update spMix2 - // eor w18, config.readReg2, config.readReg3 - emit32(ARMV8A::EOR32 | 18 | (IntRegMap[config.readReg2] << 5) | (IntRegMap[config.readReg3] << 16), code, codePos); + // eor w20, config.readReg2, config.readReg3 + emit32(ARMV8A::EOR32 | 20 | (IntRegMap[config.readReg2] << 5) | (IntRegMap[config.readReg3] << 16), code, codePos); // Jump back to the main loop const uint32_t offset = (((uint8_t*)randomx_program_aarch64_vm_instructions_end) - ((uint8_t*)randomx_program_aarch64)) - codePos; emit32(ARMV8A::B | (offset / 4), code, codePos); - // and w18, w18, CacheLineAlignMask + // and w20, w20, CacheLineAlignMask codePos = (((uint8_t*)randomx_program_aarch64_cacheline_align_mask1) - ((uint8_t*)randomx_program_aarch64)); - emit32(0x121A0000 | 18 | (18 << 5) | ((Log2(RANDOMX_DATASET_BASE_SIZE) - 7) << 10), code, codePos); + emit32(0x121A0000 | 20 | (20 << 5) | ((Log2(RANDOMX_DATASET_BASE_SIZE) - 7) << 10), code, codePos); // and w10, w10, CacheLineAlignMask codePos = (((uint8_t*)randomx_program_aarch64_cacheline_align_mask2) - ((uint8_t*)randomx_program_aarch64)); @@ -181,8 +181,8 @@ void JitCompilerA64::generateProgramLight(Program& program, 
ProgramConfiguration // and w16, w10, ScratchpadL3Mask64 emit32(0x121A0000 | 16 | (10 << 5) | ((Log2(RANDOMX_SCRATCHPAD_L3) - 7) << 10), code, codePos); - // and w17, w18, ScratchpadL3Mask64 - emit32(0x121A0000 | 17 | (18 << 5) | ((Log2(RANDOMX_SCRATCHPAD_L3) - 7) << 10), code, codePos); + // and w17, w20, ScratchpadL3Mask64 + emit32(0x121A0000 | 17 | (20 << 5) | ((Log2(RANDOMX_SCRATCHPAD_L3) - 7) << 10), code, codePos); codePos = PrologueSize; literalPos = ImulRcpLiteralsEnd; @@ -200,8 +200,8 @@ void JitCompilerA64::generateProgramLight(Program& program, ProgramConfiguration } // Update spMix2 - // eor w18, config.readReg2, config.readReg3 - emit32(ARMV8A::EOR32 | 18 | (IntRegMap[config.readReg2] << 5) | (IntRegMap[config.readReg3] << 16), code, codePos); + // eor w20, config.readReg2, config.readReg3 + emit32(ARMV8A::EOR32 | 20 | (IntRegMap[config.readReg2] << 5) | (IntRegMap[config.readReg3] << 16), code, codePos); // Jump back to the main loop const uint32_t offset = (((uint8_t*)randomx_program_aarch64_vm_instructions_end_light) - ((uint8_t*)randomx_program_aarch64)) - codePos; @@ -434,7 +434,7 @@ void JitCompilerA64::emitAddImmediate(uint32_t dst, uint32_t src, uint32_t imm, } else { - constexpr uint32_t tmp_reg = 18; + constexpr uint32_t tmp_reg = 20; emitMovImmediate(tmp_reg, imm, code, k); // add dst, src, tmp_reg @@ -483,7 +483,7 @@ void JitCompilerA64::emitMemLoadFP(uint32_t src, Instruction& instr, uint8_t* co uint32_t k = codePos; uint32_t imm = instr.getImm32(); - constexpr uint32_t tmp_reg = 18; + constexpr uint32_t tmp_reg = 19; imm &= instr.getModMem() ? 
(RANDOMX_SCRATCHPAD_L1 - 1) : (RANDOMX_SCRATCHPAD_L2 - 1); emitAddImmediate(tmp_reg, src, imm, code, k); @@ -537,7 +537,7 @@ void JitCompilerA64::h_IADD_M(Instruction& instr, uint32_t& codePos) const uint32_t src = IntRegMap[instr.src]; const uint32_t dst = IntRegMap[instr.dst]; - constexpr uint32_t tmp_reg = 18; + constexpr uint32_t tmp_reg = 20; emitMemLoad(dst, src, instr, code, k); // add dst, dst, tmp_reg @@ -575,7 +575,7 @@ void JitCompilerA64::h_ISUB_M(Instruction& instr, uint32_t& codePos) const uint32_t src = IntRegMap[instr.src]; const uint32_t dst = IntRegMap[instr.dst]; - constexpr uint32_t tmp_reg = 18; + constexpr uint32_t tmp_reg = 20; emitMemLoad(dst, src, instr, code, k); // sub dst, dst, tmp_reg @@ -594,7 +594,7 @@ void JitCompilerA64::h_IMUL_R(Instruction& instr, uint32_t& codePos) if (src == dst) { - src = 18; + src = 20; emitMovImmediate(src, instr.getImm32(), code, k); } @@ -612,7 +612,7 @@ void JitCompilerA64::h_IMUL_M(Instruction& instr, uint32_t& codePos) const uint32_t src = IntRegMap[instr.src]; const uint32_t dst = IntRegMap[instr.dst]; - constexpr uint32_t tmp_reg = 18; + constexpr uint32_t tmp_reg = 20; emitMemLoad(dst, src, instr, code, k); // sub dst, dst, tmp_reg @@ -643,7 +643,7 @@ void JitCompilerA64::h_IMULH_M(Instruction& instr, uint32_t& codePos) const uint32_t src = IntRegMap[instr.src]; const uint32_t dst = IntRegMap[instr.dst]; - constexpr uint32_t tmp_reg = 18; + constexpr uint32_t tmp_reg = 20; emitMemLoad(dst, src, instr, code, k); // umulh dst, dst, tmp_reg @@ -674,7 +674,7 @@ void JitCompilerA64::h_ISMULH_M(Instruction& instr, uint32_t& codePos) const uint32_t src = IntRegMap[instr.src]; const uint32_t dst = IntRegMap[instr.dst]; - constexpr uint32_t tmp_reg = 18; + constexpr uint32_t tmp_reg = 20; emitMemLoad(dst, src, instr, code, k); // smulh dst, dst, tmp_reg @@ -686,34 +686,24 @@ void JitCompilerA64::h_ISMULH_M(Instruction& instr, uint32_t& codePos) void JitCompilerA64::h_IMUL_RCP(Instruction& instr, uint32_t& 
codePos) { - const uint64_t divisor = instr.getImm32(); + const uint32_t divisor = instr.getImm32(); if (isZeroOrPowerOf2(divisor)) return; uint32_t k = codePos; - constexpr uint32_t tmp_reg = 18; + constexpr uint32_t tmp_reg = 20; const uint32_t dst = IntRegMap[instr.dst]; - constexpr uint64_t N = 1ULL << 63; - const uint64_t q = N / divisor; - const uint64_t r = N % divisor; -#ifdef __GNUC__ - const uint64_t shift = 64 - __builtin_clzll(divisor); -#else - uint64_t shift = 32; - for (uint64_t k = 1U << 31; (k & divisor) == 0; k >>= 1) - --shift; -#endif - const uint32_t literal_id = (ImulRcpLiteralsEnd - literalPos) / sizeof(uint64_t); - literalPos -= sizeof(uint64_t); - *(uint64_t*)(code + literalPos) = (q << shift) + ((r << shift) / divisor); - if (literal_id < 13) + const uint64_t reciprocal = randomx_reciprocal_fast(divisor); + memcpy(code + literalPos, &reciprocal, sizeof(reciprocal)); + + if (literal_id < 12) { - static constexpr uint32_t literal_regs[13] = { 30 << 16, 29 << 16, 28 << 16, 27 << 16, 26 << 16, 25 << 16, 24 << 16, 23 << 16, 22 << 16, 21 << 16, 20 << 16, 11 << 16, 0 }; + static constexpr uint32_t literal_regs[12] = { 30 << 16, 29 << 16, 28 << 16, 27 << 16, 26 << 16, 25 << 16, 24 << 16, 23 << 16, 22 << 16, 21 << 16, 11 << 16, 0 }; // mul dst, dst, literal_reg emit32(ARMV8A::MUL | dst | (dst << 5) | literal_regs[literal_id], code, k); @@ -751,7 +741,7 @@ void JitCompilerA64::h_IXOR_R(Instruction& instr, uint32_t& codePos) if (src == dst) { - src = 18; + src = 20; emitMovImmediate(src, instr.getImm32(), code, k); } @@ -769,7 +759,7 @@ void JitCompilerA64::h_IXOR_M(Instruction& instr, uint32_t& codePos) const uint32_t src = IntRegMap[instr.src]; const uint32_t dst = IntRegMap[instr.dst]; - constexpr uint32_t tmp_reg = 18; + constexpr uint32_t tmp_reg = 20; emitMemLoad(dst, src, instr, code, k); // eor dst, dst, tmp_reg @@ -807,7 +797,7 @@ void JitCompilerA64::h_IROL_R(Instruction& instr, uint32_t& codePos) if (src != dst) { - constexpr uint32_t 
tmp_reg = 18; + constexpr uint32_t tmp_reg = 20; // sub tmp_reg, xzr, src emit32(ARMV8A::SUB | tmp_reg | (31 << 5) | (src << 16), code, k); @@ -835,7 +825,7 @@ void JitCompilerA64::h_ISWAP_R(Instruction& instr, uint32_t& codePos) uint32_t k = codePos; - constexpr uint32_t tmp_reg = 18; + constexpr uint32_t tmp_reg = 20; emit32(ARMV8A::MOV_REG | tmp_reg | (dst << 16), code, k); emit32(ARMV8A::MOV_REG | dst | (src << 16), code, k); emit32(ARMV8A::MOV_REG | src | (tmp_reg << 16), code, k); @@ -984,7 +974,7 @@ void JitCompilerA64::h_CFROUND(Instruction& instr, uint32_t& codePos) const uint32_t src = IntRegMap[instr.src]; - constexpr uint32_t tmp_reg = 18; + constexpr uint32_t tmp_reg = 20; constexpr uint32_t fpcr_tmp_reg = 8; // ror tmp_reg, src, imm @@ -1008,7 +998,7 @@ void JitCompilerA64::h_ISTORE(Instruction& instr, uint32_t& codePos) const uint32_t src = IntRegMap[instr.src]; const uint32_t dst = IntRegMap[instr.dst]; - constexpr uint32_t tmp_reg = 18; + constexpr uint32_t tmp_reg = 20; uint32_t imm = instr.getImm32(); diff --git a/src/RandomX/src/jit_compiler_a64.hpp b/src/RandomX/src/jit_compiler_a64.hpp index a02824ffb..f8484c083 100644 --- a/src/RandomX/src/jit_compiler_a64.hpp +++ b/src/RandomX/src/jit_compiler_a64.hpp @@ -81,7 +81,7 @@ namespace randomx { static void emit64(uint64_t val, uint8_t* code, uint32_t& codePos) { - *(uint64_t*)(code + codePos) = val; + memcpy(code + codePos, &val, sizeof(val)); codePos += sizeof(val); } diff --git a/src/RandomX/src/jit_compiler_a64_static.S b/src/RandomX/src/jit_compiler_a64_static.S index 4886fcf3c..bc146133a 100644 --- a/src/RandomX/src/jit_compiler_a64_static.S +++ b/src/RandomX/src/jit_compiler_a64_static.S @@ -74,9 +74,9 @@ # x15 -> "r7" # x16 -> spAddr0 # x17 -> spAddr1 -# x18 -> temporary +# x18 -> unused (platform register, don't touch it) # x19 -> temporary -# x20 -> literal for IMUL_RCP +# x20 -> temporary # x21 -> literal for IMUL_RCP # x22 -> literal for IMUL_RCP # x23 -> literal for IMUL_RCP @@ -111,7 
+111,7 @@ DECL(randomx_program_aarch64): # Save callee-saved registers sub sp, sp, 192 stp x16, x17, [sp] - stp x18, x19, [sp, 16] + str x19, [sp, 16] stp x20, x21, [sp, 32] stp x22, x23, [sp, 48] stp x24, x25, [sp, 64] @@ -166,7 +166,6 @@ DECL(randomx_program_aarch64): # Read literals ldr x0, literal_x0 ldr x11, literal_x11 - ldr x20, literal_x20 ldr x21, literal_x21 ldr x22, literal_x22 ldr x23, literal_x23 @@ -198,11 +197,11 @@ DECL(randomx_program_aarch64): DECL(randomx_program_aarch64_main_loop): # spAddr0 = spMix1 & ScratchpadL3Mask64; # spAddr1 = (spMix1 >> 32) & ScratchpadL3Mask64; - lsr x18, x10, 32 + lsr x20, x10, 32 # Actual mask will be inserted by JIT compiler and w16, w10, 1 - and w17, w18, 1 + and w17, w20, 1 # x16 = scratchpad + spAddr0 # x17 = scratchpad + spAddr1 @@ -210,31 +209,31 @@ DECL(randomx_program_aarch64_main_loop): add x17, x17, x2 # xor integer registers with scratchpad data (spAddr0) - ldp x18, x19, [x16] - eor x4, x4, x18 + ldp x20, x19, [x16] + eor x4, x4, x20 eor x5, x5, x19 - ldp x18, x19, [x16, 16] - eor x6, x6, x18 + ldp x20, x19, [x16, 16] + eor x6, x6, x20 eor x7, x7, x19 - ldp x18, x19, [x16, 32] - eor x12, x12, x18 + ldp x20, x19, [x16, 32] + eor x12, x12, x20 eor x13, x13, x19 - ldp x18, x19, [x16, 48] - eor x14, x14, x18 + ldp x20, x19, [x16, 48] + eor x14, x14, x20 eor x15, x15, x19 # Load group F registers (spAddr1) - ldpsw x18, x19, [x17] - ins v16.d[0], x18 + ldpsw x20, x19, [x17] + ins v16.d[0], x20 ins v16.d[1], x19 - ldpsw x18, x19, [x17, 8] - ins v17.d[0], x18 + ldpsw x20, x19, [x17, 8] + ins v17.d[0], x20 ins v17.d[1], x19 - ldpsw x18, x19, [x17, 16] - ins v18.d[0], x18 + ldpsw x20, x19, [x17, 16] + ins v18.d[0], x20 ins v18.d[1], x19 - ldpsw x18, x19, [x17, 24] - ins v19.d[0], x18 + ldpsw x20, x19, [x17, 24] + ins v19.d[0], x20 ins v19.d[1], x19 scvtf v16.2d, v16.2d scvtf v17.2d, v17.2d @@ -242,17 +241,17 @@ DECL(randomx_program_aarch64_main_loop): scvtf v19.2d, v19.2d # Load group E registers (spAddr1) - ldpsw 
x18, x19, [x17, 32] - ins v20.d[0], x18 + ldpsw x20, x19, [x17, 32] + ins v20.d[0], x20 ins v20.d[1], x19 - ldpsw x18, x19, [x17, 40] - ins v21.d[0], x18 + ldpsw x20, x19, [x17, 40] + ins v21.d[0], x20 ins v21.d[1], x19 - ldpsw x18, x19, [x17, 48] - ins v22.d[0], x18 + ldpsw x20, x19, [x17, 48] + ins v22.d[0], x20 ins v22.d[1], x19 - ldpsw x18, x19, [x17, 56] - ins v23.d[0], x18 + ldpsw x20, x19, [x17, 56] + ins v23.d[0], x20 ins v23.d[1], x19 scvtf v20.2d, v20.2d scvtf v21.2d, v21.2d @@ -276,7 +275,6 @@ DECL(randomx_program_aarch64_vm_instructions): literal_x0: .fill 1,8,0 literal_x11: .fill 1,8,0 -literal_x20: .fill 1,8,0 literal_x21: .fill 1,8,0 literal_x22: .fill 1,8,0 literal_x23: .fill 1,8,0 @@ -312,17 +310,17 @@ DECL(randomx_program_aarch64_vm_instructions_end): lsr x10, x9, 32 # mx ^= r[readReg2] ^ r[readReg3]; - eor x9, x9, x18 + eor x9, x9, x20 # Calculate dataset pointer for dataset prefetch - mov w18, w9 + mov w20, w9 DECL(randomx_program_aarch64_cacheline_align_mask1): # Actual mask will be inserted by JIT compiler - and x18, x18, 1 - add x18, x18, x1 + and x20, x20, 1 + add x20, x20, x1 # Prefetch dataset data - prfm pldl2strm, [x18] + prfm pldl2strm, [x20] # mx <-> ma ror x9, x9, 32 @@ -335,17 +333,17 @@ DECL(randomx_program_aarch64_cacheline_align_mask2): DECL(randomx_program_aarch64_xor_with_dataset_line): rx_program_xor_with_dataset_line: # xor integer registers with dataset data - ldp x18, x19, [x10] - eor x4, x4, x18 + ldp x20, x19, [x10] + eor x4, x4, x20 eor x5, x5, x19 - ldp x18, x19, [x10, 16] - eor x6, x6, x18 + ldp x20, x19, [x10, 16] + eor x6, x6, x20 eor x7, x7, x19 - ldp x18, x19, [x10, 32] - eor x12, x12, x18 + ldp x20, x19, [x10, 32] + eor x12, x12, x20 eor x13, x13, x19 - ldp x18, x19, [x10, 48] - eor x14, x14, x18 + ldp x20, x19, [x10, 48] + eor x14, x14, x20 eor x15, x15, x19 DECL(randomx_program_aarch64_update_spMix1): @@ -388,7 +386,7 @@ DECL(randomx_program_aarch64_update_spMix1): # Restore callee-saved registers ldp x16, x17, 
[sp] - ldp x18, x19, [sp, 16] + ldr x19, [sp, 16] ldp x20, x21, [sp, 32] ldp x22, x23, [sp, 48] ldp x24, x25, [sp, 64] @@ -409,7 +407,7 @@ DECL(randomx_program_aarch64_vm_instructions_end_light): stp x2, x30, [sp, 80] # mx ^= r[readReg2] ^ r[readReg3]; - eor x9, x9, x18 + eor x9, x9, x20 # mx <-> ma ror x9, x9, 32 @@ -451,8 +449,8 @@ DECL(randomx_program_aarch64_light_dataset_offset): # x3 -> end item DECL(randomx_init_dataset_aarch64): - # Save x30 (return address) - str x30, [sp, -16]! + # Save x20 (used as temporary, but must be saved to not break ABI) and x30 (return address) + stp x20, x30, [sp, -16]! # Load pointer to cache memory ldr x0, [x0] @@ -464,8 +462,8 @@ DECL(randomx_init_dataset_aarch64_main_loop): cmp x2, x3 bne DECL(randomx_init_dataset_aarch64_main_loop) - # Restore x30 (return address) - ldr x30, [sp], 16 + # Restore x20 and x30 + ldp x20, x30, [sp], 16 ret diff --git a/src/RandomX/src/jit_compiler_rv64.cpp b/src/RandomX/src/jit_compiler_rv64.cpp new file mode 100644 index 000000000..6f0842e5f --- /dev/null +++ b/src/RandomX/src/jit_compiler_rv64.cpp @@ -0,0 +1,1175 @@ +/* +Copyright (c) 2023 tevador + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the copyright holder nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. 
+ +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#include +#include +#include +#include +#include "jit_compiler_rv64.hpp" +#include "jit_compiler_rv64_static.hpp" +#include "superscalar.hpp" +#include "program.hpp" +#include "reciprocal.h" +#include "virtual_memory.h" + + +namespace { +#define HANDLER_ARGS randomx::CompilerState& state, randomx::Instruction isn, int i + using InstructionHandler = void(HANDLER_ARGS); + extern InstructionHandler* opcodeMap1[256]; +} + +namespace rv64 { + constexpr uint16_t C_LUI = 0x6001; + constexpr uint32_t LUI = 0x00000037; + constexpr uint16_t C_ADDI = 0x0001; + constexpr uint32_t ADDI = 0x00000013; + constexpr uint32_t ADDIW = 0x0000001b; + constexpr uint16_t C_ADD = 0x9002; + constexpr uint32_t ADD = 0x00000033; + constexpr uint32_t SHXADD = 0x20000033; //Zba + constexpr uint32_t SLL = 0x00001033; + constexpr uint32_t SRL = 0x00005033; + constexpr uint32_t SLLI = 0x00001013; + constexpr uint32_t C_SLLI = 0x0002; + constexpr uint32_t SRLI = 0x00005013; + constexpr uint32_t AND = 0x00007033; + constexpr uint32_t ANDI = 0x00007013; + constexpr uint16_t C_AND = 0x8c61; + constexpr uint16_t C_ANDI = 0x8801; + constexpr uint32_t OR = 0x00006033; + constexpr uint16_t C_OR = 0x8c41; + constexpr uint32_t XOR = 0x00004033; + 
constexpr uint16_t C_XOR = 0x8c21; + constexpr uint32_t LD = 0x00003003; + constexpr uint16_t C_LD = 0x6000; + constexpr uint16_t C_LW = 0x4000; + constexpr uint32_t SD = 0x00003023; + constexpr uint32_t SUB = 0x40000033; + constexpr uint16_t C_SUB = 0x8c01; + constexpr uint32_t MUL = 0x02000033; + constexpr uint32_t MULHU = 0x02003033; + constexpr uint32_t MULH = 0x02001033; + constexpr uint16_t C_MV = 0x8002; + constexpr uint32_t ROR = 0x60005033; //Zbb + constexpr uint32_t RORI = 0x60005013; //Zbb + constexpr uint32_t ROL = 0x60001033; //Zbb + constexpr uint32_t FMV_X_D = 0xe2000053; + constexpr uint32_t FMV_D_X = 0xf2000053; + constexpr uint32_t FMV_D = 0x22000053; + constexpr uint32_t FADD_D = 0x02007053; + constexpr uint32_t FSUB_D = 0x0a007053; + constexpr uint32_t FMUL_D = 0x12007053; + constexpr uint32_t FDIV_D = 0x1a007053; + constexpr uint32_t FSQRT_D = 0x5a007053; + constexpr uint32_t FCVT_D_W = 0xd2000053; + constexpr uint32_t FSRM = 0x00201073; + constexpr uint16_t C_BEQZ = 0xc001; + constexpr uint32_t BEQ = 0x00000063; + constexpr uint16_t C_BNEZ = 0xe001; + constexpr uint32_t JAL = 0x0000006f; + constexpr uint16_t C_RET = 0x8082; +} + +namespace randomx { + + constexpr size_t MaxRandomXInstrCodeSize = 56; //FDIV_M requires 56 bytes of rv64 code + constexpr size_t MaxSuperscalarInstrSize = 12; //IXOR_C requires 12 bytes of rv64 code + constexpr size_t SuperscalarProgramHeader = 136; //overhead per superscalar program + constexpr size_t CodeAlign = 4096; //align code size to a multiple of 4 KiB + constexpr size_t LiteralPoolSize = CodeAlign; + constexpr size_t SuperscalarLiteraPoolSize = RANDOMX_CACHE_ACCESSES * CodeAlign; + constexpr size_t ReserveCodeSize = CodeAlign; //prologue, epilogue + reserve + + constexpr size_t RandomXCodeSize = alignSize(LiteralPoolSize + ReserveCodeSize + MaxRandomXInstrCodeSize * RANDOMX_PROGRAM_SIZE, CodeAlign); + constexpr size_t SuperscalarSize = alignSize(SuperscalarLiteraPoolSize + ReserveCodeSize + 
(SuperscalarProgramHeader + MaxSuperscalarInstrSize * SuperscalarMaxSize) * RANDOMX_CACHE_ACCESSES, CodeAlign); + + static_assert(RandomXCodeSize < INT32_MAX / 2, "RandomXCodeSize is too large"); + static_assert(SuperscalarSize < INT32_MAX / 2, "SuperscalarSize is too large"); + + constexpr uint32_t CodeSize = RandomXCodeSize + SuperscalarSize; + constexpr uint32_t ExecutableSize = CodeSize - LiteralPoolSize; + + constexpr int32_t LiteralPoolOffset = LiteralPoolSize / 2; + constexpr int32_t SuperScalarLiteralPoolOffset = RandomXCodeSize; + constexpr int32_t SuperScalarLiteralPoolRefOffset = RandomXCodeSize + (RANDOMX_CACHE_ACCESSES - 1) * LiteralPoolSize + LiteralPoolOffset; + constexpr int32_t SuperScalarHashOffset = SuperScalarLiteralPoolOffset + SuperscalarLiteraPoolSize; + + constexpr int maskLog2(uint32_t x, int prev) { + return x == 1 ? prev : maskLog2(x >> 1, prev + 1); + } + + constexpr int32_t unsigned32ToSigned2sCompl(uint32_t x) { + return (-1 == ~0) ? (int32_t)x : (x > INT32_MAX ? 
(-(int32_t)(UINT32_MAX - x) - 1) : (int32_t)x); + } + + constexpr int MaskL1Shift = 32 - maskLog2(RANDOMX_SCRATCHPAD_L1, 0); + constexpr int MaskL2Shift = 32 - maskLog2(RANDOMX_SCRATCHPAD_L2, 0); + constexpr int MaskL3Shift = 32 - maskLog2(RANDOMX_SCRATCHPAD_L3, 0); + + constexpr int RcpLiteralsOffset = 144; + + constexpr int LiteralPoolReg = 3; //x3 + constexpr int SpadReg = 5; //x5 + constexpr int DataReg = 6; //x6 + constexpr int SuperscalarReg = 7; //x7 + constexpr int SshTmp1Reg = 28; //x28 + constexpr int SshTmp2Reg = 29; //x29 + constexpr int SshPoolReg = 30; //x30 + constexpr int SshRcpReg = 31; //x31 + constexpr int Tmp1Reg = 8; //x8 + constexpr int Tmp2Reg = 9; //x9 + constexpr int Tmp1RegF = 24; //f24 + constexpr int Tmp2RegF = 25; //f25 + constexpr int MaskL1Reg = 10; //x10 + constexpr int MaskL2Reg = 11; //x11 + constexpr int MaskFscalReg = 12; //x12 + constexpr int MaskEclear = 13; //x13 + constexpr int MaskEsetLo = 14; //x14 + constexpr int MaskEsetHi = 15; //x15 + constexpr int MaskL3Reg = 1; //x1 + constexpr int ReturnReg = 1; //x1 + constexpr int SpAddr0Reg = 26; //x26 + constexpr int OffsetXC = -8; //x8-x15 + constexpr int OffsetR = 16; //x16-x23 + constexpr int OffsetF = 0; //f0-f7 + constexpr int OffsetE = 8; //f8-f15 + constexpr int OffsetA = 16; //f16-f23 + constexpr int OffsetRcp = 28; //x28-x31 + constexpr int OffsetRcpF = 22; //f26-f31 + constexpr int OffsetSsh = 8; //x8-x15 + + //destination register (bit 7+) + constexpr int rvrd(int reg) { + return reg << 7; + } + + //first source register (bit 15+) + constexpr int rvrs1(int reg) { + return reg << 15; + } + + //second source register (bit 20+) + constexpr int rvrs2(int reg) { + return reg << 20; + } + + //compressed source register (bit 2+) + constexpr int rvcrs(int reg) { + return reg << 2; + } + + //base instruction: {op} x{rd}, x{rs1}, x{rs2} + constexpr uint32_t rvi(uint32_t op, int rd, int rs1, int rs2 = 0) { + return op | rvrs2(rs2) | rvrs1(rs1) | rvrd(rd); + } + + //compressed 
instruction: op x{rd}, x{rs} + constexpr uint16_t rvc(uint16_t op, int rd, int rs) { + return op | rvrd(rd) | rvcrs(rs); + } + + //compressed instruction: op x{rd}, imm6 + constexpr uint16_t rvc(uint16_t op, int imm5, int rd, int imm40) { + return op | (imm5 << 12) | rvrd(rd) | (imm40 << 2); + } + + constexpr int regR(int reg) { + return reg + OffsetR; + } + + constexpr int regLoA(int reg) { + return 2 * reg + OffsetA; + } + + constexpr int regHiA(int reg) { + return 2 * reg + OffsetA + 1; + } + + constexpr int regLoF(int reg) { + return 2 * reg + OffsetF; + } + + constexpr int regHiF(int reg) { + return 2 * reg + OffsetF + 1; + } + + constexpr int regLoE(int reg) { + return 2 * reg + OffsetE; + } + + constexpr int regHiE(int reg) { + return 2 * reg + OffsetE + 1; + } + + constexpr int regRcp(int reg) { + return reg + OffsetRcp; + } + + constexpr int regRcpF(int reg) { + return reg + OffsetRcpF; + } + + constexpr int regSS(int reg) { + return reg + OffsetSsh; + } + + static const uint8_t* codeLiterals = (uint8_t*)&randomx_riscv64_literals; + static const uint8_t* codeLiteralsEnd = (uint8_t*)&randomx_riscv64_literals_end; + static const uint8_t* codeDataInit = (uint8_t*)&randomx_riscv64_data_init; + static const uint8_t* codeFixDataCall = (uint8_t*)&randomx_riscv64_fix_data_call; + static const uint8_t* codePrologue = (uint8_t*)&randomx_riscv64_prologue; + static const uint8_t* codeLoopBegin = (uint8_t*)&randomx_riscv64_loop_begin; + static const uint8_t* codeDataRead = (uint8_t*)&randomx_riscv64_data_read; + static const uint8_t* codeDataReadLight = (uint8_t*)&randomx_riscv64_data_read_light; + static const uint8_t* codeFixLoopCall = (uint8_t*)&randomx_riscv64_fix_loop_call; + static const uint8_t* codeSpadStore = (uint8_t*)&randomx_riscv64_spad_store; + static const uint8_t* codeSpadStoreHardAes = (uint8_t*)&randomx_riscv64_spad_store_hardaes; + static const uint8_t* codeSpadStoreSoftAes = (uint8_t*)&randomx_riscv64_spad_store_softaes; + static const uint8_t* 
codeLoopEnd = (uint8_t*)&randomx_riscv64_loop_end; + static const uint8_t* codeFixContinueLoop = (uint8_t*)&randomx_riscv64_fix_continue_loop; + static const uint8_t* codeEpilogue = (uint8_t*)&randomx_riscv64_epilogue; + static const uint8_t* codeSoftAes = (uint8_t*)&randomx_riscv64_softaes; + static const uint8_t* codeProgramEnd = (uint8_t*)&randomx_riscv64_program_end; + static const uint8_t* codeSshInit = (uint8_t*)&randomx_riscv64_ssh_init; + static const uint8_t* codeSshLoad = (uint8_t*)&randomx_riscv64_ssh_load; + static const uint8_t* codeSshPrefetch = (uint8_t*)&randomx_riscv64_ssh_prefetch; + static const uint8_t* codeSshEnd = (uint8_t*)&randomx_riscv64_ssh_end; + + static const int32_t sizeLiterals = codeLiteralsEnd - codeLiterals; + static const int32_t sizeDataInit = codePrologue - codeDataInit; + static const int32_t sizePrologue = codeLoopBegin - codePrologue; + static const int32_t sizeLoopBegin = codeDataRead - codeLoopBegin; + static const int32_t sizeDataRead = codeDataReadLight - codeDataRead; + static const int32_t sizeDataReadLight = codeSpadStore - codeDataReadLight; + static const int32_t sizeSpadStore = codeSpadStoreHardAes - codeSpadStore; + static const int32_t sizeSpadStoreSoftAes = codeLoopEnd - codeSpadStoreSoftAes; + static const int32_t sizeLoopEnd = codeEpilogue - codeLoopEnd; + static const int32_t sizeEpilogue = codeSoftAes - codeEpilogue; + static const int32_t sizeSoftAes = codeProgramEnd - codeSoftAes; + static const int32_t sizeSshInit = codeSshLoad - codeSshInit; + static const int32_t sizeSshLoad = codeSshPrefetch - codeSshLoad; + static const int32_t sizeSshPrefetch = codeSshEnd - codeSshPrefetch; + + static const int32_t offsetFixDataCall = codeFixDataCall - codeDataInit; + static const int32_t offsetFixLoopCall = codeFixLoopCall - codeDataReadLight; + static const int32_t offsetFixContinueLoop = codeFixContinueLoop - codeLoopEnd; + + static const int32_t LoopTopPos = LiteralPoolSize + sizeDataInit + sizePrologue; + static 
const int32_t RandomXCodePos = LoopTopPos + sizeLoopBegin; + + static void clearCache(CodeBuffer& buf) { +#ifdef __GNUC__ + __builtin___clear_cache((char*)buf.code, (char*)(buf.code + CodeSize)); +#endif + } + + //emits code to calculate: x{dst} = x{src} + {imm32} + //takes 1-3 isns, 2-10 bytes + static void emitImm32(CodeBuffer& buf, int32_t imm, int dst, int src = 0, int tmp = 0) { + + //lower 12 bits + int32_t limm = (imm << 20) >> 20; + //upper 20 bits + int32_t uimm = (imm >> 12) + (limm < 0); + + //If there are no upper bits, the whole thing + //can be done with a single instruction. + if (uimm == 0) { + //addi x{dst}, x{src}, {limm} + buf.emit(rvi(rv64::ADDI, dst, src, limm)); + return; + } + + //dst1 is the register where imm will be materialized + int dst1 = src != dst ? dst : tmp; + assert(dst1 != 0); + //src1 is the register that will be added to the result + int src1 = src != dst ? src : dst1; + + //load upper bits + if (uimm >= -32 && uimm <= 31) { + //c.lui x{dst1}, {uimm} + buf.emit(rvc(rv64::C_LUI, (uimm < 0), dst1, (uimm & 31))); + } + else { + //lui x{dst1}, {uimm} + buf.emit(rv64::LUI | (uimm << 12) | rvrd(dst1)); + } + //load lower bits + if (limm != 0) { + //Note: this must be addiw NOT addi, otherwise the upper 32 bits + //of the 64-bit register will be incorrect. 
+ //addiw x{dst1}, x{dst1}, {limm} + buf.emit(rvi(rv64::ADDIW, dst1, dst1, limm)); + } + //add src + if (src1 != 0) { + //c.add x{dst}, x{src1} + buf.emit(rvc(rv64::C_ADD, dst, src1)); + } + } + + //x9 = &Scratchpad[isn.imm] + //takes 3 isns, 10 bytes + static void genAddressRegImm(CodeBuffer& buf, const Instruction& isn) { + //signed offset 8-byte aligned + int32_t imm = unsigned32ToSigned2sCompl(isn.getImm32()) & ScratchpadL3Mask; + //x9 = x5 + {imm} + emitImm32(buf, imm, Tmp2Reg, SpadReg, Tmp1Reg); + } + + //x9 = &Scratchpad[isn.src + isn.imm] (for reading) + //takes 5 isns, 12 bytes + static void genAddressReg(CodeBuffer& buf, const Instruction& isn) { + int shift, maskReg; + if (isn.getModMem()) { + shift = MaskL1Shift; + maskReg = MaskL1Reg; + } + else { + shift = MaskL2Shift; + maskReg = MaskL2Reg; + } + int32_t imm = unsigned32ToSigned2sCompl(isn.getImm32()); + imm = (imm << shift) >> shift; + //x9 = x{src} + {imm} + emitImm32(buf, imm, Tmp2Reg, regR(isn.src), Tmp1Reg); + //c.and x9, x{maskReg} + buf.emit(rvc(rv64::C_AND, (Tmp2Reg + OffsetXC), (maskReg + OffsetXC))); + //c.add x9, x{spadReg} + buf.emit(rvc(rv64::C_ADD, Tmp2Reg, SpadReg)); + } + + //x8 = Scratchpad[isn] + static void loadFromScratchpad(CodeBuffer& buf, const Instruction& isn) { + if (isn.src != isn.dst) { + //x9 = &Scratchpad[isn.src + isn.imm] + genAddressReg(buf, isn); + } + else { + ///x9 = &Scratchpad[isn.imm] + genAddressRegImm(buf, isn); + } + //c.ld x8, 0(x9) + buf.emit(rvc(rv64::C_LD, Tmp2Reg + OffsetXC, Tmp1Reg + OffsetXC)); + } + + //x9 = &Scratchpad[isn.dst + isn.imm32] (for writing) + //takes 5 isns, 12-16 bytes + static void genAddressRegDst(CodeBuffer& buf, const Instruction& isn) { + if (isn.getModCond() < StoreL3Condition) { + int shift, maskReg; + if (isn.getModMem()) { + shift = MaskL1Shift; + maskReg = MaskL1Reg; + } + else { + shift = MaskL2Shift; + maskReg = MaskL2Reg; + } + int32_t imm = unsigned32ToSigned2sCompl(isn.getImm32()); + imm = (imm << shift) >> shift; + //x9 
= x{dst} + {imm} + emitImm32(buf, imm, Tmp2Reg, regR(isn.dst), Tmp1Reg); + //c.and x9, x{maskReg} + buf.emit(rvc(rv64::C_AND, Tmp2Reg + OffsetXC, maskReg + OffsetXC)); + //c.add x9, x5 + buf.emit(rvc(rv64::C_ADD, Tmp2Reg, SpadReg)); + } + else { + int shift = MaskL3Shift; + int32_t imm = unsigned32ToSigned2sCompl(isn.getImm32()); + imm = (imm << shift) >> shift; + //x9 = x{dst} + {imm} + emitImm32(buf, imm, Tmp2Reg, regR(isn.dst), Tmp1Reg); + //and x9, x9, x1 + buf.emit(rvi(rv64::AND, Tmp2Reg, Tmp2Reg, MaskL3Reg)); + //c.add x9, x5 + buf.emit(rvc(rv64::C_ADD, Tmp2Reg, SpadReg)); + } + } + + static void emitRcpLiteral1(CodeBuffer& buf, uint64_t literal) { + //first 238 at positive offsets + if (buf.rcpCount < 238) { + buf.emitAt(LiteralPoolOffset + RcpLiteralsOffset + buf.rcpCount * 8, literal); + buf.rcpCount++; + } + //next 256 at negative offsets + else if (buf.rcpCount < 494) { + buf.emitAt(buf.rcpCount * 8 - (2048 - RcpLiteralsOffset), literal); + buf.rcpCount++; + } + else { + //checked at compile time, but double-check here + throw std::runtime_error("Literal pool overflow"); + } + } + + static void emitRcpLiteral2(CodeBuffer& buf, uint64_t literal, int32_t numLiterals) { + //store the current literal in the pool + int32_t offset = 2040 - buf.rcpCount * 8; + buf.emitAt(SuperScalarLiteralPoolRefOffset + offset, literal); + buf.rcpCount++; + if (buf.rcpCount >= numLiterals) { + return; + } + //load the next literal + offset -= 8; + int32_t imm = offset & 0xfff; + //ld x31, {offset}(x30) + buf.emit(rvi(rv64::LD, SshRcpReg, SshPoolReg, imm)); + if (imm == 0x800) { + //move pool pointer back 4KB + //c.lui x29, 0xfffff + buf.emit(rvc(rv64::C_LUI, 1, SshTmp2Reg, 31)); + //c.add x30, x29 + buf.emit(rvc(rv64::C_ADD, SshPoolReg, SshTmp2Reg)); + } + } + + static void emitJump(CodeBuffer& buf, int dst, int32_t codePos, int32_t targetPos) { + int32_t imm = targetPos - codePos; + int32_t imm20 = (imm < 0) << 11; + int32_t imm1912 = (imm >> 7) & 8160; + int32_t imm11 = (imm 
>> 11) & 1; + int32_t imm101 = imm & 2046; + //jal x{dst}, {imm} + buf.emitAt(codePos, rvi(rv64::JAL, dst + imm1912, 0, imm20 + imm101 + imm11)); + } + + static void emitInstruction(CompilerState& state, Instruction isn, int i) { + state.instructionOffsets[i] = state.codePos; + opcodeMap1[isn.opcode](state, isn, i); + } + + static void emitProgramPrefix(CompilerState& state, Program& prog, ProgramConfiguration& pcfg) { + state.codePos = RandomXCodePos; + state.rcpCount = 0; + state.emitAt(LiteralPoolOffset + sizeLiterals, pcfg.eMask[0]); + state.emitAt(LiteralPoolOffset + sizeLiterals + 8, pcfg.eMask[1]); + for (unsigned i = 0; i < RegistersCount; ++i) { + state.registerUsage[i] = -1; + } + for (unsigned i = 0; i < prog.getSize(); ++i) { + Instruction instr = prog(i); + instr.src %= RegistersCount; + instr.dst %= RegistersCount; + emitInstruction(state, instr, i); + } + } + + static void emitProgramSuffix(CompilerState& state, ProgramConfiguration& pcfg) { + state.emit(codeSpadStore, sizeSpadStore); + int32_t fixPos = state.codePos; + state.emit(codeLoopEnd, sizeLoopEnd); + //xor x26, x{readReg0}, x{readReg1} + state.emitAt(fixPos, rvi(rv64::XOR, SpAddr0Reg, regR(pcfg.readReg0), regR(pcfg.readReg1))); + fixPos += offsetFixContinueLoop; + //j LoopTop + emitJump(state, 0, fixPos, LoopTopPos); + state.emit(codeEpilogue, sizeEpilogue); + } + + static void generateSuperscalarCode(CodeBuffer& buf, Instruction isn, const std::vector& reciprocalCache) { + switch ((SuperscalarInstructionType)isn.opcode) + { + case randomx::SuperscalarInstructionType::ISUB_R: + //c.sub x{dst}, x{src} + buf.emit(rvc(rv64::C_SUB, regSS(isn.dst) + OffsetXC, regSS(isn.src) + OffsetXC)); + break; + case randomx::SuperscalarInstructionType::IXOR_R: + //c.xor x{dst}, x{src} + buf.emit(rvc(rv64::C_XOR, regSS(isn.dst) + OffsetXC, regSS(isn.src) + OffsetXC)); + break; + case randomx::SuperscalarInstructionType::IADD_RS: + { + int shift = isn.getModShift(); + if (shift == 0) { + //c.add x{dst}, x{src} 
+ buf.emit(rvc(rv64::C_ADD, regSS(isn.dst), regSS(isn.src))); + } + else { +#ifdef __riscv_zba + //sh{1,2,3}add x{dst}, x{src}, x{dst} + buf.emit(rv64::SHXADD | rvrs2(regSS(isn.dst)) | rvrs1(regSS(isn.src)) | (shift << 13) | rvrd(regSS(isn.dst))); +#else + //slli x28, x{src}, {shift} + buf.emit(rvi(rv64::SLLI, SshTmp1Reg, regSS(isn.src), shift)); + //c.add x{dst}, x28 + buf.emit(rvc(rv64::C_ADD, regSS(isn.dst), SshTmp1Reg)); +#endif + } + } + break; + case randomx::SuperscalarInstructionType::IMUL_R: + //mul x{dst}, x{dst}, x{src} + buf.emit(rvi(rv64::MUL, regSS(isn.dst), regSS(isn.dst), regSS(isn.src))); + break; + case randomx::SuperscalarInstructionType::IROR_C: + { +#ifdef __riscv_zbb + int32_t imm = isn.getImm32() & 63; + //rori x{dst}, x{dst}, {imm} + buf.emit(rvi(rv64::RORI, regSS(isn.dst), regSS(isn.dst), imm)); +#else + int32_t immr = isn.getImm32() & 63; + int32_t imml = -immr & 63; + int32_t imml5 = imml >> 5; + int32_t imml40 = imml & 31; + //srli x28, x{dst}, {immr} + buf.emit(rvi(rv64::SRLI, SshTmp1Reg, regSS(isn.dst), immr)); + //c.slli x{dst}, {imml} + buf.emit(rvc(rv64::C_SLLI, imml5, regSS(isn.dst), imml40)); + //or x{dst}, x{dst}, x28 + buf.emit(rvi(rv64::OR, regSS(isn.dst), regSS(isn.dst), SshTmp1Reg)); +#endif + } + break; + case randomx::SuperscalarInstructionType::IADD_C7: + case randomx::SuperscalarInstructionType::IADD_C8: + case randomx::SuperscalarInstructionType::IADD_C9: + { + int32_t imm = unsigned32ToSigned2sCompl(isn.getImm32()); + //x{dst} = x{dst} + {imm} + emitImm32(buf, imm, regSS(isn.dst), regSS(isn.dst), SshTmp1Reg); + } + break; + case randomx::SuperscalarInstructionType::IXOR_C7: + case randomx::SuperscalarInstructionType::IXOR_C8: + case randomx::SuperscalarInstructionType::IXOR_C9: + { + int32_t imm = unsigned32ToSigned2sCompl(isn.getImm32()); + //x28 = {imm} + emitImm32(buf, imm, SshTmp1Reg); + //xor x{dst}, x{dst}, x28 + buf.emit(rvi(rv64::XOR, regSS(isn.dst), regSS(isn.dst), SshTmp1Reg)); + } + break; + case 
randomx::SuperscalarInstructionType::IMULH_R: + //mulhu x{dst}, x{dst}, x{src} + buf.emit(rvi(rv64::MULHU, regSS(isn.dst), regSS(isn.dst), regSS(isn.src))); + break; + case randomx::SuperscalarInstructionType::ISMULH_R: + //mulh x{dst}, x{dst}, x{src} + buf.emit(rvi(rv64::MULH, regSS(isn.dst), regSS(isn.dst), regSS(isn.src))); + break; + case randomx::SuperscalarInstructionType::IMUL_RCP: + //mul x{dst}, x{dst}, x31 + buf.emit(rvi(rv64::MUL, regSS(isn.dst), regSS(isn.dst), SshRcpReg)); + //load the next literal into x31 + emitRcpLiteral2(buf, reciprocalCache[isn.getImm32()], reciprocalCache.size()); + break; + default: + UNREACHABLE; + } + } + + size_t JitCompilerRV64::getCodeSize() { + return CodeSize; + } + + JitCompilerRV64::JitCompilerRV64() { + state.code = (uint8_t*)allocMemoryPages(CodeSize); + if (state.code == nullptr) + throw std::runtime_error("allocMemoryPages"); + state.emitAt(LiteralPoolOffset, codeLiterals, sizeLiterals); + state.emitAt(LiteralPoolSize, codeDataInit, sizeDataInit + sizePrologue + sizeLoopBegin); + entryDataInit = state.code + LiteralPoolSize; + entryProgram = state.code + LiteralPoolSize + sizeDataInit; + //jal x1, SuperscalarHash + emitJump(state, ReturnReg, LiteralPoolSize + offsetFixDataCall, SuperScalarHashOffset); + } + + JitCompilerRV64::~JitCompilerRV64() { + freePagedMemory(state.code, CodeSize); + } + + void JitCompilerRV64::enableAll() { + setPagesRWX(entryDataInit, ExecutableSize); + } + + void JitCompilerRV64::enableWriting() { + setPagesRW(entryDataInit, ExecutableSize); + } + + void JitCompilerRV64::enableExecution() { + setPagesRX(entryDataInit, ExecutableSize); + } + + void JitCompilerRV64::generateProgram(Program& prog, ProgramConfiguration& pcfg) { + emitProgramPrefix(state, prog, pcfg); + int32_t fixPos = state.codePos; + state.emit(codeDataRead, sizeDataRead); + //xor x8, x{readReg2}, x{readReg3} + state.emitAt(fixPos, rvi(rv64::XOR, Tmp1Reg, regR(pcfg.readReg2), regR(pcfg.readReg3))); + emitProgramSuffix(state, 
pcfg); + clearCache(state); + } + + void JitCompilerRV64::generateProgramLight(Program& prog, ProgramConfiguration& pcfg, uint32_t datasetOffset) { + emitProgramPrefix(state, prog, pcfg); + int32_t fixPos = state.codePos; + state.emit(codeDataReadLight, sizeDataReadLight); + //xor x8, x{readReg2}, x{readReg3} + state.emitAt(fixPos, rvi(rv64::XOR, Tmp1Reg, regR(pcfg.readReg2), regR(pcfg.readReg3))); + int32_t imm = datasetOffset / CacheLineSize; + int32_t limm = (imm << 20) >> 20; + int32_t uimm = (imm >> 12) + (limm < 0); + //lui x9, {uimm} + state.emitAt(fixPos + 4, rv64::LUI | (uimm << 12) | rvrd(Tmp2Reg)); + //addi x9, x9, {limm} + state.emitAt(fixPos + 8, rvi(rv64::ADDI, Tmp2Reg, Tmp2Reg, limm)); + fixPos += offsetFixLoopCall; + //jal x1, SuperscalarHash + emitJump(state, ReturnReg, fixPos, SuperScalarHashOffset); + emitProgramSuffix(state, pcfg); + clearCache(state); + } + + void JitCompilerRV64::generateSuperscalarHash(SuperscalarProgram programs[RANDOMX_CACHE_ACCESSES], std::vector& reciprocalCache) { + state.codePos = SuperScalarHashOffset; + state.rcpCount = 0; + state.emit(codeSshInit, sizeSshInit); + for (unsigned j = 0; j < RANDOMX_CACHE_ACCESSES; ++j) { + SuperscalarProgram& prog = programs[j]; + for (unsigned i = 0; i < prog.getSize(); ++i) { + Instruction instr = prog(i); + generateSuperscalarCode(state, instr, reciprocalCache); + } + state.emit(codeSshLoad, sizeSshLoad); + if (j < RANDOMX_CACHE_ACCESSES - 1) { + int32_t fixPos = state.codePos; + state.emit(codeSshPrefetch, sizeSshPrefetch); + //and x7, x{addrReg}, x7 + state.emitAt(fixPos, rvi(rv64::AND, SuperscalarReg, regSS(prog.getAddressRegister()), SuperscalarReg)); + } + } + state.emit(rvc(rv64::C_RET, 0, 0)); + clearCache(state); + } + + static void v1_IADD_RS(HANDLER_ARGS) { + state.registerUsage[isn.dst] = i; + int shift = isn.getModShift(); + if (shift == 0) { + //c.add x{dst}, x{src} + state.emit(rvc(rv64::C_ADD, regR(isn.dst), regR(isn.src))); + } + else { +#ifdef __riscv_zba + 
//sh{1,2,3}add x{dst}, x{src}, x{dst} + state.emit(rv64::SHXADD | rvrs2(regR(isn.dst)) | rvrs1(regR(isn.src)) | (shift << 13) | rvrd(regR(isn.dst))); +#else + //slli x8, x{src}, {shift} + state.emit(rvi(rv64::SLLI, Tmp1Reg, regR(isn.src), shift)); + //c.add x{dst}, x8 + state.emit(rvc(rv64::C_ADD, regR(isn.dst), Tmp1Reg)); +#endif + } + if (isn.dst == RegisterNeedsDisplacement) { + int32_t imm = unsigned32ToSigned2sCompl(isn.getImm32()); + //x{dst} = x{dst} + {imm} + emitImm32(state, imm, regR(isn.dst), regR(isn.dst), Tmp1Reg); + } + } + + static void v1_IADD_M(HANDLER_ARGS) { + state.registerUsage[isn.dst] = i; + loadFromScratchpad(state, isn); + //c.add x{dst}, x8 + state.emit(rvc(rv64::C_ADD, regR(isn.dst), Tmp1Reg)); + } + + static void v1_ISUB_R(HANDLER_ARGS) { + state.registerUsage[isn.dst] = i; + if (isn.src != isn.dst) { + //sub x{dst}, x{dst}, x{src} + state.emit(rvi(rv64::SUB, regR(isn.dst), regR(isn.dst), regR(isn.src))); + } + else { + int32_t imm = unsigned32ToSigned2sCompl(-isn.getImm32()); //convert to add + //x{dst} = x{dst} + {-imm} + emitImm32(state, imm, regR(isn.dst), regR(isn.dst), Tmp1Reg); + } + } + + static void v1_ISUB_M(HANDLER_ARGS) { + state.registerUsage[isn.dst] = i; + loadFromScratchpad(state, isn); + //sub x{dst}, x{dst}, x8 + state.emit(rvi(rv64::SUB, regR(isn.dst), regR(isn.dst), Tmp1Reg)); + } + + static void v1_IMUL_R(HANDLER_ARGS) { + state.registerUsage[isn.dst] = i; + if (isn.src != isn.dst) { + //mul x{dst}, x{dst}, x{src} + state.emit(rvi(rv64::MUL, regR(isn.dst), regR(isn.dst), regR(isn.src))); + } + else { + int32_t imm = unsigned32ToSigned2sCompl(isn.getImm32()); + //x8 = {imm} + emitImm32(state, imm, Tmp1Reg); + //mul x{dst}, x{dst}, x8 + state.emit(rvi(rv64::MUL, regR(isn.dst), regR(isn.dst), Tmp1Reg)); + } + } + + static void v1_IMUL_M(HANDLER_ARGS) { + state.registerUsage[isn.dst] = i; + loadFromScratchpad(state, isn); + //mul x{dst}, x{dst}, x8 + state.emit(rvi(rv64::MUL, regR(isn.dst), regR(isn.dst), Tmp1Reg)); + } 
+ + static void v1_IMULH_R(HANDLER_ARGS) { + state.registerUsage[isn.dst] = i; + //mulhu x{dst}, x{dst}, x{src} + state.emit(rvi(rv64::MULHU, regR(isn.dst), regR(isn.dst), regR(isn.src))); + } + + static void v1_IMULH_M(HANDLER_ARGS) { + state.registerUsage[isn.dst] = i; + loadFromScratchpad(state, isn); + //mulhu x{dst}, x{dst}, x8 + state.emit(rvi(rv64::MULHU, regR(isn.dst), regR(isn.dst), Tmp1Reg)); + } + + static void v1_ISMULH_R(HANDLER_ARGS) { + state.registerUsage[isn.dst] = i; + //mulh x{dst}, x{dst}, x{src} + state.emit(rvi(rv64::MULH, regR(isn.dst), regR(isn.dst), regR(isn.src))); + } + + static void v1_ISMULH_M(HANDLER_ARGS) { + state.registerUsage[isn.dst] = i; + loadFromScratchpad(state, isn); + //mulh x{dst}, x{dst}, x8 + state.emit(rvi(rv64::MULH, regR(isn.dst), regR(isn.dst), Tmp1Reg)); + } + + static void v1_IMUL_RCP(HANDLER_ARGS) { + const uint32_t divisor = isn.getImm32(); + if (!isZeroOrPowerOf2(divisor)) { + state.registerUsage[isn.dst] = i; + if (state.rcpCount < 4) { + //mul x{dst}, x{dst}, x{rcp} + state.emit(rvi(rv64::MUL, regR(isn.dst), regR(isn.dst), regRcp(state.rcpCount))); + } + else if (state.rcpCount < 10) { + //fmv.x.d x8, f{rcp} + state.emit(rvi(rv64::FMV_X_D, Tmp1Reg, regRcpF(state.rcpCount))); + //mul x{dst}, x{dst}, x8 + state.emit(rvi(rv64::MUL, regR(isn.dst), regR(isn.dst), Tmp1Reg)); + } + else { + int32_t offset = RcpLiteralsOffset + state.rcpCount * 8; + //ld x8, {offset}(x3) + state.emit(rvi(rv64::LD, Tmp1Reg, LiteralPoolReg, offset)); + //mul x{dst}, x{dst}, x8 + state.emit(rvi(rv64::MUL, regR(isn.dst), regR(isn.dst), Tmp1Reg)); + } + emitRcpLiteral1(state, randomx_reciprocal_fast(divisor)); + } + } + + static void v1_INEG_R(HANDLER_ARGS) { + state.registerUsage[isn.dst] = i; + //sub x{dst}, x0, x{dst} + state.emit(rvi(rv64::SUB, regR(isn.dst), 0, regR(isn.dst))); + } + + static void v1_IXOR_R(HANDLER_ARGS) { + state.registerUsage[isn.dst] = i; + if (isn.src != isn.dst) { + //xor x{dst}, x{dst}, x{src} + 
state.emit(rvi(rv64::XOR, regR(isn.dst), regR(isn.dst), regR(isn.src))); + } + else { + int32_t imm = unsigned32ToSigned2sCompl(isn.getImm32()); + //x8 = {imm} + emitImm32(state, imm, Tmp1Reg); + //xor x{dst}, x{dst}, x8 + state.emit(rvi(rv64::XOR, regR(isn.dst), regR(isn.dst), Tmp1Reg)); + } + } + + static void v1_IXOR_M(HANDLER_ARGS) { + state.registerUsage[isn.dst] = i; + loadFromScratchpad(state, isn); + //xor x{dst}, x{dst}, x8 + state.emit(rvi(rv64::XOR, regR(isn.dst), regR(isn.dst), Tmp1Reg)); + } + + static void v1_IROR_R(HANDLER_ARGS) { + state.registerUsage[isn.dst] = i; +#ifdef __riscv_zbb + if (isn.src != isn.dst) { + //ror x{dst}, x{dst}, x{src} + state.emit(rvi(rv64::ROR, regR(isn.dst), regR(isn.dst), regR(isn.src))); + } + else { + int32_t imm = isn.getImm32() & 63; + //rori x{dst}, x{dst}, {imm} + state.emit(rvi(rv64::RORI, regR(isn.dst), regR(isn.dst), imm)); + } +#else + if (isn.src != isn.dst) { + //sub x8, x0, x{src} + state.emit(rvi(rv64::SUB, Tmp1Reg, 0, regR(isn.src))); + //srl x9, x{dst}, x{src} + state.emit(rvi(rv64::SRL, Tmp2Reg, regR(isn.dst), regR(isn.src))); + //sll x{dst}, x{dst}, x8 + state.emit(rvi(rv64::SLL, regR(isn.dst), regR(isn.dst), Tmp1Reg)); + //or x{dst}, x{dst}, x9 + state.emit(rvi(rv64::OR, regR(isn.dst), regR(isn.dst), Tmp2Reg)); + } + else { + int32_t immr = isn.getImm32() & 63; + int32_t imml = -immr & 63; + int32_t imml5 = imml >> 5; + int32_t imml40 = imml & 31; + //srli x8, x{dst}, {immr} + state.emit(rvi(rv64::SRLI, Tmp1Reg, regR(isn.dst), immr)); + //c.slli x{dst}, {imml} + state.emit(rvc(rv64::C_SLLI, imml5, regR(isn.dst), imml40)); + //or x{dst}, x{dst}, x8 + state.emit(rvi(rv64::OR, regR(isn.dst), regR(isn.dst), Tmp1Reg)); + } +#endif + } + + static void v1_IROL_R(HANDLER_ARGS) { + state.registerUsage[isn.dst] = i; +#ifdef __riscv_zbb + if (isn.src != isn.dst) { + //rol x{dst}, x{dst}, x{src} + state.emit(rvi(rv64::ROL, regR(isn.dst), regR(isn.dst), regR(isn.src))); + } + else { + int32_t imm = -isn.getImm32() & 
63; + //rori x{dst}, x{dst}, {imm} + state.emit(rvi(rv64::RORI, regR(isn.dst), regR(isn.dst), imm)); + } +#else + if (isn.src != isn.dst) { + //sub x8, x0, x{src} + state.emit(rvi(rv64::SUB, Tmp1Reg, 0, regR(isn.src))); + //sll x9, x{dst}, x{src} + state.emit(rvi(rv64::SLL, Tmp2Reg, regR(isn.dst), regR(isn.src))); + //srl x{dst}, x{dst}, x8 + state.emit(rvi(rv64::SRL, regR(isn.dst), regR(isn.dst), Tmp1Reg)); + //or x{dst}, x{dst}, x9 + state.emit(rvi(rv64::OR, regR(isn.dst), regR(isn.dst), Tmp2Reg)); + } + else { + int32_t imml = isn.getImm32() & 63; + int32_t immr = -imml & 63; + int32_t imml5 = imml >> 5; + int32_t imml40 = imml & 31; + //srli x8, x{dst}, {immr} + state.emit(rvi(rv64::SRLI, Tmp1Reg, regR(isn.dst), immr)); + //c.slli x{dst}, {imml} + state.emit(rvc(rv64::C_SLLI, imml5, regR(isn.dst), imml40)); + //or x{dst}, x{dst}, x8 + state.emit(rvi(rv64::OR, regR(isn.dst), regR(isn.dst), Tmp1Reg)); + } +#endif + } + + static void v1_ISWAP_R(HANDLER_ARGS) { + if (isn.src != isn.dst) { + state.registerUsage[isn.dst] = i; + state.registerUsage[isn.src] = i; + //c.mv x8, x{dst} + state.emit(rvc(rv64::C_MV, Tmp1Reg, regR(isn.dst))); + //c.mv x{dst}, x{src} + state.emit(rvc(rv64::C_MV, regR(isn.dst), regR(isn.src))); + //c.mv x{src}, x8 + state.emit(rvc(rv64::C_MV, regR(isn.src), Tmp1Reg)); + } + } + + static void v1_FSWAP_R(HANDLER_ARGS) { + //fmv.d f24, f{dst_lo} + state.emit(rvi(rv64::FMV_D, Tmp1RegF, regLoF(isn.dst), regLoF(isn.dst))); + //fmv.d f{dst_lo}, f{dst_hi} + state.emit(rvi(rv64::FMV_D, regLoF(isn.dst), regHiF(isn.dst), regHiF(isn.dst))); + //fmv.d f{dst_hi}, f24 + state.emit(rvi(rv64::FMV_D, regHiF(isn.dst), Tmp1RegF, Tmp1RegF)); + } + + static void v1_FADD_R(HANDLER_ARGS) { + isn.dst %= RegisterCountFlt; + isn.src %= RegisterCountFlt; + //fadd.d f{dst_lo}, f{dst_lo}, f{src_lo} + state.emit(rvi(rv64::FADD_D, regLoF(isn.dst), regLoF(isn.dst), regLoA(isn.src))); + //fadd.d f{dst_hi}, f{dst_hi}, f{src_hi} + state.emit(rvi(rv64::FADD_D, regHiF(isn.dst), 
regHiF(isn.dst), regHiA(isn.src))); + } + + static void v1_FADD_M(HANDLER_ARGS) { + isn.dst %= RegisterCountFlt; + //x9 = mem + genAddressReg(state, isn); + //lw x8, 0(x9) + state.emit(rvc(rv64::C_LW, Tmp2Reg + OffsetXC, Tmp1Reg + OffsetXC)); + //lw x9, 4(x9) + state.emit(rvc(rv64::C_LW, Tmp2Reg + OffsetXC, 16 + Tmp2Reg + OffsetXC)); + //fcvt.d.w f24, x8 + state.emit(rvi(rv64::FCVT_D_W, Tmp1RegF, Tmp1Reg)); + //fcvt.d.w f25, x9 + state.emit(rvi(rv64::FCVT_D_W, Tmp2RegF, Tmp2Reg)); + //fadd.d f{dst_lo}, f{dst_lo}, f24 + state.emit(rvi(rv64::FADD_D, regLoF(isn.dst), regLoF(isn.dst), Tmp1RegF)); + //fadd.d f{dst_hi}, f{dst_hi}, f25 + state.emit(rvi(rv64::FADD_D, regHiF(isn.dst), regHiF(isn.dst), Tmp2RegF)); + } + + static void v1_FSUB_R(HANDLER_ARGS) { + isn.dst %= RegisterCountFlt; + isn.src %= RegisterCountFlt; + //fsub.d f{dst_lo}, f{dst_lo}, f{src_lo} + state.emit(rvi(rv64::FSUB_D, regLoF(isn.dst), regLoF(isn.dst), regLoA(isn.src))); + //fsub.d f{dst_hi}, f{dst_hi}, f{src_hi} + state.emit(rvi(rv64::FSUB_D, regHiF(isn.dst), regHiF(isn.dst), regHiA(isn.src))); + } + + static void v1_FSUB_M(HANDLER_ARGS) { + isn.dst %= RegisterCountFlt; + //x9 = mem + genAddressReg(state, isn); + //c.lw x8, 0(x9) + state.emit(rvc(rv64::C_LW, Tmp2Reg + OffsetXC, Tmp1Reg + OffsetXC)); + //c.lw x9, 4(x9) + state.emit(rvc(rv64::C_LW, Tmp2Reg + OffsetXC, 16 + Tmp2Reg + OffsetXC)); + //fcvt.d.w f24, x8 + state.emit(rvi(rv64::FCVT_D_W, Tmp1RegF, Tmp1Reg)); + //fcvt.d.w f25, x9 + state.emit(rvi(rv64::FCVT_D_W, Tmp2RegF, Tmp2Reg)); + //fsub.d f{dst_lo}, f{dst_lo}, f24 + state.emit(rvi(rv64::FSUB_D, regLoF(isn.dst), regLoF(isn.dst), Tmp1RegF)); + //fsub.d f{dst_hi}, f{dst_hi}, f25 + state.emit(rvi(rv64::FSUB_D, regHiF(isn.dst), regHiF(isn.dst), Tmp2RegF)); + } + + static void v1_FSCAL_R(HANDLER_ARGS) { + isn.dst %= RegisterCountFlt; + //fmv.x.d x8, f{dst_lo} + state.emit(rvi(rv64::FMV_X_D, Tmp1Reg, regLoF(isn.dst))); + //fmv.x.d x9, f{dst_hi} + state.emit(rvi(rv64::FMV_X_D, Tmp2Reg, 
regHiF(isn.dst))); + //c.xor x8, x12 + state.emit(rvc(rv64::C_XOR, Tmp1Reg + OffsetXC, MaskFscalReg + OffsetXC)); + //c.xor x9, x12 + state.emit(rvc(rv64::C_XOR, Tmp2Reg + OffsetXC, MaskFscalReg + OffsetXC)); + //fmv.d.x f{dst_lo}, x8 + state.emit(rvi(rv64::FMV_D_X, regLoF(isn.dst), Tmp1Reg)); + //fmv.d.x f{dst_hi}, x9 + state.emit(rvi(rv64::FMV_D_X, regHiF(isn.dst), Tmp2Reg)); + } + + static void v1_FMUL_R(HANDLER_ARGS) { + isn.dst %= RegisterCountFlt; + isn.src %= RegisterCountFlt; + //fmul.d f{dst_lo}, f{dst_lo}, f{src_lo} + state.emit(rvi(rv64::FMUL_D, regLoE(isn.dst), regLoE(isn.dst), regLoA(isn.src))); + //fmul.d f{dst_hi}, f{dst_hi}, f{src_hi} + state.emit(rvi(rv64::FMUL_D, regHiE(isn.dst), regHiE(isn.dst), regHiA(isn.src))); + } + + static void v1_FDIV_M(HANDLER_ARGS) { + isn.dst %= RegisterCountFlt; + //x9 = mem + genAddressReg(state, isn); + //lw x8, 0(x9) + state.emit(rvc(rv64::C_LW, Tmp2Reg + OffsetXC, Tmp1Reg + OffsetXC)); + //lw x9, 4(x9) + state.emit(rvc(rv64::C_LW, Tmp2Reg + OffsetXC, 16 + Tmp2Reg + OffsetXC)); + //fcvt.d.w f24, x8 + state.emit(rvi(rv64::FCVT_D_W, Tmp1RegF, Tmp1Reg)); + //fcvt.d.w f25, x9 + state.emit(rvi(rv64::FCVT_D_W, Tmp2RegF, Tmp2Reg)); + //fmv.x.d x8, f24 + state.emit(rvi(rv64::FMV_X_D, Tmp1Reg, Tmp1RegF)); + //fmv.x.d x9, f25 + state.emit(rvi(rv64::FMV_X_D, Tmp2Reg, Tmp2RegF)); + //c.and x8, x13 + state.emit(rvc(rv64::C_AND, Tmp1Reg + OffsetXC, MaskEclear + OffsetXC)); + //c.and x9, x13 + state.emit(rvc(rv64::C_AND, Tmp2Reg + OffsetXC, MaskEclear + OffsetXC)); + //c.or x8, x14 + state.emit(rvc(rv64::C_OR, Tmp1Reg + OffsetXC, MaskEsetLo + OffsetXC)); + //c.or x9, x15 + state.emit(rvc(rv64::C_OR, Tmp2Reg + OffsetXC, MaskEsetHi + OffsetXC)); + //fmv.d.x f24, x8 + state.emit(rvi(rv64::FMV_D_X, Tmp1RegF, Tmp1Reg)); + //fmv.d.x f25, x9 + state.emit(rvi(rv64::FMV_D_X, Tmp2RegF, Tmp2Reg)); + //fdiv.d f{dst_lo}, f{dst_lo}, f24 + state.emit(rvi(rv64::FDIV_D, regLoE(isn.dst), regLoE(isn.dst), Tmp1RegF)); + //fdiv.d f{dst_hi}, f{dst_hi}, 
f25 + state.emit(rvi(rv64::FDIV_D, regHiE(isn.dst), regHiE(isn.dst), Tmp2RegF)); + } + + static void v1_FSQRT_R(HANDLER_ARGS) { + isn.dst %= RegisterCountFlt; + //fsqrt.d f{dst_lo}, f{dst_lo} + state.emit(rvi(rv64::FSQRT_D, regLoE(isn.dst), regLoE(isn.dst))); + //fsqrt.d f{dst_hi}, f{dst_hi} + state.emit(rvi(rv64::FSQRT_D, regHiE(isn.dst), regHiE(isn.dst))); + } + + static void v1_CBRANCH(HANDLER_ARGS) { + int reg = isn.dst; + int target = state.registerUsage[reg] + 1; + int shift = isn.getModCond() + ConditionOffset; + int32_t imm = unsigned32ToSigned2sCompl(isn.getImm32()); + imm |= (1UL << shift); + if (ConditionOffset > 0 || shift > 0) + imm &= ~(1UL << (shift - 1)); + //x8 = branchMask + emitImm32(state, (int32_t)ConditionMask << shift, Tmp1Reg); + //x{dst} += {imm} + emitImm32(state, imm, regR(isn.dst), regR(isn.dst), Tmp2Reg); + //and x8, x8, x{dst} + state.emit(rvi(rv64::AND, Tmp1Reg, Tmp1Reg, regR(isn.dst))); + int32_t targetPos = state.instructionOffsets[target]; + int offset = targetPos - state.codePos; + if (offset >= -256) { //C.BEQZ only has a range of 256B + //c.beqz x8, {offset} + int imm8 = 1; //sign bit is always 1 + int imm21 = offset & 6; //offset[2:1] + int imm5 = (offset >> 5) & 1; //offset[5] + int imm43 = offset & 24; //offset[4:3] + int imm76 = (offset >> 3) & 24; //offset[7:6] + state.emit(rvc(rv64::C_BEQZ, imm8, imm43 + (Tmp1Reg + OffsetXC), imm76 + imm21 + imm5)); + } + else if (offset >= -4096) { //BEQ only has a range of 4KB + //beq x8, x0, offset + int imm12 = 1 << 11; //sign bit is always 1 + int imm105 = offset & 2016; //offset[10:5] + int imm41 = offset & 30; //offset[4:1] + int imm11 = (offset >> 11) & 1; //offset[11] + state.emit(rvi(rv64::BEQ, imm41 + imm11, Tmp1Reg, imm12 + imm105)); + } + else { + //c.bnez x8, +6 + state.emit(rvc(rv64::C_BNEZ, Tmp1Reg + OffsetXC, 6)); + //j targetPos + emitJump(state, 0, state.codePos, targetPos); + state.codePos += 4; + } + //mark all registers as used + for (unsigned j = 0; j < 
RegistersCount; ++j) { + state.registerUsage[j] = i; + } + } + + static void v1_CFROUND(HANDLER_ARGS) { + int32_t imm = (isn.getImm32() - 2) & 63; //-2 to avoid a later left shift to multiply by 4 + if (imm != 0) { +#ifdef __riscv_zbb + //rori x8, x{src}, {imm} + state.emit(rvi(rv64::RORI, Tmp1Reg, regR(isn.src), imm)); +#else + int32_t imml = -imm & 63; + //srli x8, x{src}, {imm} + state.emit(rvi(rv64::SRLI, Tmp1Reg, regR(isn.src), imm)); + //slli x9, x{src}, {imml} + state.emit(rvi(rv64::SLLI, Tmp2Reg, regR(isn.src), imml)); + //c.or x8, x9 + state.emit(rvc(rv64::C_OR, Tmp1Reg + OffsetXC, Tmp2Reg + OffsetXC)); +#endif + //c.andi x8, 12 + state.emit(rvc(rv64::C_ANDI, Tmp1Reg + OffsetXC, 12)); + } + else { + //and x8, x{src}, 12 + state.emit(rvi(rv64::ANDI, Tmp1Reg, regR(isn.src), 12)); + } + //c.add x8, x3 + state.emit(rvc(rv64::C_ADD, Tmp1Reg, LiteralPoolReg)); + //c.lw x8, 64(x8) + state.emit(rvc(rv64::C_LW, Tmp1Reg + OffsetXC, 8 + Tmp1Reg + OffsetXC)); + //fsrm x8 + state.emit(rvi(rv64::FSRM, 0, Tmp1Reg, 0)); + } + + static void v1_ISTORE(HANDLER_ARGS) { + genAddressRegDst(state, isn); + //sd x{src}, 0(x9) + state.emit(rvi(rv64::SD, 0, Tmp2Reg, regR(isn.src))); + } + + static void v1_NOP(HANDLER_ARGS) { + } +} + +#include "instruction_weights.hpp" + +namespace { + +#define INST_HANDLE1(x) REPN(&randomx::v1_##x, WT(x)) +#define INST_HANDLE2(x) REPN(&randomx::v2_##x, WT(x)) + + InstructionHandler* opcodeMap1[256] = { + INST_HANDLE1(IADD_RS) + INST_HANDLE1(IADD_M) + INST_HANDLE1(ISUB_R) + INST_HANDLE1(ISUB_M) + INST_HANDLE1(IMUL_R) + INST_HANDLE1(IMUL_M) + INST_HANDLE1(IMULH_R) + INST_HANDLE1(IMULH_M) + INST_HANDLE1(ISMULH_R) + INST_HANDLE1(ISMULH_M) + INST_HANDLE1(IMUL_RCP) + INST_HANDLE1(INEG_R) + INST_HANDLE1(IXOR_R) + INST_HANDLE1(IXOR_M) + INST_HANDLE1(IROR_R) + INST_HANDLE1(IROL_R) + INST_HANDLE1(ISWAP_R) + INST_HANDLE1(FSWAP_R) + INST_HANDLE1(FADD_R) + INST_HANDLE1(FADD_M) + INST_HANDLE1(FSUB_R) + INST_HANDLE1(FSUB_M) + INST_HANDLE1(FSCAL_R) + 
INST_HANDLE1(FMUL_R) + INST_HANDLE1(FDIV_M) + INST_HANDLE1(FSQRT_R) + INST_HANDLE1(CBRANCH) + INST_HANDLE1(CFROUND) + INST_HANDLE1(ISTORE) + INST_HANDLE1(NOP) + }; +} \ No newline at end of file diff --git a/src/RandomX/src/jit_compiler_rv64.hpp b/src/RandomX/src/jit_compiler_rv64.hpp new file mode 100644 index 000000000..aaae57e36 --- /dev/null +++ b/src/RandomX/src/jit_compiler_rv64.hpp @@ -0,0 +1,69 @@ +/* +Copyright (c) 2023 tevador + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the copyright holder nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*/ + +#pragma once + +#include +#include +#include +#include "jit_compiler.hpp" + +namespace randomx { + + class Program; + struct ProgramConfiguration; + class SuperscalarProgram; + class Instruction; + + class JitCompilerRV64 { + public: + JitCompilerRV64(); + ~JitCompilerRV64(); + void generateProgram(Program&, ProgramConfiguration&); + void generateProgramLight(Program&, ProgramConfiguration&, uint32_t); + void generateSuperscalarHash(SuperscalarProgram programs[RANDOMX_CACHE_ACCESSES], std::vector&); + void generateDatasetInitCode() {} + ProgramFunc* getProgramFunc() { + return (ProgramFunc*)entryProgram; + } + DatasetInitFunc* getDatasetInitFunc() { + return (DatasetInitFunc*)entryDataInit; + } + uint8_t* getCode() { + return state.code; + } + size_t getCodeSize(); + void enableWriting(); + void enableExecution(); + void enableAll(); + private: + CompilerState state; + void* entryDataInit; + void* entryProgram; + }; +} diff --git a/src/RandomX/src/jit_compiler_rv64_static.S b/src/RandomX/src/jit_compiler_rv64_static.S new file mode 100644 index 000000000..240bbf5f4 --- /dev/null +++ b/src/RandomX/src/jit_compiler_rv64_static.S @@ -0,0 +1,1235 @@ +/* +Copyright (c) 2023 tevador + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the copyright holder nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. 
+ +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#define DECL(x) x + +.text +.option rvc + +#include "configuration.h" + +.global DECL(randomx_riscv64_literals) +.global DECL(randomx_riscv64_literals_end) +.global DECL(randomx_riscv64_data_init) +.global DECL(randomx_riscv64_fix_data_call) +.global DECL(randomx_riscv64_prologue) +.global DECL(randomx_riscv64_loop_begin) +.global DECL(randomx_riscv64_data_read) +.global DECL(randomx_riscv64_data_read_light) +.global DECL(randomx_riscv64_fix_loop_call) +.global DECL(randomx_riscv64_spad_store) +.global DECL(randomx_riscv64_spad_store_hardaes) +.global DECL(randomx_riscv64_spad_store_softaes) +.global DECL(randomx_riscv64_loop_end) +.global DECL(randomx_riscv64_fix_continue_loop) +.global DECL(randomx_riscv64_epilogue) +.global DECL(randomx_riscv64_softaes) +.global DECL(randomx_riscv64_program_end) +.global DECL(randomx_riscv64_ssh_init) +.global DECL(randomx_riscv64_ssh_load) +.global DECL(randomx_riscv64_ssh_prefetch) +.global DECL(randomx_riscv64_ssh_end) + +/* The literal pool can fit at most 494 IMUL_RCP literals */ +#if RANDOMX_PROGRAM_SIZE > 494 + #error RANDOMX_PROGRAM_SIZE larger than 494 is not supported. 
+#endif + +#define RANDOMX_CACHE_MASK (RANDOMX_ARGON_MEMORY*16-1) + +/* shared literal pool: 4 KB */ + /* space for 256 IMUL_RCP literals -2048 */ + /* filled by JIT compiler */ +DECL(randomx_riscv64_literals): +literal_pool: + /* SuperscalarHash constants +0 */ + .dword 6364136223846793005 + .dword 9298411001130361340 + .dword 12065312585734608966 + .dword 9306329213124626780 + .dword 5281919268842080866 + .dword 10536153434571861004 + .dword 3398623926847679864 + .dword 9549104520008361294 + /* CFROUND lookup table +64 */ + .word 0x00000000 /* RTN */ + .word 0x00000002 /* RDN */ + .word 0x00000003 /* RUP */ + .word 0x00000001 /* RTZ */ + /* mask literals +80,+84,+88,+92,+96,+104 */ + .word (RANDOMX_SCRATCHPAD_L1-8) + .word (RANDOMX_SCRATCHPAD_L2-8) + .word (RANDOMX_SCRATCHPAD_L3-64) + .word (RANDOMX_DATASET_BASE_SIZE-64) + .dword 0x80f0000000000000 + .dword 0x00ffffffffffffff +DECL(randomx_riscv64_literals_end): + /* E reg. set masks, +112,+120 */ + .dword 0 /* filled by JIT compiler */ + .dword 0 /* filled by JIT compiler */ + /* soft AES table addresses, +128,+136 */ + .dword 0 /* filled by JIT compiler */ + .dword 0 /* filled by JIT compiler */ + /* space for 238 IMUL_RCP literals, +144 */ + .fill 238,8,0 /* filled by JIT compiler */ + +/* ================================= */ +/* Dataset init function entry point */ +/* ================================= */ + +/* Register allocation: + ---------------------- + x0 -> zero + x1 -> temp/return address + x2 -> stack pointer (sp) + x3 -> literal pool pointer + x5 -> dataset pointer + x6 -> cache pointer + x7 -> temp/itemNumber + x8-x15 -> SuperscalarHash registers + x16 -> itemNumber + x17 -> endItem + x28-x31 -> temp + + Stack layout: + ------------------------ + sp+ + 0 -> return address + 8 -> saved x3 + 16 -> saved x8-x9 + 32 -> caller stack +*/ +DECL(randomx_riscv64_data_init): + addi sp, sp, -32 + /* dataset ptr */ + mv x5, x11 + /* cache->memory */ + ld x6, 0(x10) + /* callee saved registers */ + sd x1, 0(sp) 
+ sd x3, 8(sp) + /* literal pool */ + lla x3, literal_pool + sd x8, 16(sp) + sd x9, 24(sp) + /* startItem */ + mv x16, x12 + /* endItem */ + mv x17, x13 +init_item: + mv x7, x16 +DECL(randomx_riscv64_fix_data_call): + jal superscalar_hash /* JIT compiler will adjust the offset */ + sd x8, 0(x5) + sd x9, 8(x5) + sd x10, 16(x5) + sd x11, 24(x5) + sd x12, 32(x5) + sd x13, 40(x5) + sd x14, 48(x5) + sd x15, 56(x5) + addi x5, x5, 64 + addi x16, x16, 1 + bltu x16, x17, init_item + ld x1, 0(sp) + ld x3, 8(sp) + ld x8, 16(sp) + ld x9, 24(sp) + addi sp, sp, 32 + ret + +/* ====================================== */ +/* Program execution function entry point */ +/* ====================================== */ + +/* Register allocation: + ---------------------- + x0 -> zero + x1 -> temp/scratchpad L3 mask + x2 -> stack pointer (sp) + x3 -> literal pool pointer + x5 -> scratchpad pointer + x6 -> dataset/cache pointer + x7 -> temp/next dataset access + x8 -> temp + x9 -> temp + x10 -> scratchpad L1 mask (0x0000000000003ff8) + x11 -> scratchpad L2 mask (0x000000000003fff8) + x12 -> FSCAL_R mask (0x80f0000000000000) + x13 -> E reg. clear mask (0x00ffffffffffffff) + x14 -> E reg. set mask (0x3*00000000******) + x15 -> E reg. set mask (0x3*00000000******) + x16-x23 -> VM registers "r0"-"r7" + x24 -> iteration counter "ic" + x25 -> VM registers "mx", "ma" + x26 -> spAddr0 + x27 -> spAddr1 + x28-x31 -> temp/literals for IMUL_RCP (4x) + + (Note: We avoid using x4 because it breaks debugging with gdb.) 
+ + f0-f7 -> VM registers "f0"-"f3" + f8-f15 -> VM registers "e0"-"e3" + f16-f23 -> VM registers "a0"-"a3" + f24-f25 -> temp + f26-f31 -> literals for IMUL_RCP (6x) + + Stack layout: + ------------------------ + sp+ + 0 -> return address + 8 -> register file ptr + 16 -> saved x3 (24 unused) + 32 -> saved x8-x9 + 48 -> saved x18-x27 + 128 -> saved f8-f9 + 144 -> saved f18-f27 + 224 -> caller stack +*/ + +DECL(randomx_riscv64_prologue): + addi sp, sp, -224 + /* scratchpad pointer */ + mv x5, x12 + /* register file pointer */ + sd x10, 8(sp) + /* callee saved registers */ + sd x3, 16(sp) + sd x8, 32(sp) + sd x9, 40(sp) + sd x18, 48(sp) + sd x19, 56(sp) + sd x20, 64(sp) + sd x21, 72(sp) + sd x22, 80(sp) + sd x23, 88(sp) + sd x24, 96(sp) + sd x25, 104(sp) + sd x26, 112(sp) + sd x27, 120(sp) + fsd f8, 128(sp) + fsd f9, 136(sp) + fsd f18, 144(sp) + fsd f19, 152(sp) + fsd f20, 160(sp) + fsd f21, 168(sp) + fsd f22, 176(sp) + fsd f23, 184(sp) + fsd f24, 192(sp) + fsd f25, 200(sp) + fsd f26, 208(sp) + fsd f27, 216(sp) + /* iteration counter */ + mv x24, x13 + /* return address */ + sd x1, 0(sp) + /* literal pool */ + lla x3, literal_pool + /* load (ma, mx) */ + ld x25, 0(x11) + /* dataset ptr */ + ld x6, 8(x11) + /* load dataset mask */ + lwu x1, 92(x3) + /* zero registers r0-r3, load a0-a1 */ + li x16, 0 + fld f16, 192(x10) + li x17, 0 + fld f17, 200(x10) + srli x7, x25, 32 /* x7 = ma */ + li x18, 0 + fld f18, 208(x10) + mv x27, x7 /* x27 = ma */ + li x19, 0 + fld f19, 216(x10) + /* set dataset read address */ + and x7, x7, x1 + add x7, x7, x6 + /* zero registers r4-r7, load a2-a3 */ + li x20, 0 + fld f20, 224(x10) + li x21, 0 + fld f21, 232(x10) + li x22, 0 + fld f22, 240(x10) + li x23, 0 + fld f23, 248(x10) + /* load L3 mask */ + lwu x1, 88(x3) + /* load scratchpad masks */ + lwu x10, 80(x3) + lwu x11, 84(x3) + /* set spAddr0, spAddr1 */ + and x26, x25, x1 + and x27, x27, x1 + add x26, x26, x5 + add x27, x27, x5 + /* align L3 mask */ + addi x1, x1, 56 + /* FSCAL, E reg. 
masks */ + ld x12, 96(x3) + ld x13, 104(x3) + ld x14, 112(x3) + ld x15, 120(x3) + /* IMUL_RCP literals */ + fld f26, 176(x3) + fld f27, 184(x3) + fld f28, 192(x3) + fld f29, 200(x3) + fld f30, 208(x3) + fld f31, 216(x3) + +.balign 4 +DECL(randomx_riscv64_loop_begin): +loop_begin: + /* mix integer registers */ + ld x8, 0(x26) + ld x9, 8(x26) + ld x30, 16(x26) + ld x31, 24(x26) + xor x16, x16, x8 + ld x8, 32(x26) + xor x17, x17, x9 + ld x9, 40(x26) + xor x18, x18, x30 + ld x30, 48(x26) + xor x19, x19, x31 + ld x31, 56(x26) + xor x20, x20, x8 + lw x8, 0(x27) + xor x21, x21, x9 + lw x9, 4(x27) + xor x22, x22, x30 + lw x30, 8(x27) + xor x23, x23, x31 + lw x31, 12(x27) + /* load F registers */ + fcvt.d.w f0, x8 + lw x8, 16(x27) + fcvt.d.w f1, x9 + lw x9, 20(x27) + fcvt.d.w f2, x30 + lw x30, 24(x27) + fcvt.d.w f3, x31 + lw x31, 28(x27) + fcvt.d.w f4, x8 + lw x8, 32(x27) + fcvt.d.w f5, x9 + lw x9, 36(x27) + fcvt.d.w f6, x30 + lw x30, 40(x27) + fcvt.d.w f7, x31 + lw x31, 44(x27) + /* load E registers */ + fcvt.d.w f8, x8 + lw x8, 48(x27) + fcvt.d.w f9, x9 + lw x9, 52(x27) + fcvt.d.w f10, x30 + lw x30, 56(x27) + fcvt.d.w f11, x31 + lw x31, 60(x27) + fcvt.d.w f12, x8 + fmv.x.d x8, f8 + fcvt.d.w f13, x9 + fmv.x.d x9, f9 + fcvt.d.w f14, x30 + fmv.x.d x30, f10 + fcvt.d.w f15, x31 + fmv.x.d x31, f11 + and x8, x8, x13 + and x9, x9, x13 + or x8, x8, x14 + or x9, x9, x15 + and x30, x30, x13 + and x31, x31, x13 + or x30, x30, x14 + or x31, x31, x15 + fmv.d.x f8, x8 + fmv.d.x f9, x9 + fmv.d.x f10, x30 + fmv.d.x f11, x31 + fmv.x.d x8, f12 + fmv.x.d x9, f13 + fmv.x.d x30, f14 + fmv.x.d x31, f15 + and x8, x8, x13 + and x9, x9, x13 + or x8, x8, x14 + or x9, x9, x15 + fmv.d.x f12, x8 + fmv.d.x f13, x9 + and x30, x30, x13 + and x31, x31, x13 + or x30, x30, x14 + or x31, x31, x15 + fmv.d.x f14, x30 + fmv.d.x f15, x31 + /* reload clobbered IMUL_RCP regs */ + ld x28, 144(x3) + ld x29, 152(x3) + ld x30, 160(x3) + ld x31, 168(x3) + +DECL(randomx_riscv64_data_read): + xor x8, x20, x22 /* JIT 
compiler will adjust the registers */ + /* load dataset mask */ + lwu x1, 92(x3) + /* zero-extend x8 */ +#ifdef __riscv_zba + zext.w x8, x8 +#else + slli x8, x8, 32 + srli x8, x8, 32 +#endif + /* update "mx" */ + xor x25, x25, x8 + /* read dataset and update registers */ + ld x8, 0(x7) + ld x9, 8(x7) + ld x30, 16(x7) + ld x31, 24(x7) + xor x16, x16, x8 + ld x8, 32(x7) + xor x17, x17, x9 + ld x9, 40(x7) + xor x18, x18, x30 + ld x30, 48(x7) + xor x19, x19, x31 + ld x31, 56(x7) + xor x20, x20, x8 + /* calculate the next dataset address */ + and x7, x25, x1 + xor x21, x21, x9 + add x7, x7, x6 + xor x22, x22, x30 + /* prefetch - doesn't seem to have any effect */ + /* ld x0, 0(x7) */ + xor x23, x23, x31 + /* swap mx <-> ma */ +#ifdef __riscv_zbb + rori x25, x25, 32 +#else + srli x9, x25, 32 + slli x25, x25, 32 + or x25, x25, x9 +#endif + +DECL(randomx_riscv64_data_read_light): + xor x8, x20, x22 /* JIT compiler will adjust the registers */ + /* load dataset offset */ + lui x9, 0x02000 /* JIT compiler will adjust the immediate */ + addi x9, x9, -64 + /* load dataset mask */ + lwu x1, 92(x3) + /* swap mx <-> ma */ +#ifdef __riscv_zbb + rori x25, x25, 32 +#else + srli x31, x25, 32 + slli x25, x25, 32 + or x25, x25, x31 +#endif + slli x8, x8, 32 + /* update "mx" */ + xor x25, x25, x8 + /* the next dataset item */ + and x7, x25, x1 + srli x7, x7, 6 + add x7, x7, x9 +DECL(randomx_riscv64_fix_loop_call): + jal superscalar_hash /* JIT compiler will adjust the offset */ + xor x16, x16, x8 + xor x17, x17, x9 + xor x18, x18, x10 + xor x19, x19, x11 + xor x20, x20, x12 + xor x21, x21, x13 + xor x22, x22, x14 + xor x23, x23, x15 + /* restore clobbered registers */ + lwu x10, 80(x3) + lwu x11, 84(x3) + ld x12, 96(x3) + ld x13, 104(x3) + ld x14, 112(x3) + ld x15, 120(x3) + +DECL(randomx_riscv64_spad_store): + /* store integer registers */ + sd x16, 0(x27) + sd x17, 8(x27) + sd x18, 16(x27) + sd x19, 24(x27) + sd x20, 32(x27) + sd x21, 40(x27) + sd x22, 48(x27) + sd x23, 56(x27) + /* 
XOR and store f0,e0 */ + fmv.x.d x8, f0 + fmv.x.d x9, f8 + fmv.x.d x30, f1 + fmv.x.d x31, f9 + xor x8, x8, x9 + xor x30, x30, x31 + sd x8, 0(x26) + fmv.d.x f0, x8 + sd x30, 8(x26) + fmv.d.x f1, x30 + /* XOR and store f1,e1 */ + fmv.x.d x8, f2 + fmv.x.d x9, f10 + fmv.x.d x30, f3 + fmv.x.d x31, f11 + xor x8, x8, x9 + xor x30, x30, x31 + sd x8, 16(x26) + fmv.d.x f2, x8 + sd x30, 24(x26) + fmv.d.x f3, x30 + /* XOR and store f2,e2 */ + fmv.x.d x8, f4 + fmv.x.d x9, f12 + fmv.x.d x30, f5 + fmv.x.d x31, f13 + xor x8, x8, x9 + xor x30, x30, x31 + sd x8, 32(x26) + fmv.d.x f4, x8 + sd x30, 40(x26) + fmv.d.x f5, x30 + /* XOR and store f3,e3 */ + fmv.x.d x8, f6 + fmv.x.d x9, f14 + fmv.x.d x30, f7 + fmv.x.d x31, f15 + xor x8, x8, x9 + xor x30, x30, x31 + sd x8, 48(x26) + fmv.d.x f6, x8 + sd x30, 56(x26) + fmv.d.x f7, x30 + +DECL(randomx_riscv64_spad_store_hardaes): + nop /* not implemented */ + +DECL(randomx_riscv64_spad_store_softaes): + /* store integer registers */ + sd x16, 0(x27) + sd x17, 8(x27) + sd x18, 16(x27) + sd x19, 24(x27) + sd x20, 32(x27) + sd x21, 40(x27) + sd x22, 48(x27) + sd x23, 56(x27) + /* process f0 with 4 AES rounds */ + fmv.x.d x8, f8 + fmv.x.d x10, f9 + fmv.x.d x30, f0 + fmv.x.d x31, f1 + jal softaes_enc + fmv.x.d x8, f10 + fmv.x.d x10, f11 + jal softaes_enc + fmv.x.d x8, f12 + fmv.x.d x10, f13 + jal softaes_enc + fmv.x.d x8, f14 + fmv.x.d x10, f15 + jal softaes_enc + sd x30, 0(x26) + fmv.d.x f0, x30 + sd x31, 8(x26) + fmv.d.x f1, x31 + /* process f1 with 4 AES rounds */ + fmv.x.d x8, f8 + fmv.x.d x10, f9 + fmv.x.d x30, f2 + fmv.x.d x31, f3 + jal softaes_dec + fmv.x.d x8, f10 + fmv.x.d x10, f11 + jal softaes_dec + fmv.x.d x8, f12 + fmv.x.d x10, f13 + jal softaes_dec + fmv.x.d x8, f14 + fmv.x.d x10, f15 + jal softaes_dec + sd x30, 16(x26) + fmv.d.x f2, x30 + sd x31, 24(x26) + fmv.d.x f3, x31 + /* process f2 with 4 AES rounds */ + fmv.x.d x8, f8 + fmv.x.d x10, f9 + fmv.x.d x30, f4 + fmv.x.d x31, f5 + jal softaes_enc + fmv.x.d x8, f10 + fmv.x.d x10, f11 + 
jal softaes_enc + fmv.x.d x8, f12 + fmv.x.d x10, f13 + jal softaes_enc + fmv.x.d x8, f14 + fmv.x.d x10, f15 + jal softaes_enc + sd x30, 32(x26) + fmv.d.x f4, x30 + sd x31, 40(x26) + fmv.d.x f5, x31 + /* process f3 with 4 AES rounds */ + fmv.x.d x8, f8 + fmv.x.d x10, f9 + fmv.x.d x30, f6 + fmv.x.d x31, f7 + jal softaes_dec + fmv.x.d x8, f10 + fmv.x.d x10, f11 + jal softaes_dec + fmv.x.d x8, f12 + fmv.x.d x10, f13 + jal softaes_dec + fmv.x.d x8, f14 + fmv.x.d x10, f15 + jal softaes_dec + sd x30, 48(x26) + fmv.d.x f6, x30 + sd x31, 56(x26) + fmv.d.x f7, x31 + /* restore clobbered registers */ + lwu x10, 80(x3) + lwu x11, 84(x3) + ld x12, 96(x3) + ld x13, 104(x3) + ld x14, 112(x3) + ld x15, 120(x3) + +DECL(randomx_riscv64_loop_end): + xor x26, x16, x18 /* JIT compiler will adjust the registers */ + /* load L3 mask */ + lwu x1, 88(x3) + addi x24, x24, -1 + srli x27, x26, 32 + /* set spAddr0, spAddr1 */ + and x26, x26, x1 + and x27, x27, x1 + add x26, x26, x5 + add x27, x27, x5 + /* align L3 mask */ + addi x1, x1, 56 + /* conditional branch doesn't have sufficient range */ + j condition_check +DECL(randomx_riscv64_fix_continue_loop): +continue_loop: + .word 0 /* JIT compiler will write a jump to loop_begin */ +condition_check: + bnez x24, continue_loop + +DECL(randomx_riscv64_epilogue): + /* restore callee saved registers */ + ld x10, 8(sp) + ld x1, 0(sp) + ld x3, 16(sp) + ld x8, 32(sp) + ld x9, 40(sp) + ld x24, 96(sp) + ld x25, 104(sp) + ld x26, 112(sp) + ld x27, 120(sp) + fld f18, 144(sp) + fld f19, 152(sp) + fld f20, 160(sp) + fld f21, 168(sp) + fld f22, 176(sp) + fld f23, 184(sp) + fld f24, 192(sp) + fld f25, 200(sp) + fld f26, 208(sp) + fld f27, 216(sp) + /* save VM registers */ + sd x16, 0(x10) + sd x17, 8(x10) + sd x18, 16(x10) + sd x19, 24(x10) + sd x20, 32(x10) + sd x21, 40(x10) + sd x22, 48(x10) + sd x23, 56(x10) + fsd f0, 64(x10) + fsd f1, 72(x10) + fsd f2, 80(x10) + fsd f3, 88(x10) + fsd f4, 96(x10) + fsd f5, 104(x10) + fsd f6, 112(x10) + fsd f7, 120(x10) + 
fsd f8, 128(x10) + fsd f9, 136(x10) + fsd f10, 144(x10) + fsd f11, 152(x10) + fsd f12, 160(x10) + fsd f13, 168(x10) + fsd f14, 176(x10) + fsd f15, 184(x10) + /* restore callee saved registers */ + ld x18, 48(sp) + ld x19, 56(sp) + ld x20, 64(sp) + ld x21, 72(sp) + ld x22, 80(sp) + ld x23, 88(sp) + fld f8, 128(sp) + fld f9, 136(sp) + /* restore stack pointer */ + addi sp, sp, 224 + /* return */ + ret + +/* + Soft AES subroutines + in: + x3 = literal pool + x8, x10 = round key + x30, x31 = plaintext + out: + x30, x31 = ciphertext + clobbers: + x8-x11 (limbs) + x12-x13 (LUTs) + x14-x15 (temp) +*/ +DECL(randomx_riscv64_softaes): +softaes_enc: + /* enc. lookup table */ + ld x13, 128(x3) + + /* load the round key into x8, x9, x10, x11 */ + srli x9, x8, 32 + srli x11, x10, 32 +#ifdef __riscv_zba + zext.w x8, x8 + zext.w x10, x10 +#else + slli x8, x8, 32 + slli x10, x10, 32 + srli x8, x8, 32 + srli x10, x10, 32 +#endif + + /* byte 0 */ + andi x14, x30, 255 + srli x30, x30, 8 + addi x12, x13, -2048 +#ifdef __riscv_zba + sh2add x14, x14, x13 +#else + slli x14, x14, 2 + add x14, x14, x13 +#endif + lwu x14, -2048(x14) + + /* byte 1 */ + andi x15, x30, 255 + srli x30, x30, 8 +#ifdef __riscv_zba + sh2add x15, x15, x12 +#else + slli x15, x15, 2 + add x15, x15, x12 +#endif + lwu x15, 1024(x15) + xor x8, x8, x14 + + /* byte 2 */ + andi x14, x30, 255 + srli x30, x30, 8 +#ifdef __riscv_zba + sh2add x14, x14, x13 +#else + slli x14, x14, 2 + add x14, x14, x13 +#endif + lwu x14, 0(x14) + xor x11, x11, x15 + + /* byte 3 */ + andi x15, x30, 255 + srli x30, x30, 8 +#ifdef __riscv_zba + sh2add x15, x15, x13 +#else + slli x15, x15, 2 + add x15, x15, x13 +#endif + lwu x15, 1024(x15) + xor x10, x10, x14 + + /* byte 4 */ + andi x14, x30, 255 + srli x30, x30, 8 +#ifdef __riscv_zba + sh2add x14, x14, x12 +#else + slli x14, x14, 2 + add x14, x14, x12 +#endif + lwu x14, 0(x14) + xor x9, x9, x15 + + /* byte 5 */ + andi x15, x30, 255 + srli x30, x30, 8 +#ifdef __riscv_zba + sh2add x15, x15, x12 
+#else + slli x15, x15, 2 + add x15, x15, x12 +#endif + lwu x15, 1024(x15) + xor x9, x9, x14 + + /* byte 6 */ + andi x14, x30, 255 + srli x30, x30, 8 +#ifdef __riscv_zba + sh2add x14, x14, x13 +#else + slli x14, x14, 2 + add x14, x14, x13 +#endif + lwu x14, 0(x14) + xor x8, x8, x15 + + /* byte 7 */ + andi x15, x30, 255 +#ifdef __riscv_zba + sh2add x15, x15, x13 +#else + slli x15, x15, 2 + add x15, x15, x13 +#endif + lwu x15, 1024(x15) + xor x11, x11, x14 + + /* byte 8 */ + andi x14, x31, 255 + srli x31, x31, 8 +#ifdef __riscv_zba + sh2add x14, x14, x12 +#else + slli x14, x14, 2 + add x14, x14, x12 +#endif + lwu x14, 0(x14) + xor x10, x10, x15 + + /* byte 9 */ + andi x15, x31, 255 + srli x31, x31, 8 +#ifdef __riscv_zba + sh2add x15, x15, x12 +#else + slli x15, x15, 2 + add x15, x15, x12 +#endif + lwu x15, 1024(x15) + xor x10, x10, x14 + + /* byte 10 */ + andi x14, x31, 255 + srli x31, x31, 8 +#ifdef __riscv_zba + sh2add x14, x14, x13 +#else + slli x14, x14, 2 + add x14, x14, x13 +#endif + lwu x14, 0(x14) + xor x9, x9, x15 + + /* byte 11 */ + andi x15, x31, 255 + srli x31, x31, 8 +#ifdef __riscv_zba + sh2add x15, x15, x13 +#else + slli x15, x15, 2 + add x15, x15, x13 +#endif + lwu x15, 1024(x15) + xor x8, x8, x14 + + /* byte 12 */ + andi x14, x31, 255 + srli x31, x31, 8 +#ifdef __riscv_zba + sh2add x14, x14, x12 +#else + slli x14, x14, 2 + add x14, x14, x12 +#endif + lwu x14, 0(x14) + xor x11, x11, x15 + + /* byte 13 */ + andi x15, x31, 255 + srli x31, x31, 8 +#ifdef __riscv_zba + sh2add x15, x15, x12 +#else + slli x15, x15, 2 + add x15, x15, x12 +#endif + lwu x15, 1024(x15) + xor x11, x11, x14 + + /* byte 14 */ + andi x14, x31, 255 + srli x31, x31, 8 +#ifdef __riscv_zba + sh2add x14, x14, x13 +#else + slli x14, x14, 2 + add x14, x14, x13 +#endif + lwu x14, 0(x14) + xor x10, x10, x15 + + /* byte 15 */ + andi x15, x31, 255 +#ifdef __riscv_zba + sh2add x15, x15, x13 +#else + slli x15, x15, 2 + add x15, x15, x13 +#endif + lwu x15, 1024(x15) + xor x9, x9, x14 + + slli 
x11, x11, 32 + slli x9, x9, 32 + or x30, x8, x9 + or x31, x10, x11 + xor x30, x30, x15 + + ret + +softaes_dec: + /* dec. lookup table */ + ld x13, 136(x3) + + /* load the round key into x8, x9, x10, x11 */ + srli x9, x8, 32 + srli x11, x10, 32 +#ifdef __riscv_zba + zext.w x8, x8 + zext.w x10, x10 +#else + slli x8, x8, 32 + slli x10, x10, 32 + srli x8, x8, 32 + srli x10, x10, 32 +#endif + + /* byte 0 */ + andi x14, x30, 255 + srli x30, x30, 8 + addi x12, x13, -2048 +#ifdef __riscv_zba + sh2add x14, x14, x13 +#else + slli x14, x14, 2 + add x14, x14, x13 +#endif + lwu x14, -2048(x14) + + /* byte 1 */ + andi x15, x30, 255 + srli x30, x30, 8 +#ifdef __riscv_zba + sh2add x15, x15, x12 +#else + slli x15, x15, 2 + add x15, x15, x12 +#endif + lwu x15, 1024(x15) + xor x8, x8, x14 + + /* byte 2 */ + andi x14, x30, 255 + srli x30, x30, 8 +#ifdef __riscv_zba + sh2add x14, x14, x13 +#else + slli x14, x14, 2 + add x14, x14, x13 +#endif + lwu x14, 0(x14) + xor x9, x9, x15 + + /* byte 3 */ + andi x15, x30, 255 + srli x30, x30, 8 +#ifdef __riscv_zba + sh2add x15, x15, x13 +#else + slli x15, x15, 2 + add x15, x15, x13 +#endif + lwu x15, 1024(x15) + xor x10, x10, x14 + + /* byte 4 */ + andi x14, x30, 255 + srli x30, x30, 8 +#ifdef __riscv_zba + sh2add x14, x14, x12 +#else + slli x14, x14, 2 + add x14, x14, x12 +#endif + lwu x14, 0(x14) + xor x11, x11, x15 + + /* byte 5 */ + andi x15, x30, 255 + srli x30, x30, 8 +#ifdef __riscv_zba + sh2add x15, x15, x12 +#else + slli x15, x15, 2 + add x15, x15, x12 +#endif + lwu x15, 1024(x15) + xor x9, x9, x14 + + /* byte 6 */ + andi x14, x30, 255 + srli x30, x30, 8 +#ifdef __riscv_zba + sh2add x14, x14, x13 +#else + slli x14, x14, 2 + add x14, x14, x13 +#endif + lwu x14, 0(x14) + xor x10, x10, x15 + + /* byte 7 */ + andi x15, x30, 255 +#ifdef __riscv_zba + sh2add x15, x15, x13 +#else + slli x15, x15, 2 + add x15, x15, x13 +#endif + lwu x15, 1024(x15) + xor x11, x11, x14 + + /* byte 8 */ + andi x14, x31, 255 + srli x31, x31, 8 +#ifdef __riscv_zba + 
sh2add x14, x14, x12 +#else + slli x14, x14, 2 + add x14, x14, x12 +#endif + lwu x14, 0(x14) + xor x8, x8, x15 + + /* byte 9 */ + andi x15, x31, 255 + srli x31, x31, 8 +#ifdef __riscv_zba + sh2add x15, x15, x12 +#else + slli x15, x15, 2 + add x15, x15, x12 +#endif + lwu x15, 1024(x15) + xor x10, x10, x14 + + /* byte 10 */ + andi x14, x31, 255 + srli x31, x31, 8 +#ifdef __riscv_zba + sh2add x14, x14, x13 +#else + slli x14, x14, 2 + add x14, x14, x13 +#endif + lwu x14, 0(x14) + xor x11, x11, x15 + + /* byte 11 */ + andi x15, x31, 255 + srli x31, x31, 8 +#ifdef __riscv_zba + sh2add x15, x15, x13 +#else + slli x15, x15, 2 + add x15, x15, x13 +#endif + lwu x15, 1024(x15) + xor x8, x8, x14 + + /* byte 12 */ + andi x14, x31, 255 + srli x31, x31, 8 +#ifdef __riscv_zba + sh2add x14, x14, x12 +#else + slli x14, x14, 2 + add x14, x14, x12 +#endif + lwu x14, 0(x14) + xor x9, x9, x15 + + /* byte 13 */ + andi x15, x31, 255 + srli x31, x31, 8 +#ifdef __riscv_zba + sh2add x15, x15, x12 +#else + slli x15, x15, 2 + add x15, x15, x12 +#endif + lwu x15, 1024(x15) + xor x11, x11, x14 + + /* byte 14 */ + andi x14, x31, 255 + srli x31, x31, 8 +#ifdef __riscv_zba + sh2add x14, x14, x13 +#else + slli x14, x14, 2 + add x14, x14, x13 +#endif + lwu x14, 0(x14) + xor x8, x8, x15 + + /* byte 15 */ + andi x15, x31, 255 +#ifdef __riscv_zba + sh2add x15, x15, x13 +#else + slli x15, x15, 2 + add x15, x15, x13 +#endif + lwu x15, 1024(x15) + xor x9, x9, x14 + + slli x11, x11, 32 + slli x9, x9, 32 + or x30, x8, x9 + or x31, x10, x11 + xor x31, x31, x15 + + ret + +DECL(randomx_riscv64_program_end): + nop + + +/* literal pool for SuperscalarHash */ + /* space for remaining IMUL_RCP literals */ +ssh_literal_pool: + /* space for 256 IMUL_RCP literals */ + .fill 256,8,0 + +/* + SuperscalarHash subroutine + in: + x3 = literal pool + x6 = cache + x7 = itemNumber + out: + x8-x15 = 64-byte hash + clobbers: + x7, x28-x31 +*/ +DECL(randomx_riscv64_ssh_init): +superscalar_hash: + ld x30, 0(x3) /* superscalarMul0 
*/ + addi x8, x7, 1 + ld x9, 8(x3) + li x31, RANDOMX_CACHE_MASK + ld x10, 16(x3) + ld x11, 24(x3) + mul x8, x8, x30 + ld x12, 32(x3) + ld x13, 40(x3) + lla x30, ssh_literal_pool + ld x14, 48(x3) + and x7, x7, x31 + ld x15, 56(x3) + slli x7, x7, 6 + xor x9, x9, x8 + add x7, x7, x6 + xor x10, x10, x8 + /* load the first IMUL_RCP literal */ + ld x31, 2040(x30) + xor x11, x11, x8 + xor x12, x12, x8 + xor x13, x13, x8 + xor x14, x14, x8 + xor x15, x15, x8 + +DECL(randomx_riscv64_ssh_load): + ld x28, 0(x7) + ld x29, 8(x7) + xor x8, x8, x28 + ld x28, 16(x7) + xor x9, x9, x29 + ld x29, 24(x7) + xor x10, x10, x28 + ld x28, 32(x7) + xor x11, x11, x29 + ld x29, 40(x7) + xor x12, x12, x28 + ld x28, 48(x7) + xor x13, x13, x29 + ld x29, 56(x7) + xor x14, x14, x28 + li x7, RANDOMX_CACHE_MASK + xor x15, x15, x29 + +DECL(randomx_riscv64_ssh_prefetch): + and x7, x8, x7 /* JIT compiler will adjust the register */ + slli x7, x7, 6 + add x7, x7, x6 + /* prefetch - doesn't seem to have any effect */ + /* ld x0, 0(x7) */ + +DECL(randomx_riscv64_ssh_end): + nop diff --git a/src/RandomX/src/jit_compiler_rv64_static.hpp b/src/RandomX/src/jit_compiler_rv64_static.hpp new file mode 100644 index 000000000..656623c74 --- /dev/null +++ b/src/RandomX/src/jit_compiler_rv64_static.hpp @@ -0,0 +1,53 @@ +/* +Copyright (c) 2023 tevador + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+ * Neither the name of the copyright holder nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#pragma once + +extern "C" { + void randomx_riscv64_literals(); + void randomx_riscv64_literals_end(); + void randomx_riscv64_data_init(); + void randomx_riscv64_fix_data_call(); + void randomx_riscv64_prologue(); + void randomx_riscv64_loop_begin(); + void randomx_riscv64_data_read(); + void randomx_riscv64_data_read_light(); + void randomx_riscv64_fix_loop_call(); + void randomx_riscv64_spad_store(); + void randomx_riscv64_spad_store_hardaes(); + void randomx_riscv64_spad_store_softaes(); + void randomx_riscv64_loop_end(); + void randomx_riscv64_fix_continue_loop(); + void randomx_riscv64_epilogue(); + void randomx_riscv64_softaes(); + void randomx_riscv64_program_end(); + void randomx_riscv64_ssh_init(); + void randomx_riscv64_ssh_load(); + void randomx_riscv64_ssh_prefetch(); + void randomx_riscv64_ssh_end(); +} diff --git a/src/RandomX/src/jit_compiler_x86.cpp b/src/RandomX/src/jit_compiler_x86.cpp index 5587e6afb..785ce5f59 100644 --- a/src/RandomX/src/jit_compiler_x86.cpp +++ 
b/src/RandomX/src/jit_compiler_x86.cpp @@ -34,7 +34,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "superscalar.hpp" #include "program.hpp" #include "reciprocal.h" -#include "virtual_memory.hpp" +#include "virtual_memory.h" namespace randomx { /* @@ -618,7 +618,7 @@ namespace randomx { } void JitCompilerX86::h_IMUL_RCP(Instruction& instr, int i) { - uint64_t divisor = instr.getImm32(); + const uint32_t divisor = instr.getImm32(); if (!isZeroOrPowerOf2(divisor)) { registerUsage[instr.dst] = i; emit(MOV_RAX_I); diff --git a/src/RandomX/src/randomx.cpp b/src/RandomX/src/randomx.cpp index 7daaa46df..a08968e6a 100644 --- a/src/RandomX/src/randomx.cpp +++ b/src/RandomX/src/randomx.cpp @@ -36,7 +36,13 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "cpu.hpp" #include #include + +#if defined(__SSE__) || defined(__SSE2__) || (defined(_M_IX86_FP) && (_M_IX86_FP > 0)) +#define USE_CSR_INTRINSICS +#include +#else #include +#endif extern "C" { @@ -356,8 +362,14 @@ extern "C" { assert(machine != nullptr); assert(inputSize == 0 || input != nullptr); assert(output != nullptr); + +#ifdef USE_CSR_INTRINSICS + const unsigned int fpstate = _mm_getcsr(); +#else fenv_t fpstate; fegetenv(&fpstate); +#endif + alignas(16) uint64_t tempHash[8]; int blakeResult = blake2b(tempHash, sizeof(tempHash), input, inputSize, nullptr, 0); assert(blakeResult == 0); @@ -370,7 +382,12 @@ extern "C" { } machine->run(&tempHash); machine->getFinalResult(output, RANDOMX_HASH_SIZE); + +#ifdef USE_CSR_INTRINSICS + _mm_setcsr(fpstate); +#else fesetenv(&fpstate); +#endif } void randomx_calculate_hash_first(randomx_vm* machine, const void* input, size_t inputSize) { @@ -400,4 +417,15 @@ extern "C" { machine->run(machine->tempHash); machine->getFinalResult(output, RANDOMX_HASH_SIZE); } + + void randomx_calculate_commitment(const void* input, size_t inputSize, const void* hash_in, void* com_out) { + assert(inputSize == 0 || input != nullptr); + 
assert(hash_in != nullptr); + assert(com_out != nullptr); + blake2b_state state; + blake2b_init(&state, RANDOMX_HASH_SIZE); + blake2b_update(&state, input, inputSize); + blake2b_update(&state, hash_in, RANDOMX_HASH_SIZE); + blake2b_final(&state, com_out, RANDOMX_HASH_SIZE); + } } diff --git a/src/RandomX/src/randomx.h b/src/RandomX/src/randomx.h index 64d18068b..313bcd2e0 100644 --- a/src/RandomX/src/randomx.h +++ b/src/RandomX/src/randomx.h @@ -260,6 +260,17 @@ RANDOMX_EXPORT void randomx_calculate_hash_first(randomx_vm* machine, const void RANDOMX_EXPORT void randomx_calculate_hash_next(randomx_vm* machine, const void* nextInput, size_t nextInputSize, void* output); RANDOMX_EXPORT void randomx_calculate_hash_last(randomx_vm* machine, void* output); +/** + * Calculate a RandomX commitment from a RandomX hash and its input. + * + * @param input is a pointer to memory that was hashed. Must not be NULL. + * @param inputSize is the number of bytes in the input. + * @param hash_in is the output from randomx_calculate_hash* (RANDOMX_HASH_SIZE bytes). + * @param com_out is a pointer to memory where the commitment will be stored. Must not + * be NULL and at least RANDOMX_HASH_SIZE bytes must be available for writing. +*/ +RANDOMX_EXPORT void randomx_calculate_commitment(const void* input, size_t inputSize, const void* hash_in, void* com_out); + #if defined(__cplusplus) } #endif diff --git a/src/RandomX/src/reciprocal.c b/src/RandomX/src/reciprocal.c index 22620f53a..074d1846b 100644 --- a/src/RandomX/src/reciprocal.c +++ b/src/RandomX/src/reciprocal.c @@ -44,36 +44,28 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
ret */ -uint64_t randomx_reciprocal(uint64_t divisor) { +uint64_t randomx_reciprocal(uint32_t divisor) { assert(divisor != 0); const uint64_t p2exp63 = 1ULL << 63; + const uint64_t q = p2exp63 / divisor; + const uint64_t r = p2exp63 % divisor; + +#ifdef __GNUC__ + const uint32_t shift = 64 - __builtin_clzll(divisor); +#else + uint32_t shift = 32; + for (uint32_t k = 1U << 31; (k & divisor) == 0; k >>= 1) + --shift; +#endif - uint64_t quotient = p2exp63 / divisor, remainder = p2exp63 % divisor; - - unsigned bsr = 0; //highest set bit in divisor - - for (uint64_t bit = divisor; bit > 0; bit >>= 1) - bsr++; - - for (unsigned shift = 0; shift < bsr; shift++) { - if (remainder >= divisor - remainder) { - quotient = quotient * 2 + 1; - remainder = remainder * 2 - divisor; - } - else { - quotient = quotient * 2; - remainder = remainder * 2; - } - } - - return quotient; + return (q << shift) + ((r << shift) / divisor); } #if !RANDOMX_HAVE_FAST_RECIPROCAL -uint64_t randomx_reciprocal_fast(uint64_t divisor) { +uint64_t randomx_reciprocal_fast(uint32_t divisor) { return randomx_reciprocal(divisor); } diff --git a/src/RandomX/src/reciprocal.h b/src/RandomX/src/reciprocal.h index 8858df2b8..90bd9b6be 100644 --- a/src/RandomX/src/reciprocal.h +++ b/src/RandomX/src/reciprocal.h @@ -40,8 +40,8 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
extern "C" { #endif -uint64_t randomx_reciprocal(uint64_t); -uint64_t randomx_reciprocal_fast(uint64_t); +uint64_t randomx_reciprocal(uint32_t); +uint64_t randomx_reciprocal_fast(uint32_t); #if defined(__cplusplus) } diff --git a/src/RandomX/src/tests/benchmark.cpp b/src/RandomX/src/tests/benchmark.cpp index 36b0259b6..148521a51 100644 --- a/src/RandomX/src/tests/benchmark.cpp +++ b/src/RandomX/src/tests/benchmark.cpp @@ -96,6 +96,7 @@ void printUsage(const char* executable) { std::cout << " --avx2 use optimized Argon2 for AVX2 CPUs" << std::endl; std::cout << " --auto select the best options for the current CPU" << std::endl; std::cout << " --noBatch calculate hashes one by one (default: batch)" << std::endl; + std::cout << " --commit calculate commitments instead of hashes (default: hashes)" << std::endl; } struct MemoryException : public std::exception { @@ -113,7 +114,7 @@ struct DatasetAllocException : public MemoryException { using MineFunc = void(randomx_vm * vm, std::atomic & atomicNonce, AtomicHash & result, uint32_t noncesCount, int thread, int cpuid); -template +template void mine(randomx_vm* vm, std::atomic& atomicNonce, AtomicHash& result, uint32_t noncesCount, int thread, int cpuid = -1) { if (cpuid >= 0) { int rc = set_thread_affinity(cpuid); @@ -138,6 +139,9 @@ void mine(randomx_vm* vm, std::atomic& atomicNonce, AtomicHash& result } store32(noncePtr, nonce); (batch ? 
randomx_calculate_hash_next : randomx_calculate_hash)(vm, blockTemplate, sizeof(blockTemplate), &hash); + if (commit) { + randomx_calculate_commitment(blockTemplate, sizeof(blockTemplate), &hash, &hash); + } result.xorWith(hash); if (!batch) { nonce = atomicNonce.fetch_add(1); @@ -146,7 +150,7 @@ void mine(randomx_vm* vm, std::atomic& atomicNonce, AtomicHash& result } int main(int argc, char** argv) { - bool softAes, miningMode, verificationMode, help, largePages, jit, secure; + bool softAes, miningMode, verificationMode, help, largePages, jit, secure, commit; bool ssse3, avx2, autoFlags, noBatch; int noncesCount, threadCount, initThreadCount; uint64_t threadAffinity; @@ -172,10 +176,11 @@ int main(int argc, char** argv) { readOption("--avx2", argc, argv, avx2); readOption("--auto", argc, argv, autoFlags); readOption("--noBatch", argc, argv, noBatch); + readOption("--commit", argc, argv, commit); store32(&seed, seedValue); - std::cout << "RandomX benchmark v1.1.11" << std::endl; + std::cout << "RandomX benchmark v1.2.1" << std::endl; if (help) { printUsage(argv[0]); @@ -280,11 +285,24 @@ int main(int argc, char** argv) { MineFunc* func; if (noBatch) { - func = &mine; + if (commit) { + std::cout << " - hash commitments" << std::endl; + func = &mine; + } + else { + func = &mine; + } } else { - func = &mine; - std::cout << " - batch mode" << std::endl; + if (commit) { + //TODO: support batch mode with commitments + std::cout << " - hash commitments" << std::endl; + func = &mine; + } + else { + std::cout << " - batch mode" << std::endl; + func = &mine; + } } std::cout << "Initializing"; @@ -376,7 +394,7 @@ int main(int argc, char** argv) { randomx_release_cache(cache); std::cout << "Calculated result: "; result.print(std::cout); - if (noncesCount == 1000 && seedValue == 0) + if (noncesCount == 1000 && seedValue == 0 && !commit) std::cout << "Reference result: 10b649a3f15c7c7f88277812f2e74b337a0f20ce909af09199cccb960771cfa1" << std::endl; if (!miningMode) { std::cout << 
"Performance: " << 1000 * elapsed / noncesCount << " ms per hash" << std::endl; diff --git a/src/RandomX/src/tests/perf-simulation.cpp b/src/RandomX/src/tests/perf-simulation.cpp index 1068a40ef..27f34d8c4 100644 --- a/src/RandomX/src/tests/perf-simulation.cpp +++ b/src/RandomX/src/tests/perf-simulation.cpp @@ -477,7 +477,7 @@ int analyze(randomx::Program& p) { } if (opcode < randomx::ceil_IMUL_RCP) { - uint64_t divisor = instr.getImm32(); + const uint32_t divisor = instr.getImm32(); if (!randomx::isZeroOrPowerOf2(divisor)) { instr.dst = instr.dst % randomx::RegistersCount; instr.opcode |= DST_INT; diff --git a/src/RandomX/src/tests/riscv64_zba.s b/src/RandomX/src/tests/riscv64_zba.s new file mode 100644 index 000000000..e1947e7a6 --- /dev/null +++ b/src/RandomX/src/tests/riscv64_zba.s @@ -0,0 +1,9 @@ +/* RISC-V - test if the Zba extension is present */ + +.text +.global main + +main: + sh1add x6, x6, x7 + li x10, 0 + ret diff --git a/src/RandomX/src/tests/riscv64_zbb.s b/src/RandomX/src/tests/riscv64_zbb.s new file mode 100644 index 000000000..d922043f0 --- /dev/null +++ b/src/RandomX/src/tests/riscv64_zbb.s @@ -0,0 +1,9 @@ +/* RISC-V - test if the Zbb extension is present */ + +.text +.global main + +main: + ror x6, x6, x7 + li x10, 0 + ret diff --git a/src/RandomX/src/tests/tests.cpp b/src/RandomX/src/tests/tests.cpp index 412585b1d..5e1b41a38 100644 --- a/src/RandomX/src/tests/tests.cpp +++ b/src/RandomX/src/tests/tests.cpp @@ -34,6 +34,14 @@ void calcStringHash(const char(&key)[K], const char(&input)[H], void* output) { randomx_calculate_hash(vm, input, H - 1, output); } +template +void calcStringCommitment(const char(&key)[K], const char(&input)[H], void* output) { + initCache(key); + assert(vm != nullptr); + randomx_calculate_hash(vm, input, H - 1, output); + randomx_calculate_commitment(input, H - 1, output, output); +} + template void calcHexHash(const char(&key)[K], const char(&hex)[H], void* output) { initCache(key); @@ -1082,6 +1090,22 @@ int main() { 
assert(rx_get_rounding_mode() == RoundToNearest); }); + if (RANDOMX_HAVE_COMPILER) { + randomx_destroy_vm(vm); + vm = nullptr; +#ifdef RANDOMX_FORCE_SECURE + vm = randomx_create_vm(RANDOMX_FLAG_DEFAULT | RANDOMX_FLAG_SECURE, cache, nullptr); +#else + vm = randomx_create_vm(RANDOMX_FLAG_DEFAULT, cache, nullptr); +#endif + } + + runTest("Commitment test", stringsEqual(RANDOMX_ARGON_SALT, "RandomX\x03"), []() { + char hash[RANDOMX_HASH_SIZE]; + calcStringCommitment("test key 000", "This is a test", &hash); + assert(equalsHex(hash, "d53ccf348b75291b7be76f0a7ac8208bbced734b912f6fca60539ab6f86be919")); + }); + randomx_destroy_vm(vm); vm = nullptr; diff --git a/src/RandomX/src/virtual_memory.cpp b/src/RandomX/src/virtual_memory.c similarity index 54% rename from src/RandomX/src/virtual_memory.cpp rename to src/RandomX/src/virtual_memory.c index 248d3a2c4..d2cdcda0f 100644 --- a/src/RandomX/src/virtual_memory.cpp +++ b/src/RandomX/src/virtual_memory.c @@ -26,28 +26,24 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -#include "virtual_memory.hpp" - -#include - #if defined(_WIN32) || defined(__CYGWIN__) #include #else +#define _GNU_SOURCE 1 /* needed for MAP_ANONYMOUS on older platforms */ #ifdef __APPLE__ #include #include #include # if TARGET_OS_OSX -# if TARGET_CPU_ARM64 -# define USE_PTHREAD_JIT_WP 1 -# else -# undef USE_PTHREAD_JIT_WP -# endif +# define USE_PTHREAD_JIT_WP 1 # include +# include +# include # endif #endif #include #include +#include #ifndef MAP_ANONYMOUS #define MAP_ANONYMOUS MAP_ANON #endif @@ -57,27 +53,50 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define PAGE_EXECUTE_READWRITE (PROT_READ | PROT_WRITE | PROT_EXEC) #endif -#if defined(_WIN32) || defined(__CYGWIN__) -std::string getErrorMessage(const char* function) { - LPSTR messageBuffer = nullptr; - size_t size = FormatMessageA(FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS, - NULL, GetLastError(), MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), (LPSTR)&messageBuffer, 0, NULL); - std::string message(messageBuffer, size); - LocalFree(messageBuffer); - return std::string(function) + std::string(": ") + message; +#include "virtual_memory.h" + +#if defined(USE_PTHREAD_JIT_WP) && defined(MAC_OS_VERSION_11_0) \ + && MAC_OS_X_VERSION_MAX_ALLOWED >= MAC_OS_VERSION_11_0 +static int MacOSchecked, MacOSver; +/* This function is used implicitly by clang's __builtin_available() checker. + * When cross-compiling, the library containing this function doesn't exist, + * and linking will fail because the symbol is unresolved. The function here + * is a quick and dirty hack to get close enough to identify MacOSX 11.0. 
+ */ +static int32_t __isOSVersionAtLeast(int32_t major, int32_t minor, int32_t subminor) { + if (!MacOSchecked) { + struct utsname ut; + int mmaj, mmin; + uname(&ut); + sscanf(ut.release, "%d.%d", &mmaj, &mmin); + // The utsname release version is 9 greater than the canonical OS version + mmaj -= 9; + MacOSver = (mmaj << 8) | mmin; + MacOSchecked = 1; + } + return MacOSver >= ((major << 8) | minor); } +#endif + -void setPrivilege(const char* pszPrivilege, BOOL bEnable) { +#if defined(_WIN32) || defined(__CYGWIN__) +#define Fail(func) do {*errfunc = func; return GetLastError();} while(0) +int setPrivilege(const char* pszPrivilege, BOOL bEnable, char **errfunc) { HANDLE hToken; TOKEN_PRIVILEGES tp; BOOL status; - DWORD error; + DWORD error = 0; + + *errfunc = NULL; if (!OpenProcessToken(GetCurrentProcess(), TOKEN_ADJUST_PRIVILEGES | TOKEN_QUERY, &hToken)) - throw std::runtime_error(getErrorMessage("OpenProcessToken")); + Fail("OpenProcessToken"); - if (!LookupPrivilegeValue(NULL, pszPrivilege, &tp.Privileges[0].Luid)) - throw std::runtime_error(getErrorMessage("LookupPrivilegeValue")); + if (!LookupPrivilegeValue(NULL, pszPrivilege, &tp.Privileges[0].Luid)) { + *errfunc = "LookupPrivilegeValue"; + error = GetLastError(); + goto out; + } tp.PrivilegeCount = 1; @@ -89,20 +108,28 @@ void setPrivilege(const char* pszPrivilege, BOOL bEnable) { status = AdjustTokenPrivileges(hToken, FALSE, &tp, 0, (PTOKEN_PRIVILEGES)NULL, 0); error = GetLastError(); - if (!status || (error != ERROR_SUCCESS)) - throw std::runtime_error(getErrorMessage("AdjustTokenPrivileges")); + if (!status || (error != ERROR_SUCCESS)) { + *errfunc = "AdjustTokenPrivileges"; + goto out; + } - if (!CloseHandle(hToken)) - throw std::runtime_error(getErrorMessage("CloseHandle")); +out: + if (!CloseHandle(hToken)) { + if (*errfunc == NULL) { + *errfunc = "CloseHandle"; + error = GetLastError(); + } + } + return error; } +#else +#define Fail(func) do {*errfunc = func; return errno;} while(0) #endif -void* 
allocMemoryPages(std::size_t bytes) { +void* allocMemoryPages(size_t bytes) { void* mem; #if defined(_WIN32) || defined(__CYGWIN__) - mem = VirtualAlloc(nullptr, bytes, MEM_COMMIT, PAGE_READWRITE); - if (mem == nullptr) - throw std::runtime_error(getErrorMessage("allocMemoryPages - VirtualAlloc")); + mem = VirtualAlloc(NULL, bytes, MEM_COMMIT, PAGE_READWRITE); #else #if defined(__NetBSD__) #define RESERVED_FLAGS PROT_MPROTECT(PROT_EXEC) @@ -116,89 +143,95 @@ void* allocMemoryPages(std::size_t bytes) { #define MEXTRA 0 #define PEXTRA 0 #endif - mem = mmap(nullptr, bytes, PAGE_READWRITE | RESERVED_FLAGS | PEXTRA, MAP_ANONYMOUS | MAP_PRIVATE | MEXTRA, -1, 0); + mem = mmap(NULL, bytes, PAGE_READWRITE | RESERVED_FLAGS | PEXTRA, MAP_ANONYMOUS | MAP_PRIVATE | MEXTRA, -1, 0); if (mem == MAP_FAILED) - throw std::runtime_error("allocMemoryPages - mmap failed"); + mem = NULL; #if defined(USE_PTHREAD_JIT_WP) && defined(MAC_OS_VERSION_11_0) \ && MAC_OS_X_VERSION_MAX_ALLOWED >= MAC_OS_VERSION_11_0 if (__builtin_available(macOS 11.0, *)) { - pthread_jit_write_protect_np(false); + pthread_jit_write_protect_np(0); } #endif #endif return mem; } -static inline void pageProtect(void* ptr, std::size_t bytes, int rules) { +static inline int pageProtect(void* ptr, size_t bytes, int rules, char **errfunc) { #if defined(_WIN32) || defined(__CYGWIN__) DWORD oldp; if (!VirtualProtect(ptr, bytes, (DWORD)rules, &oldp)) { - throw std::runtime_error(getErrorMessage("VirtualProtect")); + Fail("VirtualProtect"); } #else if (-1 == mprotect(ptr, bytes, rules)) - throw std::runtime_error("mprotect failed"); + Fail("mprotect"); #endif + return 0; } -void setPagesRW(void* ptr, std::size_t bytes) { +void setPagesRW(void* ptr, size_t bytes) { + char *errfunc; #if defined(USE_PTHREAD_JIT_WP) && defined(MAC_OS_VERSION_11_0) \ && MAC_OS_X_VERSION_MAX_ALLOWED >= MAC_OS_VERSION_11_0 if (__builtin_available(macOS 11.0, *)) { - pthread_jit_write_protect_np(false); + pthread_jit_write_protect_np(0); } else { - 
pageProtect(ptr, bytes, PAGE_READWRITE); + pageProtect(ptr, bytes, PAGE_READWRITE, &errfunc); } #else - pageProtect(ptr, bytes, PAGE_READWRITE); + pageProtect(ptr, bytes, PAGE_READWRITE, &errfunc); #endif } -void setPagesRX(void* ptr, std::size_t bytes) { +void setPagesRX(void* ptr, size_t bytes) { + char *errfunc; #if defined(USE_PTHREAD_JIT_WP) && defined(MAC_OS_VERSION_11_0) \ && MAC_OS_X_VERSION_MAX_ALLOWED >= MAC_OS_VERSION_11_0 if (__builtin_available(macOS 11.0, *)) { - pthread_jit_write_protect_np(true); + pthread_jit_write_protect_np(1); + __builtin___clear_cache((char*)ptr, ((char*)ptr) + bytes); } else { - pageProtect(ptr, bytes, PAGE_EXECUTE_READ); + pageProtect(ptr, bytes, PAGE_EXECUTE_READ, &errfunc); } #else - pageProtect(ptr, bytes, PAGE_EXECUTE_READ); + pageProtect(ptr, bytes, PAGE_EXECUTE_READ, &errfunc); #endif } -void setPagesRWX(void* ptr, std::size_t bytes) { - pageProtect(ptr, bytes, PAGE_EXECUTE_READWRITE); +void setPagesRWX(void* ptr, size_t bytes) { + char *errfunc; + pageProtect(ptr, bytes, PAGE_EXECUTE_READWRITE, &errfunc); } -void* allocLargePagesMemory(std::size_t bytes) { +void* allocLargePagesMemory(size_t bytes) { void* mem; + char *errfunc; #if defined(_WIN32) || defined(__CYGWIN__) - setPrivilege("SeLockMemoryPrivilege", 1); - auto pageMinimum = GetLargePageMinimum(); - if (pageMinimum > 0) - mem = VirtualAlloc(NULL, alignSize(bytes, pageMinimum), MEM_COMMIT | MEM_RESERVE | MEM_LARGE_PAGES, PAGE_READWRITE); - else - throw std::runtime_error("allocLargePagesMemory - Large pages are not supported"); - if (mem == nullptr) - throw std::runtime_error(getErrorMessage("allocLargePagesMemory - VirtualAlloc")); + if (setPrivilege("SeLockMemoryPrivilege", 1, &errfunc)) + return NULL; + size_t pageMinimum = GetLargePageMinimum(); + if (!pageMinimum) { + errfunc = "No large pages"; + return NULL; + } + mem = VirtualAlloc(NULL, alignSize(bytes, pageMinimum), MEM_COMMIT | MEM_RESERVE | MEM_LARGE_PAGES, PAGE_READWRITE); #else #ifdef __APPLE__ - 
mem = mmap(nullptr, bytes, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, VM_FLAGS_SUPERPAGE_SIZE_2MB, 0); + mem = mmap(NULL, bytes, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, VM_FLAGS_SUPERPAGE_SIZE_2MB, 0); #elif defined(__FreeBSD__) - mem = mmap(nullptr, bytes, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS | MAP_ALIGNED_SUPER, -1, 0); + mem = mmap(NULL, bytes, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS | MAP_ALIGNED_SUPER, -1, 0); #elif defined(__OpenBSD__) || defined(__NetBSD__) mem = MAP_FAILED; // OpenBSD does not support huge pages #else - mem = mmap(nullptr, bytes, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB | MAP_POPULATE, -1, 0); + mem = mmap(NULL, bytes, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB | MAP_POPULATE, -1, 0); #endif if (mem == MAP_FAILED) - throw std::runtime_error("allocLargePagesMemory - mmap failed"); + mem = NULL; #endif return mem; } -void freePagedMemory(void* ptr, std::size_t bytes) { +void freePagedMemory(void* ptr, size_t bytes) { #if defined(_WIN32) || defined(__CYGWIN__) VirtualFree(ptr, 0, MEM_RELEASE); #else diff --git a/src/RandomX/src/virtual_memory.hpp b/src/RandomX/src/virtual_memory.h similarity index 80% rename from src/RandomX/src/virtual_memory.hpp rename to src/RandomX/src/virtual_memory.h index 9e8bc29ab..5e8e31d53 100644 --- a/src/RandomX/src/virtual_memory.hpp +++ b/src/RandomX/src/virtual_memory.h @@ -28,15 +28,21 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#pragma once -#include +#ifdef __cplusplus +extern "C" { +#endif -constexpr std::size_t alignSize(std::size_t pos, std::size_t align) { - return ((pos - 1) / align + 1) * align; -} +#include + +#define alignSize(pos, align) (((pos - 1) / align + 1) * align) -void* allocMemoryPages(std::size_t); -void setPagesRW(void*, std::size_t); -void setPagesRX(void*, std::size_t); -void setPagesRWX(void*, std::size_t); -void* allocLargePagesMemory(std::size_t); -void freePagedMemory(void*, std::size_t); +void* allocMemoryPages(size_t); +void setPagesRW(void*, size_t); +void setPagesRX(void*, size_t); +void setPagesRWX(void*, size_t); +void* allocLargePagesMemory(size_t); +void freePagedMemory(void*, size_t); + +#ifdef __cplusplus +} +#endif diff --git a/src/RandomX/vcxproj/randomx-dll.vcxproj b/src/RandomX/vcxproj/randomx-dll.vcxproj index 8b8ea8c08..4eaae9bed 100644 --- a/src/RandomX/vcxproj/randomx-dll.vcxproj +++ b/src/RandomX/vcxproj/randomx-dll.vcxproj @@ -43,7 +43,7 @@ - + @@ -74,7 +74,7 @@ - + diff --git a/src/RandomX/vcxproj/randomx-dll.vcxproj.filters b/src/RandomX/vcxproj/randomx-dll.vcxproj.filters index 68e1b8559..5b51f9f72 100644 --- a/src/RandomX/vcxproj/randomx-dll.vcxproj.filters +++ b/src/RandomX/vcxproj/randomx-dll.vcxproj.filters @@ -87,7 +87,7 @@ Header Files - + Header Files @@ -151,7 +151,7 @@ Source Files - + Source Files diff --git a/src/RandomX/vcxproj/randomx.vcxproj b/src/RandomX/vcxproj/randomx.vcxproj index e0625c88b..cefdc8fb3 100644 --- a/src/RandomX/vcxproj/randomx.vcxproj +++ b/src/RandomX/vcxproj/randomx.vcxproj @@ -156,7 +156,7 @@ SET ERRORLEVEL = 0 - + @@ -198,7 +198,7 @@ SET ERRORLEVEL = 0 - + diff --git a/src/RandomX/vcxproj/randomx.vcxproj.filters b/src/RandomX/vcxproj/randomx.vcxproj.filters index eb4462a59..7f055b5b8 100644 --- a/src/RandomX/vcxproj/randomx.vcxproj.filters +++ b/src/RandomX/vcxproj/randomx.vcxproj.filters @@ -72,7 +72,7 @@ Source Files - + Source Files @@ -164,7 +164,7 @@ Header Files - + Header Files