Browse Source

cn/r ASM support for --av 1.

classic-dev
XMRig 5 years ago
parent
commit
c5cbd9d8fe
  1. 12
      algo/cryptonight/cryptonight.c
  2. 10
      algo/cryptonight/cryptonight.h
  3. 42
      algo/cryptonight/cryptonight_r_av1.c
  4. 1
      algo/cryptonight/variant4_random_math.h
  5. 16
      cmake/asm.cmake
  6. 146
      crypto/CryptonightR_gen.c
  7. 279
      crypto/asm/CryptonightR_soft_aes_template.inc
  8. 1593
      crypto/asm/CryptonightR_template.S
  9. 1060
      crypto/asm/CryptonightR_template.h
  10. 529
      crypto/asm/CryptonightR_template.inc
  11. 36
      crypto/asm/cn_main_loop.asm
  12. 19
      memory.c
  13. 14
      persistent_memory.h
  14. 24
      unix/memory_unix.c
  15. 27
      win/memory_win.c

12
algo/cryptonight/cryptonight.c

@ -39,9 +39,10 @@
#include "crypto/c_groestl.h"
#include "crypto/c_jh.h"
#include "crypto/c_skein.h"
#include "cryptonight.h"
#include "cryptonight_test.h"
#include "cryptonight.h"
#include "options.h"
#include "persistent_memory.h"
static cn_hash_fun asm_func_map[AV_MAX][VARIANT_MAX][ASM_MAX] = {};
@ -83,6 +84,9 @@ void cryptonight_single_hash_asm_intel(const uint8_t *input, size_t size, uint8_
void cryptonight_single_hash_asm_ryzen(const uint8_t *input, size_t size, uint8_t *output, struct cryptonight_ctx **ctx);
void cryptonight_single_hash_asm_bulldozer(const uint8_t *input, size_t size, uint8_t *output, struct cryptonight_ctx **ctx);
void cryptonight_double_hash_asm(const uint8_t *input, size_t size, uint8_t *output, struct cryptonight_ctx **ctx);
void cryptonight_r_av1_asm_intel(const uint8_t *input, size_t size, uint8_t *output, struct cryptonight_ctx **ctx);
void cryptonight_r_av1_asm_bulldozer(const uint8_t *input, size_t size, uint8_t *output, struct cryptonight_ctx **ctx);
#endif
@ -150,6 +154,8 @@ static bool self_test() {
for (size_t i = 0; i < count; ++i) {
ctx[i] = _mm_malloc(sizeof(struct cryptonight_ctx), 16);
ctx[i]->memory = _mm_malloc(size, 16);
init_cn_r(ctx[i]);
}
if (opt_algo == ALGO_CRYPTONIGHT) {
@ -288,6 +294,10 @@ bool cryptonight_init(int av)
asm_func_map[AV_DOUBLE][VARIANT_2][ASM_INTEL] = cryptonight_double_hash_asm;
asm_func_map[AV_DOUBLE][VARIANT_2][ASM_RYZEN] = cryptonight_double_hash_asm;
asm_func_map[AV_DOUBLE][VARIANT_2][ASM_BULLDOZER] = cryptonight_double_hash_asm;
asm_func_map[AV_SINGLE][VARIANT_4][ASM_INTEL] = cryptonight_r_av1_asm_intel;
asm_func_map[AV_SINGLE][VARIANT_4][ASM_RYZEN] = cryptonight_r_av1_asm_intel;
asm_func_map[AV_SINGLE][VARIANT_4][ASM_BULLDOZER] = cryptonight_r_av1_asm_bulldozer;
# endif
return self_test();

10
algo/cryptonight/cryptonight.h

@ -51,12 +51,6 @@ typedef void(*cn_mainloop_fun_ms_abi)(struct cryptonight_ctx*) ABI_ATTRIBUTE;
typedef void(*cn_mainloop_double_fun_ms_abi)(struct cryptonight_ctx*, struct cryptonight_ctx*) ABI_ATTRIBUTE;
struct cryptonight_r_data {
int variant;
uint64_t height;
};
struct cryptonight_ctx {
uint8_t state[224] __attribute__((aligned(16)));
uint8_t *memory __attribute__((aligned(16)));
@ -66,8 +60,8 @@ struct cryptonight_ctx {
cn_mainloop_fun_ms_abi generated_code;
cn_mainloop_double_fun_ms_abi generated_code_double;
struct cryptonight_r_data generated_code_data;
struct cryptonight_r_data generated_code_double_data;
uint64_t generated_code_height;
uint64_t generated_code_double_height;
uint64_t height;
};

42
algo/cryptonight/cryptonight_r_av1.c

@ -97,5 +97,47 @@ void cryptonight_r_av1(const uint8_t *restrict input, size_t size, uint8_t *rest
#ifndef XMRIG_NO_ASM
void v4_compile_code(const struct V4_Instruction* code, int code_size, void* machine_code, enum Assembly ASM);
void cryptonight_r_av1_asm_intel(const uint8_t *restrict input, size_t size, uint8_t *restrict output, struct cryptonight_ctx **restrict ctx)
{
if (ctx[0]->generated_code_height != ctx[0]->height) {
struct V4_Instruction code[256];
const int code_size = v4_random_math_init(code, ctx[0]->height);
v4_compile_code(code, code_size, (void*)(ctx[0]->generated_code), ASM_INTEL);
ctx[0]->generated_code_height = ctx[0]->height;
}
keccak(input, size, ctx[0]->state, 200);
cn_explode_scratchpad((__m128i*) ctx[0]->state, (__m128i*) ctx[0]->memory);
ctx[0]->generated_code(ctx[0]);
cn_implode_scratchpad((__m128i*) ctx[0]->memory, (__m128i*) ctx[0]->state);
keccakf((uint64_t*) ctx[0]->state, 24);
extra_hashes[ctx[0]->state[0] & 3](ctx[0]->state, 200, output);
}
void cryptonight_r_av1_asm_bulldozer(const uint8_t *restrict input, size_t size, uint8_t *restrict output, struct cryptonight_ctx **restrict ctx)
{
if (ctx[0]->generated_code_height != ctx[0]->height) {
struct V4_Instruction code[256];
const int code_size = v4_random_math_init(code, ctx[0]->height);
v4_compile_code(code, code_size, (void*)(ctx[0]->generated_code), ASM_BULLDOZER);
ctx[0]->generated_code_height = ctx[0]->height;
}
keccak(input, size, ctx[0]->state, 200);
cn_explode_scratchpad((__m128i*) ctx[0]->state, (__m128i*) ctx[0]->memory);
ctx[0]->generated_code(ctx[0]);
cn_implode_scratchpad((__m128i*) ctx[0]->memory, (__m128i*) ctx[0]->state);
keccakf((uint64_t*) ctx[0]->state, 24);
extra_hashes[ctx[0]->state[0] & 3](ctx[0]->state, 200, output);
}
#endif

1
algo/cryptonight/variant4_random_math.h

@ -4,6 +4,7 @@
#include <stdint.h>
#include <string.h>
#include <stdbool.h>
#include "crypto/c_blake256.h"

16
cmake/asm.cmake

@ -4,15 +4,21 @@ if (WITH_ASM AND NOT XMRIG_ARM AND CMAKE_SIZEOF_VOID_P EQUAL 8)
enable_language(ASM)
if (WIN32 AND CMAKE_C_COMPILER_ID MATCHES GNU)
set(XMRIG_ASM_FILE "crypto/asm/win64/cn_main_loop.S")
set(XMRIG_ASM_FILES
"crypto/asm/win64/cn_main_loop.S"
"crypto/asm/CryptonightR_template.S"
)
else()
set(XMRIG_ASM_FILE "crypto/asm/cn_main_loop.S")
set(XMRIG_ASM_FILES
"crypto/asm/cn_main_loop.S"
"crypto/asm/CryptonightR_template.S"
)
endif()
set_property(SOURCE ${XMRIG_ASM_FILE} PROPERTY C)
set_property(SOURCE ${XMRIG_ASM_FILES} PROPERTY C)
add_library(${XMRIG_ASM_LIBRARY} STATIC ${XMRIG_ASM_FILE})
set(XMRIG_ASM_SOURCES "")
add_library(${XMRIG_ASM_LIBRARY} STATIC ${XMRIG_ASM_FILES})
set(XMRIG_ASM_SOURCES "crypto/CryptonightR_gen.c")
set_property(TARGET ${XMRIG_ASM_LIBRARY} PROPERTY LINKER_LANGUAGE C)
else()
set(XMRIG_ASM_SOURCES "")

146
crypto/CryptonightR_gen.c

@ -0,0 +1,146 @@
/* XMRig
* Copyright 2010 Jeff Garzik <jgarzik@pobox.com>
* Copyright 2012-2014 pooler <pooler@litecoinpool.org>
* Copyright 2014 Lucas Jones <https://github.com/lucasjones>
* Copyright 2014-2016 Wolf9466 <https://github.com/OhGodAPet>
* Copyright 2016 Jay D Dee <jayddee246@gmail.com>
* Copyright 2017-2018 XMR-Stak <https://github.com/fireice-uk>, <https://github.com/psychocrypt>
* Copyright 2018 Lee Clagett <https://github.com/vtnerd>
* Copyright 2018-2019 SChernykh <https://github.com/SChernykh>
* Copyright 2016-2019 XMRig <https://github.com/xmrig>, <support@xmrig.com>
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#include <string.h>
#include "algo/cryptonight/cryptonight_monero.h"
#include "crypto/asm/CryptonightR_template.h"
#include "persistent_memory.h"
static inline void add_code(uint8_t **p, void (*p1)(), void (*p2)())
{
const ptrdiff_t size = (const uint8_t*)(p2) - (const uint8_t*)(p1);
if (size > 0) {
memcpy(*p, (const void *) p1, size);
*p += size;
}
}
static inline void add_random_math(uint8_t **p, const struct V4_Instruction* code, int code_size, const void_func* instructions, const void_func* instructions_mov, bool is_64_bit, enum Assembly ASM)
{
uint32_t prev_rot_src = (uint32_t)(-1);
for (int i = 0;; ++i) {
const struct V4_Instruction inst = code[i];
if (inst.opcode == RET) {
break;
}
uint8_t opcode = (inst.opcode == MUL) ? inst.opcode : (inst.opcode + 2);
uint8_t dst_index = inst.dst_index;
uint8_t src_index = inst.src_index;
const uint32_t a = inst.dst_index;
const uint32_t b = inst.src_index;
const uint8_t c = opcode | (dst_index << V4_OPCODE_BITS) | (((src_index == 8) ? dst_index : src_index) << (V4_OPCODE_BITS + V4_DST_INDEX_BITS));
switch (inst.opcode) {
case ROR:
case ROL:
if (b != prev_rot_src) {
prev_rot_src = b;
add_code(p, instructions_mov[c], instructions_mov[c + 1]);
}
break;
}
if (a == prev_rot_src) {
prev_rot_src = (uint32_t)(-1);
}
void_func begin = instructions[c];
if ((ASM = ASM_BULLDOZER) && (inst.opcode == MUL) && !is_64_bit) {
// AMD Bulldozer has latency 4 for 32-bit IMUL and 6 for 64-bit IMUL
// Always use 32-bit IMUL for AMD Bulldozer in 32-bit mode - skip prefix 0x48 and change 0x49 to 0x41
uint8_t* prefix = (uint8_t*) begin;
if (*prefix == 0x49) {
**p = 0x41;
*p += 1;
}
begin = (void_func)(prefix + 1);
}
add_code(p, begin, instructions[c + 1]);
if (inst.opcode == ADD) {
*(uint32_t*)(*p - sizeof(uint32_t) - (is_64_bit ? 3 : 0)) = inst.C;
if (is_64_bit) {
prev_rot_src = (uint32_t)(-1);
}
}
}
}
void v4_compile_code(const struct V4_Instruction* code, int code_size, void* machine_code, enum Assembly ASM)
{
uint8_t* p0 = machine_code;
uint8_t* p = p0;
add_code(&p, CryptonightR_template_part1, CryptonightR_template_part2);
add_random_math(&p, code, code_size, instructions, instructions_mov, false, ASM);
add_code(&p, CryptonightR_template_part2, CryptonightR_template_part3);
*(int*)(p - 4) = (int)((((const uint8_t*)CryptonightR_template_mainloop) - ((const uint8_t*)CryptonightR_template_part1)) - (p - p0));
add_code(&p, CryptonightR_template_part3, CryptonightR_template_end);
flush_instruction_cache(machine_code, p - p0);
}
void v4_compile_code_double(const struct V4_Instruction* code, int code_size, void* machine_code, enum Assembly ASM)
{
uint8_t* p0 = (uint8_t*) machine_code;
uint8_t* p = p0;
add_code(&p, CryptonightR_template_double_part1, CryptonightR_template_double_part2);
add_random_math(&p, code, code_size, instructions, instructions_mov, false, ASM);
add_code(&p, CryptonightR_template_double_part2, CryptonightR_template_double_part3);
add_random_math(&p, code, code_size, instructions, instructions_mov, false, ASM);
add_code(&p, CryptonightR_template_double_part3, CryptonightR_template_double_part4);
*(int*)(p - 4) = (int)((((const uint8_t*)CryptonightR_template_double_mainloop) - ((const uint8_t*)CryptonightR_template_double_part1)) - (p - p0));
add_code(&p, CryptonightR_template_double_part4, CryptonightR_template_double_end);
flush_instruction_cache(machine_code, p - p0);
}
void v4_soft_aes_compile_code(const struct V4_Instruction* code, int code_size, void* machine_code, enum Assembly ASM)
{
uint8_t* p0 = machine_code;
uint8_t* p = p0;
add_code(&p, CryptonightR_soft_aes_template_part1, CryptonightR_soft_aes_template_part2);
add_random_math(&p, code, code_size, instructions, instructions_mov, false, ASM);
add_code(&p, CryptonightR_soft_aes_template_part2, CryptonightR_soft_aes_template_part3);
*(int*)(p - 4) = (int)((((const uint8_t*)CryptonightR_soft_aes_template_mainloop) - ((const uint8_t*)CryptonightR_soft_aes_template_part1)) - (p - p0));
add_code(&p, CryptonightR_soft_aes_template_part3, CryptonightR_soft_aes_template_end);
flush_instruction_cache(machine_code, p - p0);
}

279
crypto/asm/CryptonightR_soft_aes_template.inc

@ -0,0 +1,279 @@
PUBLIC FN_PREFIX(CryptonightR_soft_aes_template_part1)
PUBLIC FN_PREFIX(CryptonightR_soft_aes_template_mainloop)
PUBLIC FN_PREFIX(CryptonightR_soft_aes_template_part2)
PUBLIC FN_PREFIX(CryptonightR_soft_aes_template_part3)
PUBLIC FN_PREFIX(CryptonightR_soft_aes_template_end)
ALIGN(64)
FN_PREFIX(CryptonightR_soft_aes_template_part1):
mov QWORD PTR [rsp+8], rcx
push rbx
push rbp
push rsi
push rdi
push r12
push r13
push r14
push r15
sub rsp, 232
mov eax, [rcx+96]
mov ebx, [rcx+100]
mov esi, [rcx+104]
mov edx, [rcx+108]
mov [rsp+144], eax
mov [rsp+148], ebx
mov [rsp+152], esi
mov [rsp+156], edx
mov rax, QWORD PTR [rcx+48]
mov r10, rcx
xor rax, QWORD PTR [rcx+16]
mov r8, QWORD PTR [rcx+32]
xor r8, QWORD PTR [rcx]
mov r9, QWORD PTR [rcx+40]
xor r9, QWORD PTR [rcx+8]
movq xmm4, rax
mov rdx, QWORD PTR [rcx+56]
xor rdx, QWORD PTR [rcx+24]
mov r11, QWORD PTR [rcx+224]
mov rcx, QWORD PTR [rcx+88]
xor rcx, QWORD PTR [r10+72]
mov rax, QWORD PTR [r10+80]
movq xmm0, rdx
xor rax, QWORD PTR [r10+64]
movaps XMMWORD PTR [rsp+16], xmm6
movaps XMMWORD PTR [rsp+32], xmm7
movaps XMMWORD PTR [rsp+48], xmm8
movaps XMMWORD PTR [rsp+64], xmm9
movaps XMMWORD PTR [rsp+80], xmm10
movaps XMMWORD PTR [rsp+96], xmm11
movaps XMMWORD PTR [rsp+112], xmm12
movaps XMMWORD PTR [rsp+128], xmm13
movq xmm5, rax
mov rax, r8
punpcklqdq xmm4, xmm0
and eax, 2097136
movq xmm10, QWORD PTR [r10+96]
movq xmm0, rcx
mov rcx, QWORD PTR [r10+104]
xorps xmm9, xmm9
mov QWORD PTR [rsp+328], rax
movq xmm12, r11
mov QWORD PTR [rsp+320], r9
punpcklqdq xmm5, xmm0
movq xmm13, rcx
mov r12d, 524288
ALIGN(64)
FN_PREFIX(CryptonightR_soft_aes_template_mainloop):
movd xmm11, r12d
mov r12, QWORD PTR [r10+272]
lea r13, QWORD PTR [rax+r11]
mov esi, DWORD PTR [r13]
movq xmm0, r9
mov r10d, DWORD PTR [r13+4]
movq xmm7, r8
mov ebp, DWORD PTR [r13+12]
mov r14d, DWORD PTR [r13+8]
mov rdx, QWORD PTR [rsp+328]
movzx ecx, sil
shr esi, 8
punpcklqdq xmm7, xmm0
mov r15d, DWORD PTR [r12+rcx*4]
movzx ecx, r10b
shr r10d, 8
mov edi, DWORD PTR [r12+rcx*4]
movzx ecx, r14b
shr r14d, 8
mov ebx, DWORD PTR [r12+rcx*4]
movzx ecx, bpl
shr ebp, 8
mov r9d, DWORD PTR [r12+rcx*4]
movzx ecx, r10b
shr r10d, 8
xor r15d, DWORD PTR [r12+rcx*4+1024]
movzx ecx, r14b
shr r14d, 8
mov eax, r14d
shr eax, 8
xor edi, DWORD PTR [r12+rcx*4+1024]
add eax, 256
movzx ecx, bpl
shr ebp, 8
xor ebx, DWORD PTR [r12+rcx*4+1024]
movzx ecx, sil
shr esi, 8
xor r9d, DWORD PTR [r12+rcx*4+1024]
add r12, 2048
movzx ecx, r10b
shr r10d, 8
add r10d, 256
mov r11d, DWORD PTR [r12+rax*4]
xor r11d, DWORD PTR [r12+rcx*4]
xor r11d, r9d
movzx ecx, sil
mov r10d, DWORD PTR [r12+r10*4]
shr esi, 8
add esi, 256
xor r10d, DWORD PTR [r12+rcx*4]
movzx ecx, bpl
xor r10d, ebx
shr ebp, 8
movd xmm1, r11d
add ebp, 256
movq r11, xmm12
mov r9d, DWORD PTR [r12+rcx*4]
xor r9d, DWORD PTR [r12+rsi*4]
mov eax, DWORD PTR [r12+rbp*4]
xor r9d, edi
movzx ecx, r14b
movd xmm0, r10d
movd xmm2, r9d
xor eax, DWORD PTR [r12+rcx*4]
mov rcx, rdx
xor eax, r15d
punpckldq xmm2, xmm1
xor rcx, 16
movd xmm6, eax
mov rax, rdx
punpckldq xmm6, xmm0
xor rax, 32
punpckldq xmm6, xmm2
xor rdx, 48
movdqu xmm2, XMMWORD PTR [rcx+r11]
pxor xmm6, xmm2
pxor xmm6, xmm7
paddq xmm2, xmm4
movdqu xmm1, XMMWORD PTR [rax+r11]
movdqu xmm0, XMMWORD PTR [rdx+r11]
pxor xmm6, xmm1
pxor xmm6, xmm0
paddq xmm0, xmm5
movdqu XMMWORD PTR [rcx+r11], xmm0
movdqu XMMWORD PTR [rax+r11], xmm2
movq rcx, xmm13
paddq xmm1, xmm7
movdqu XMMWORD PTR [rdx+r11], xmm1
movq rdi, xmm6
mov r10, rdi
and r10d, 2097136
movdqa xmm0, xmm6
pxor xmm0, xmm4
movdqu XMMWORD PTR [r13], xmm0
mov ebx, [rsp+144]
mov ebp, [rsp+152]
add ebx, [rsp+148]
add ebp, [rsp+156]
shl rbp, 32
or rbx, rbp
xor rbx, QWORD PTR [r10+r11]
lea r14, QWORD PTR [r10+r11]
mov rbp, QWORD PTR [r14+8]
mov [rsp+160], rbx
mov [rsp+168], rdi
mov [rsp+176], rbp
mov [rsp+184], r10
mov r10, rsp
mov ebx, [rsp+144]
mov esi, [rsp+148]
mov edi, [rsp+152]
mov ebp, [rsp+156]
movd esp, xmm7
movaps xmm0, xmm7
psrldq xmm0, 8
movd r15d, xmm0
movd eax, xmm4
movd edx, xmm5
movaps xmm0, xmm5
psrldq xmm0, 8
movd r9d, xmm0
FN_PREFIX(CryptonightR_soft_aes_template_part2):
mov rsp, r10
mov [rsp+144], ebx
mov [rsp+148], esi
mov [rsp+152], edi
mov [rsp+156], ebp
mov edi, edi
shl rbp, 32
or rbp, rdi
xor r8, rbp
mov ebx, ebx
shl rsi, 32
or rsi, rbx
xor QWORD PTR [rsp+320], rsi
mov rbx, [rsp+160]
mov rdi, [rsp+168]
mov rbp, [rsp+176]
mov r10, [rsp+184]
mov r9, r10
xor r9, 16
mov rcx, r10
xor rcx, 32
xor r10, 48
mov rax, rbx
mul rdi
movdqu xmm2, XMMWORD PTR [r9+r11]
movdqu xmm1, XMMWORD PTR [rcx+r11]
pxor xmm6, xmm2
pxor xmm6, xmm1
paddq xmm1, xmm7
add r8, rdx
movdqu xmm0, XMMWORD PTR [r10+r11]
pxor xmm6, xmm0
paddq xmm0, xmm5
paddq xmm2, xmm4
movdqu XMMWORD PTR [r9+r11], xmm0
movdqa xmm5, xmm4
mov r9, QWORD PTR [rsp+320]
movdqa xmm4, xmm6
add r9, rax
movdqu XMMWORD PTR [rcx+r11], xmm2
movdqu XMMWORD PTR [r10+r11], xmm1
mov r10, QWORD PTR [rsp+304]
movd r12d, xmm11
mov QWORD PTR [r14], r8
xor r8, rbx
mov rax, r8
mov QWORD PTR [r14+8], r9
and eax, 2097136
xor r9, rbp
mov QWORD PTR [rsp+320], r9
mov QWORD PTR [rsp+328], rax
sub r12d, 1
jne FN_PREFIX(CryptonightR_soft_aes_template_mainloop)
FN_PREFIX(CryptonightR_soft_aes_template_part3):
movaps xmm6, XMMWORD PTR [rsp+16]
movaps xmm7, XMMWORD PTR [rsp+32]
movaps xmm8, XMMWORD PTR [rsp+48]
movaps xmm9, XMMWORD PTR [rsp+64]
movaps xmm10, XMMWORD PTR [rsp+80]
movaps xmm11, XMMWORD PTR [rsp+96]
movaps xmm12, XMMWORD PTR [rsp+112]
movaps xmm13, XMMWORD PTR [rsp+128]
add rsp, 232
pop r15
pop r14
pop r13
pop r12
pop rdi
pop rsi
pop rbp
pop rbx
ret
FN_PREFIX(CryptonightR_soft_aes_template_end):

1593
crypto/asm/CryptonightR_template.S

File diff suppressed because it is too large

1060
crypto/asm/CryptonightR_template.h

File diff suppressed because it is too large

529
crypto/asm/CryptonightR_template.inc

@ -0,0 +1,529 @@
PUBLIC FN_PREFIX(CryptonightR_template_part1)
PUBLIC FN_PREFIX(CryptonightR_template_mainloop)
PUBLIC FN_PREFIX(CryptonightR_template_part2)
PUBLIC FN_PREFIX(CryptonightR_template_part3)
PUBLIC FN_PREFIX(CryptonightR_template_end)
PUBLIC FN_PREFIX(CryptonightR_template_double_part1)
PUBLIC FN_PREFIX(CryptonightR_template_double_mainloop)
PUBLIC FN_PREFIX(CryptonightR_template_double_part2)
PUBLIC FN_PREFIX(CryptonightR_template_double_part3)
PUBLIC FN_PREFIX(CryptonightR_template_double_part4)
PUBLIC FN_PREFIX(CryptonightR_template_double_end)
ALIGN(64)
FN_PREFIX(CryptonightR_template_part1):
mov QWORD PTR [rsp+16], rbx
mov QWORD PTR [rsp+24], rbp
mov QWORD PTR [rsp+32], rsi
push r10
push r11
push r12
push r13
push r14
push r15
push rdi
sub rsp, 64
mov r12, rcx
mov r8, QWORD PTR [r12+32]
mov rdx, r12
xor r8, QWORD PTR [r12]
mov r15, QWORD PTR [r12+40]
mov r9, r8
xor r15, QWORD PTR [r12+8]
mov r11, QWORD PTR [r12+224]
mov r12, QWORD PTR [r12+56]
xor r12, QWORD PTR [rdx+24]
mov rax, QWORD PTR [rdx+48]
xor rax, QWORD PTR [rdx+16]
movaps XMMWORD PTR [rsp+48], xmm6
movq xmm0, r12
movaps XMMWORD PTR [rsp+32], xmm7
movaps XMMWORD PTR [rsp+16], xmm8
movaps XMMWORD PTR [rsp], xmm9
mov r12, QWORD PTR [rdx+88]
xor r12, QWORD PTR [rdx+72]
movq xmm6, rax
mov rax, QWORD PTR [rdx+80]
xor rax, QWORD PTR [rdx+64]
punpcklqdq xmm6, xmm0
and r9d, 2097136
movq xmm0, r12
movq xmm7, rax
punpcklqdq xmm7, xmm0
mov r10d, r9d
movq xmm9, rsp
mov rsp, r8
mov r8d, 524288
mov ebx, [rdx+96]
mov esi, [rdx+100]
mov edi, [rdx+104]
mov ebp, [rdx+108]
ALIGN(64)
FN_PREFIX(CryptonightR_template_mainloop):
movdqa xmm5, XMMWORD PTR [r9+r11]
movq xmm0, r15
movq xmm4, rsp
punpcklqdq xmm4, xmm0
lea rdx, QWORD PTR [r9+r11]
aesenc xmm5, xmm4
mov r12d, r9d
mov eax, r9d
xor r9d, 48
xor r12d, 16
xor eax, 32
movdqu xmm0, XMMWORD PTR [r9+r11]
movaps xmm3, xmm0
movdqu xmm2, XMMWORD PTR [r12+r11]
movdqu xmm1, XMMWORD PTR [rax+r11]
pxor xmm0, xmm2
pxor xmm5, xmm1
pxor xmm5, xmm0
paddq xmm3, xmm7
paddq xmm2, xmm6
paddq xmm1, xmm4
movdqu XMMWORD PTR [r12+r11], xmm3
movdqu XMMWORD PTR [rax+r11], xmm2
movdqu XMMWORD PTR [r9+r11], xmm1
movq r12, xmm5
movd r10d, xmm5
and r10d, 2097136
movdqa xmm0, xmm5
pxor xmm0, xmm6
movdqu XMMWORD PTR [rdx], xmm0
lea r13d, [ebx+esi]
lea edx, [edi+ebp]
shl rdx, 32
or r13, rdx
xor r13, QWORD PTR [r10+r11]
mov r14, QWORD PTR [r10+r11+8]
movd eax, xmm6
movd edx, xmm7
pextrd r9d, xmm7, 2
FN_PREFIX(CryptonightR_template_part2):
mov eax, edi
mov edx, ebp
shl rdx, 32
or rax, rdx
xor rsp, rax
mov eax, ebx
mov edx, esi
shl rdx, 32
or rax, rdx
xor r15, rax
mov rax, r13
mul r12
mov r9d, r10d
mov r12d, r10d
xor r9d, 16
xor r12d, 32
xor r10d, 48
movdqa xmm1, XMMWORD PTR [r12+r11]
movaps xmm3, xmm1
movdqa xmm2, XMMWORD PTR [r9+r11]
movdqa xmm0, XMMWORD PTR [r10+r11]
pxor xmm1, xmm2
pxor xmm5, xmm0
pxor xmm5, xmm1
paddq xmm3, xmm4
paddq xmm2, xmm6
paddq xmm0, xmm7
movdqu XMMWORD PTR [r9+r11], xmm0
movdqu XMMWORD PTR [r12+r11], xmm2
movdqu XMMWORD PTR [r10+r11], xmm3
movdqa xmm7, xmm6
add r15, rax
add rsp, rdx
xor r10, 48
mov QWORD PTR [r10+r11], rsp
xor rsp, r13
mov r9d, esp
mov QWORD PTR [r10+r11+8], r15
and r9d, 2097136
xor r15, r14
movdqa xmm6, xmm5
dec r8d
jnz FN_PREFIX(CryptonightR_template_mainloop)
FN_PREFIX(CryptonightR_template_part3):
movq rsp, xmm9
mov rbx, QWORD PTR [rsp+136]
mov rbp, QWORD PTR [rsp+144]
mov rsi, QWORD PTR [rsp+152]
movaps xmm6, XMMWORD PTR [rsp+48]
movaps xmm7, XMMWORD PTR [rsp+32]
movaps xmm8, XMMWORD PTR [rsp+16]
movaps xmm9, XMMWORD PTR [rsp]
add rsp, 64
pop rdi
pop r15
pop r14
pop r13
pop r12
pop r11
pop r10
ret 0
FN_PREFIX(CryptonightR_template_end):
ALIGN(64)
FN_PREFIX(CryptonightR_template_double_part1):
mov QWORD PTR [rsp+24], rbx
push rbp
push rsi
push rdi
push r12
push r13
push r14
push r15
sub rsp, 320
mov r14, QWORD PTR [rcx+32]
mov r8, rcx
xor r14, QWORD PTR [rcx]
mov r12, QWORD PTR [rcx+40]
mov ebx, r14d
mov rsi, QWORD PTR [rcx+224]
and ebx, 2097136
xor r12, QWORD PTR [rcx+8]
mov rcx, QWORD PTR [rcx+56]
xor rcx, QWORD PTR [r8+24]
mov rax, QWORD PTR [r8+48]
xor rax, QWORD PTR [r8+16]
mov r15, QWORD PTR [rdx+32]
xor r15, QWORD PTR [rdx]
movq xmm0, rcx
mov rcx, QWORD PTR [r8+88]
xor rcx, QWORD PTR [r8+72]
mov r13, QWORD PTR [rdx+40]
mov rdi, QWORD PTR [rdx+224]
xor r13, QWORD PTR [rdx+8]
movaps XMMWORD PTR [rsp+160], xmm6
movaps XMMWORD PTR [rsp+176], xmm7
movaps XMMWORD PTR [rsp+192], xmm8
movaps XMMWORD PTR [rsp+208], xmm9
movaps XMMWORD PTR [rsp+224], xmm10
movaps XMMWORD PTR [rsp+240], xmm11
movaps XMMWORD PTR [rsp+256], xmm12
movaps XMMWORD PTR [rsp+272], xmm13
movaps XMMWORD PTR [rsp+288], xmm14
movaps XMMWORD PTR [rsp+304], xmm15
movq xmm7, rax
mov rax, QWORD PTR [r8+80]
xor rax, QWORD PTR [r8+64]
movaps xmm1, XMMWORD PTR [rdx+96]
movaps xmm2, XMMWORD PTR [r8+96]
movaps XMMWORD PTR [rsp], xmm1
movaps XMMWORD PTR [rsp+16], xmm2
mov r8d, r15d
punpcklqdq xmm7, xmm0
movq xmm0, rcx
mov rcx, QWORD PTR [rdx+56]
xor rcx, QWORD PTR [rdx+24]
movq xmm9, rax
mov QWORD PTR [rsp+128], rsi
mov rax, QWORD PTR [rdx+48]
xor rax, QWORD PTR [rdx+16]
punpcklqdq xmm9, xmm0
movq xmm0, rcx
mov rcx, QWORD PTR [rdx+88]
xor rcx, QWORD PTR [rdx+72]
movq xmm8, rax
mov QWORD PTR [rsp+136], rdi
mov rax, QWORD PTR [rdx+80]
xor rax, QWORD PTR [rdx+64]
punpcklqdq xmm8, xmm0
and r8d, 2097136
movq xmm0, rcx
mov r11d, 524288
movq xmm10, rax
punpcklqdq xmm10, xmm0
movq xmm14, QWORD PTR [rsp+128]
movq xmm15, QWORD PTR [rsp+136]
ALIGN(64)
FN_PREFIX(CryptonightR_template_double_mainloop):
movdqu xmm6, XMMWORD PTR [rbx+rsi]
movq xmm0, r12
mov ecx, ebx
movq xmm3, r14
punpcklqdq xmm3, xmm0
xor ebx, 16
aesenc xmm6, xmm3
movq xmm4, r15
movdqu xmm0, XMMWORD PTR [rbx+rsi]
pxor xmm6, xmm0
xor ebx, 48
paddq xmm0, xmm7
movdqu xmm1, XMMWORD PTR [rbx+rsi]
pxor xmm6, xmm1
movdqu XMMWORD PTR [rbx+rsi], xmm0
paddq xmm1, xmm3
xor ebx, 16
mov eax, ebx
xor rax, 32
movdqu xmm0, XMMWORD PTR [rbx+rsi]
pxor xmm6, xmm0
movq rdx, xmm6
movdqu XMMWORD PTR [rbx+rsi], xmm1
paddq xmm0, xmm9
movdqu XMMWORD PTR [rax+rsi], xmm0
movdqa xmm0, xmm6
pxor xmm0, xmm7
movdqu XMMWORD PTR [rcx+rsi], xmm0
mov esi, edx
movdqu xmm5, XMMWORD PTR [r8+rdi]
and esi, 2097136
mov ecx, r8d
movq xmm0, r13
punpcklqdq xmm4, xmm0
xor r8d, 16
aesenc xmm5, xmm4
movdqu xmm0, XMMWORD PTR [r8+rdi]
pxor xmm5, xmm0
xor r8d, 48
paddq xmm0, xmm8
movdqu xmm1, XMMWORD PTR [r8+rdi]
pxor xmm5, xmm1
movdqu XMMWORD PTR [r8+rdi], xmm0
paddq xmm1, xmm4
xor r8d, 16
mov eax, r8d
xor rax, 32
movdqu xmm0, XMMWORD PTR [r8+rdi]
pxor xmm5, xmm0
movdqu XMMWORD PTR [r8+rdi], xmm1
paddq xmm0, xmm10
movdqu XMMWORD PTR [rax+rdi], xmm0
movdqa xmm0, xmm5
pxor xmm0, xmm8
movdqu XMMWORD PTR [rcx+rdi], xmm0
movq rdi, xmm5
movq rcx, xmm14
mov ebp, edi
mov r8, QWORD PTR [rcx+rsi]
mov r10, QWORD PTR [rcx+rsi+8]
lea r9, QWORD PTR [rcx+rsi]
xor esi, 16
movq xmm0, rsp
movq xmm1, rsi
movq xmm2, rdi
movq xmm11, rbp
movq xmm12, r15
movq xmm13, rdx
mov [rsp+104], rcx
mov [rsp+112], r9
mov ebx, DWORD PTR [rsp+16]
mov esi, DWORD PTR [rsp+20]
mov edi, DWORD PTR [rsp+24]
mov ebp, DWORD PTR [rsp+28]
lea eax, [ebx+esi]
lea edx, [edi+ebp]
shl rdx, 32
or rax, rdx
xor r8, rax
movd esp, xmm3
pextrd r15d, xmm3, 2
movd eax, xmm7
movd edx, xmm9
pextrd r9d, xmm9, 2
FN_PREFIX(CryptonightR_template_double_part2):
mov eax, edi
mov edx, ebp
shl rdx, 32
or rax, rdx
xor r14, rax
mov eax, ebx
mov edx, esi
shl rdx, 32
or rax, rdx
xor r12, rax
movq rsp, xmm0
mov DWORD PTR [rsp+16], ebx
mov DWORD PTR [rsp+20], esi
mov DWORD PTR [rsp+24], edi
mov DWORD PTR [rsp+28], ebp
movq rsi, xmm1
movq rdi, xmm2
movq rbp, xmm11
movq r15, xmm12
movq rdx, xmm13
mov rcx, [rsp+104]
mov r9, [rsp+112]
mov rbx, r8
mov rax, r8
mul rdx
and ebp, 2097136
mov r8, rax
movdqu xmm1, XMMWORD PTR [rcx+rsi]
pxor xmm6, xmm1
xor esi, 48
paddq xmm1, xmm7
movdqu xmm2, XMMWORD PTR [rsi+rcx]
pxor xmm6, xmm2
paddq xmm2, xmm3
movdqu XMMWORD PTR [rsi+rcx], xmm1
xor esi, 16
mov eax, esi
mov rsi, rcx
movdqu xmm0, XMMWORD PTR [rax+rcx]
pxor xmm6, xmm0
movdqu XMMWORD PTR [rax+rcx], xmm2
paddq xmm0, xmm9
add r12, r8
xor rax, 32
add r14, rdx
movdqa xmm9, xmm7
movdqa xmm7, xmm6
movdqu XMMWORD PTR [rax+rcx], xmm0
mov QWORD PTR [r9+8], r12
xor r12, r10
mov QWORD PTR [r9], r14
movq rcx, xmm15
xor r14, rbx
mov r10d, ebp
mov ebx, r14d
xor ebp, 16
and ebx, 2097136
mov r8, QWORD PTR [r10+rcx]
mov r9, QWORD PTR [r10+rcx+8]
movq xmm0, rsp
movq xmm1, rbx
movq xmm2, rsi
movq xmm11, rdi
movq xmm12, rbp
movq xmm13, r15
mov [rsp+104], rcx
mov [rsp+112], r9
mov ebx, DWORD PTR [rsp]
mov esi, DWORD PTR [rsp+4]
mov edi, DWORD PTR [rsp+8]
mov ebp, DWORD PTR [rsp+12]
lea eax, [ebx+esi]
lea edx, [edi+ebp]
shl rdx, 32
or rax, rdx
xor r8, rax
movq xmm3, r8
movd esp, xmm4
pextrd r15d, xmm4, 2
movd eax, xmm8
movd edx, xmm10
pextrd r9d, xmm10, 2
FN_PREFIX(CryptonightR_template_double_part3):
movq r15, xmm13
mov eax, edi
mov edx, ebp
shl rdx, 32
or rax, rdx
xor r15, rax
mov eax, ebx
mov edx, esi
shl rdx, 32
or rax, rdx
xor r13, rax
movq rsp, xmm0
mov DWORD PTR [rsp], ebx
mov DWORD PTR [rsp+4], esi
mov DWORD PTR [rsp+8], edi
mov DWORD PTR [rsp+12], ebp
movq rbx, xmm1
movq rsi, xmm2
movq rdi, xmm11
movq rbp, xmm12
mov rcx, [rsp+104]
mov r9, [rsp+112]
mov rax, r8
mul rdi
mov rdi, rcx
mov r8, rax
movdqu xmm1, XMMWORD PTR [rbp+rcx]
pxor xmm5, xmm1
xor ebp, 48
paddq xmm1, xmm8
add r13, r8
movdqu xmm2, XMMWORD PTR [rbp+rcx]
pxor xmm5, xmm2
add r15, rdx
movdqu XMMWORD PTR [rbp+rcx], xmm1
paddq xmm2, xmm4
xor ebp, 16
mov eax, ebp
xor rax, 32
movdqu xmm0, XMMWORD PTR [rbp+rcx]
pxor xmm5, xmm0
movdqu XMMWORD PTR [rbp+rcx], xmm2
paddq xmm0, xmm10
movdqu XMMWORD PTR [rax+rcx], xmm0
movq rax, xmm3
movdqa xmm10, xmm8
mov QWORD PTR [r10+rcx], r15
movdqa xmm8, xmm5
xor r15, rax
mov QWORD PTR [r10+rcx+8], r13
mov r8d, r15d
xor r13, r9
and r8d, 2097136
dec r11d
jnz FN_PREFIX(CryptonightR_template_double_mainloop)
FN_PREFIX(CryptonightR_template_double_part4):
mov rbx, QWORD PTR [rsp+400]
movaps xmm6, XMMWORD PTR [rsp+160]
movaps xmm7, XMMWORD PTR [rsp+176]
movaps xmm8, XMMWORD PTR [rsp+192]
movaps xmm9, XMMWORD PTR [rsp+208]
movaps xmm10, XMMWORD PTR [rsp+224]
movaps xmm11, XMMWORD PTR [rsp+240]
movaps xmm12, XMMWORD PTR [rsp+256]
movaps xmm13, XMMWORD PTR [rsp+272]
movaps xmm14, XMMWORD PTR [rsp+288]
movaps xmm15, XMMWORD PTR [rsp+304]
add rsp, 320
pop r15
pop r14
pop r13
pop r12
pop rdi
pop rsi
pop rbp
ret 0
FN_PREFIX(CryptonightR_template_double_end):

36
crypto/asm/cn_main_loop.asm

@ -1,36 +0,0 @@
_TEXT_CNV2_MAINLOOP SEGMENT PAGE READ EXECUTE
PUBLIC cnv2_mainloop_ivybridge_asm
PUBLIC cnv2_mainloop_ryzen_asm
PUBLIC cnv2_mainloop_bulldozer_asm
PUBLIC cnv2_double_mainloop_sandybridge_asm
ALIGN(64)
cnv2_mainloop_ivybridge_asm PROC
INCLUDE cn2/cnv2_main_loop_ivybridge.inc
ret 0
mov eax, 3735929054
cnv2_mainloop_ivybridge_asm ENDP
ALIGN(64)
cnv2_mainloop_ryzen_asm PROC
INCLUDE cn2/cnv2_main_loop_ryzen.inc
ret 0
mov eax, 3735929054
cnv2_mainloop_ryzen_asm ENDP
ALIGN(64)
cnv2_mainloop_bulldozer_asm PROC
INCLUDE cn2/cnv2_main_loop_bulldozer.inc
ret 0
mov eax, 3735929054
cnv2_mainloop_bulldozer_asm ENDP
ALIGN(64)
cnv2_double_mainloop_sandybridge_asm PROC
INCLUDE cn2/cnv2_double_main_loop_sandybridge.inc
ret 0
mov eax, 3735929054
cnv2_double_mainloop_sandybridge_asm ENDP
_TEXT_CNV2_MAINLOOP ENDS
END

19
memory.c

@ -4,7 +4,9 @@
* Copyright 2014 Lucas Jones <https://github.com/lucasjones>
* Copyright 2014-2016 Wolf9466 <https://github.com/OhGodAPet>
* Copyright 2016 Jay D Dee <jayddee246@gmail.com>
* Copyright 2016-2018 XMRig <support@xmrig.com>
* Copyright 2017-2018 XMR-Stak <https://github.com/fireice-uk>, <https://github.com/psychocrypt>
* Copyright 2018-2019 SChernykh <https://github.com/SChernykh>
* Copyright 2016-2019 XMRig <https://github.com/xmrig>, <support@xmrig.com>
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
@ -41,14 +43,29 @@ void * persistent_calloc(size_t num, size_t size) {
}
void init_cn_r(struct cryptonight_ctx *ctx)
{
uint8_t *p = allocate_executable_memory(0x4000);
ctx->generated_code = (cn_mainloop_fun_ms_abi) p;
ctx->generated_code_double = (cn_mainloop_double_fun_ms_abi)(p + 0x2000);
ctx->generated_code_height = ctx->generated_code_double_height = (uint64_t)(-1);
ctx->height = 0;
}
void create_cryptonight_ctx(struct cryptonight_ctx **ctx, int thr_id)
{
const int ratio = (opt_double_hash && opt_algo == ALGO_CRYPTONIGHT) ? 2 : 1;
ctx[0] = persistent_calloc(1, sizeof(struct cryptonight_ctx));
ctx[0]->memory = &persistent_memory[MEMORY * (thr_id * ratio + 1)];
init_cn_r(ctx[0]);
if (opt_double_hash) {
ctx[1] = persistent_calloc(1, sizeof(struct cryptonight_ctx));
ctx[1]->memory = ctx[0]->memory + (opt_algo == ALGO_CRYPTONIGHT ? MEMORY : MEMORY_LITE);
init_cn_r(ctx[1]);
}
}

14
persistent_memory.h

@ -4,8 +4,9 @@
* Copyright 2014 Lucas Jones <https://github.com/lucasjones>
* Copyright 2014-2016 Wolf9466 <https://github.com/OhGodAPet>
* Copyright 2016 Jay D Dee <jayddee246@gmail.com>
* Copyright 2016-2017 XMRig <support@xmrig.com>
*
* Copyright 2017-2018 XMR-Stak <https://github.com/fireice-uk>, <https://github.com/psychocrypt>
* Copyright 2018-2019 SChernykh <https://github.com/SChernykh>
* Copyright 2016-2019 XMRig <https://github.com/xmrig>, <support@xmrig.com>
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
@ -47,8 +48,15 @@ extern int persistent_memory_flags;
const char * persistent_memory_allocate();
void persistent_memory_free();
void * persistent_calloc(size_t num, size_t size);
void *persistent_calloc(size_t num, size_t size);
void create_cryptonight_ctx(struct cryptonight_ctx **ctx, int thr_id);
void *allocate_executable_memory(size_t size);
void flush_instruction_cache(void *p, size_t size);
void init_cn_r(struct cryptonight_ctx *ctx);
void protect_executable_memory(void *p, size_t size);
#endif /* XMRIG_PERSISTENT_MEMORY_H */

24
unix/memory_unix.c

@ -74,3 +74,27 @@ void persistent_memory_free() {
_mm_free(persistent_memory);
}
}
void *allocate_executable_memory(size_t size)
{
# if defined(__APPLE__)
return mmap(0, size, PROT_READ | PROT_WRITE | PROT_EXEC, MAP_PRIVATE | MAP_ANON, -1, 0);
# else
return mmap(0, size, PROT_READ | PROT_WRITE | PROT_EXEC, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
# endif
}
void protect_executable_memory(void *p, size_t size)
{
mprotect(p, size, PROT_READ | PROT_EXEC);
}
void flush_instruction_cache(void *p, size_t size)
{
# ifndef __FreeBSD__
__builtin___clear_cache((char*) p, (char*)(p) + size);
# endif
}

27
win/memory_win.c

@ -4,8 +4,9 @@
* Copyright 2014 Lucas Jones <https://github.com/lucasjones>
* Copyright 2014-2016 Wolf9466 <https://github.com/OhGodAPet>
* Copyright 2016 Jay D Dee <jayddee246@gmail.com>
* Copyright 2016-2017 XMRig <support@xmrig.com>
*
* Copyright 2017-2018 XMR-Stak <https://github.com/fireice-uk>, <https://github.com/psychocrypt>
* Copyright 2018-2019 SChernykh <https://github.com/SChernykh>
* Copyright 2016-2019 XMRig <https://github.com/xmrig>, <support@xmrig.com>
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
@ -21,9 +22,6 @@
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef __MEMORY_H__
#define __MEMORY_H__
#include <windows.h>
#include <ntsecapi.h>
#include <tchar.h>
@ -172,4 +170,21 @@ void persistent_memory_free() {
}
}
#endif /* __MEMORY_H__ */
void *allocate_executable_memory(size_t size)
{
return VirtualAlloc(0, size, MEM_COMMIT | MEM_RESERVE, PAGE_EXECUTE_READWRITE);
}
void protect_executable_memory(void *p, size_t size)
{
DWORD oldProtect;
VirtualProtect(p, size, PAGE_EXECUTE_READ, &oldProtect);
}
void flush_instruction_cache(void *p, size_t size)
{
FlushInstructionCache(GetCurrentProcess(), p, size);
}

Loading…
Cancel
Save