![xmrig@bunkr.org](/assets/img/avatar_default.png)
44 changed files with 2993 additions and 787 deletions
@ -0,0 +1,134 @@ |
|||
/* XMRig
|
|||
* Copyright 2010 Jeff Garzik <jgarzik@pobox.com> |
|||
* Copyright 2012-2014 pooler <pooler@litecoinpool.org> |
|||
* Copyright 2014 Lucas Jones <https://github.com/lucasjones>
|
|||
* Copyright 2014-2016 Wolf9466 <https://github.com/OhGodAPet>
|
|||
* Copyright 2016 Jay D Dee <jayddee246@gmail.com> |
|||
* Copyright 2017 fireice-uk <https://github.com/fireice-uk>
|
|||
* Copyright 2017-2018 XMR-Stak <https://github.com/fireice-uk>, <https://github.com/psychocrypt>
|
|||
* Copyright 2018 Lee Clagett <https://github.com/vtnerd>
|
|||
* Copyright 2016-2018 XMRig <https://github.com/xmrig>, <support@xmrig.com>
|
|||
* |
|||
* This program is free software: you can redistribute it and/or modify |
|||
* it under the terms of the GNU General Public License as published by |
|||
* the Free Software Foundation, either version 3 of the License, or |
|||
* (at your option) any later version. |
|||
* |
|||
* This program is distributed in the hope that it will be useful, |
|||
* but WITHOUT ANY WARRANTY; without even the implied warranty of |
|||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
|||
* GNU General Public License for more details. |
|||
* |
|||
* You should have received a copy of the GNU General Public License |
|||
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|||
*/ |
|||
|
|||
#include <x86intrin.h> |
|||
#include <string.h> |
|||
|
|||
#include "algo/cryptonight/cryptonight.h" |
|||
#include "algo/cryptonight/cryptonight_monero.h" |
|||
#include "cryptonight_lite_softaes.h" |
|||
#include "crypto/c_keccak.h" |
|||
|
|||
|
|||
void cryptonight_lite_av3_v0(const uint8_t *restrict input, size_t size, uint8_t *restrict output, struct cryptonight_ctx **restrict ctx) |
|||
{ |
|||
keccak(input, size, ctx[0]->state, 200); |
|||
|
|||
cn_explode_scratchpad((__m128i*) ctx[0]->state, (__m128i*) ctx[0]->memory); |
|||
|
|||
const uint8_t* l0 = ctx[0]->memory; |
|||
uint64_t* h0 = (uint64_t*) ctx[0]->state; |
|||
|
|||
uint64_t al0 = h0[0] ^ h0[4]; |
|||
uint64_t ah0 = h0[1] ^ h0[5]; |
|||
__m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]); |
|||
|
|||
uint64_t idx0 = h0[0] ^ h0[4]; |
|||
|
|||
for (size_t i = 0; __builtin_expect(i < 0x40000, 1); i++) { |
|||
__m128i cx; |
|||
cx = _mm_load_si128((__m128i *) &l0[idx0 & 0xFFFF0]); |
|||
cx = soft_aesenc(cx, _mm_set_epi64x(ah0, al0)); |
|||
|
|||
_mm_store_si128((__m128i *) &l0[idx0 & 0xFFFF0], _mm_xor_si128(bx0, cx)); |
|||
idx0 = EXTRACT64(cx); |
|||
bx0 = cx; |
|||
|
|||
uint64_t hi, lo, cl, ch; |
|||
cl = ((uint64_t*) &l0[idx0 & 0xFFFF0])[0]; |
|||
ch = ((uint64_t*) &l0[idx0 & 0xFFFF0])[1]; |
|||
lo = _umul128(idx0, cl, &hi); |
|||
|
|||
al0 += hi; |
|||
ah0 += lo; |
|||
|
|||
((uint64_t*)&l0[idx0 & 0xFFFF0])[0] = al0; |
|||
((uint64_t*)&l0[idx0 & 0xFFFF0])[1] = ah0; |
|||
|
|||
ah0 ^= ch; |
|||
al0 ^= cl; |
|||
idx0 = al0; |
|||
} |
|||
|
|||
cn_implode_scratchpad((__m128i*) ctx[0]->memory, (__m128i*) ctx[0]->state); |
|||
|
|||
keccakf(h0, 24); |
|||
extra_hashes[ctx[0]->state[0] & 3](ctx[0]->state, 200, output); |
|||
} |
|||
|
|||
|
|||
void cryptonight_lite_av3_v1(const uint8_t *restrict input, size_t size, uint8_t *restrict output, struct cryptonight_ctx **restrict ctx) |
|||
{ |
|||
if (size < 43) { |
|||
memset(output, 0, 32); |
|||
return; |
|||
} |
|||
|
|||
keccak(input, size, ctx[0]->state, 200); |
|||
|
|||
VARIANT1_INIT(0); |
|||
|
|||
cn_explode_scratchpad((__m128i*) ctx[0]->state, (__m128i*) ctx[0]->memory); |
|||
|
|||
const uint8_t* l0 = ctx[0]->memory; |
|||
uint64_t* h0 = (uint64_t*) ctx[0]->state; |
|||
|
|||
uint64_t al0 = h0[0] ^ h0[4]; |
|||
uint64_t ah0 = h0[1] ^ h0[5]; |
|||
__m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]); |
|||
|
|||
uint64_t idx0 = h0[0] ^ h0[4]; |
|||
|
|||
for (size_t i = 0; __builtin_expect(i < 0x40000, 1); i++) { |
|||
__m128i cx; |
|||
cx = _mm_load_si128((__m128i *) &l0[idx0 & 0xFFFF0]); |
|||
cx = soft_aesenc(cx, _mm_set_epi64x(ah0, al0)); |
|||
|
|||
cryptonight_monero_tweak((uint64_t*)&l0[idx0 & 0xFFFF0], _mm_xor_si128(bx0, cx)); |
|||
|
|||
idx0 = EXTRACT64(cx); |
|||
bx0 = cx; |
|||
|
|||
uint64_t hi, lo, cl, ch; |
|||
cl = ((uint64_t*) &l0[idx0 & 0xFFFF0])[0]; |
|||
ch = ((uint64_t*) &l0[idx0 & 0xFFFF0])[1]; |
|||
lo = _umul128(idx0, cl, &hi); |
|||
|
|||
al0 += hi; |
|||
ah0 += lo; |
|||
|
|||
((uint64_t*)&l0[idx0 & 0xFFFF0])[0] = al0; |
|||
((uint64_t*)&l0[idx0 & 0xFFFF0])[1] = ah0 ^ tweak1_2_0; |
|||
|
|||
ah0 ^= ch; |
|||
al0 ^= cl; |
|||
idx0 = al0; |
|||
} |
|||
|
|||
cn_implode_scratchpad((__m128i*) ctx[0]->memory, (__m128i*) ctx[0]->state); |
|||
|
|||
keccakf(h0, 24); |
|||
extra_hashes[ctx[0]->state[0] & 3](ctx[0]->state, 200, output); |
|||
} |
@ -1,78 +0,0 @@ |
|||
/* XMRig
|
|||
* Copyright 2010 Jeff Garzik <jgarzik@pobox.com> |
|||
* Copyright 2012-2014 pooler <pooler@litecoinpool.org> |
|||
* Copyright 2014 Lucas Jones <https://github.com/lucasjones>
|
|||
* Copyright 2014-2016 Wolf9466 <https://github.com/OhGodAPet>
|
|||
* Copyright 2016 Jay D Dee <jayddee246@gmail.com> |
|||
* Copyright 2017 fireice-uk <https://github.com/fireice-uk>
|
|||
* Copyright 2017-2018 XMR-Stak <https://github.com/fireice-uk>, <https://github.com/psychocrypt>
|
|||
* Copyright 2018 Lee Clagett <https://github.com/vtnerd>
|
|||
* Copyright 2016-2018 XMRig <https://github.com/xmrig>, <support@xmrig.com>
|
|||
* |
|||
* This program is free software: you can redistribute it and/or modify |
|||
* it under the terms of the GNU General Public License as published by |
|||
* the Free Software Foundation, either version 3 of the License, or |
|||
* (at your option) any later version. |
|||
* |
|||
* This program is distributed in the hope that it will be useful, |
|||
* but WITHOUT ANY WARRANTY; without even the implied warranty of |
|||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
|||
* GNU General Public License for more details. |
|||
* |
|||
* You should have received a copy of the GNU General Public License |
|||
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|||
*/ |
|||
|
|||
#include <x86intrin.h> |
|||
#include <string.h> |
|||
|
|||
#include "algo/cryptonight/cryptonight.h" |
|||
#include "cryptonight_lite_softaes.h" |
|||
#include "crypto/c_keccak.h" |
|||
|
|||
|
|||
void cryptonight_lite_av3_softaes(const void *restrict input, size_t size, void *restrict output, struct cryptonight_ctx *restrict ctx, uint8_t version) |
|||
{ |
|||
keccak((const uint8_t *) input, size, ctx->state0, 200); |
|||
|
|||
cn_explode_scratchpad((__m128i*) ctx->state0, (__m128i*) ctx->memory); |
|||
|
|||
const uint8_t* l0 = ctx->memory; |
|||
uint64_t* h0 = (uint64_t*) ctx->state0; |
|||
|
|||
uint64_t al0 = h0[0] ^ h0[4]; |
|||
uint64_t ah0 = h0[1] ^ h0[5]; |
|||
__m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]); |
|||
|
|||
uint64_t idx0 = h0[0] ^ h0[4]; |
|||
|
|||
for (size_t i = 0; __builtin_expect(i < 0x40000, 1); i++) { |
|||
__m128i cx; |
|||
cx = _mm_load_si128((__m128i *)&l0[idx0 & 0xFFFF0]); |
|||
cx = soft_aesenc(cx, _mm_set_epi64x(ah0, al0)); |
|||
|
|||
_mm_store_si128((__m128i *)&l0[idx0 & 0xFFFF0], _mm_xor_si128(bx0, cx)); |
|||
idx0 = EXTRACT64(cx); |
|||
bx0 = cx; |
|||
|
|||
uint64_t hi, lo, cl, ch; |
|||
cl = ((uint64_t*)&l0[idx0 & 0xFFFF0])[0]; |
|||
ch = ((uint64_t*)&l0[idx0 & 0xFFFF0])[1]; |
|||
lo = _umul128(idx0, cl, &hi); |
|||
|
|||
al0 += hi; |
|||
ah0 += lo; |
|||
|
|||
((uint64_t*)&l0[idx0 & 0xFFFF0])[0] = al0; |
|||
((uint64_t*)&l0[idx0 & 0xFFFF0])[1] = ah0; |
|||
|
|||
ah0 ^= ch; |
|||
al0 ^= cl; |
|||
idx0 = al0; |
|||
} |
|||
|
|||
cn_implode_scratchpad((__m128i*) ctx->memory, (__m128i*) ctx->state0); |
|||
|
|||
keccakf(h0, 24); |
|||
extra_hashes[ctx->state0[0] & 3](ctx->state0, 200, output); |
|||
} |
@ -0,0 +1,247 @@ |
|||
/* XMRig
|
|||
* Copyright 2010 Jeff Garzik <jgarzik@pobox.com> |
|||
* Copyright 2012-2014 pooler <pooler@litecoinpool.org> |
|||
* Copyright 2014 Lucas Jones <https://github.com/lucasjones>
|
|||
* Copyright 2014-2016 Wolf9466 <https://github.com/OhGodAPet>
|
|||
* Copyright 2016 Jay D Dee <jayddee246@gmail.com> |
|||
* Copyright 2017 fireice-uk <https://github.com/fireice-uk>
|
|||
* Copyright 2017-2018 XMR-Stak <https://github.com/fireice-uk>, <https://github.com/psychocrypt>
|
|||
* Copyright 2018 Lee Clagett <https://github.com/vtnerd>
|
|||
* Copyright 2016-2018 XMRig <https://github.com/xmrig>, <support@xmrig.com>
|
|||
* |
|||
* This program is free software: you can redistribute it and/or modify |
|||
* it under the terms of the GNU General Public License as published by |
|||
* the Free Software Foundation, either version 3 of the License, or |
|||
* (at your option) any later version. |
|||
* |
|||
* This program is distributed in the hope that it will be useful, |
|||
* but WITHOUT ANY WARRANTY; without even the implied warranty of |
|||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
|||
* GNU General Public License for more details. |
|||
* |
|||
* You should have received a copy of the GNU General Public License |
|||
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|||
*/ |
|||
|
|||
#include <x86intrin.h> |
|||
#include <string.h> |
|||
|
|||
#include "crypto/c_keccak.h" |
|||
#include "cryptonight.h" |
|||
#include "cryptonight_aesni.h" |
|||
#include "cryptonight_monero.h" |
|||
|
|||
|
|||
void cryptonight_av1_v0(const uint8_t *restrict input, size_t size, uint8_t *restrict output, struct cryptonight_ctx **restrict ctx) |
|||
{ |
|||
keccak(input, size, ctx[0]->state, 200); |
|||
|
|||
cn_explode_scratchpad((__m128i*) ctx[0]->state, (__m128i*) ctx[0]->memory); |
|||
|
|||
const uint8_t* l0 = ctx[0]->memory; |
|||
uint64_t* h0 = (uint64_t*) ctx[0]->state; |
|||
|
|||
uint64_t al0 = h0[0] ^ h0[4]; |
|||
uint64_t ah0 = h0[1] ^ h0[5]; |
|||
__m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]); |
|||
|
|||
uint64_t idx0 = h0[0] ^ h0[4]; |
|||
|
|||
for (size_t i = 0; __builtin_expect(i < 0x80000, 1); i++) { |
|||
__m128i cx; |
|||
cx = _mm_load_si128((__m128i *) &l0[idx0 & 0x1FFFF0]); |
|||
cx = _mm_aesenc_si128(cx, _mm_set_epi64x(ah0, al0)); |
|||
|
|||
_mm_store_si128((__m128i *) &l0[idx0 & 0x1FFFF0], _mm_xor_si128(bx0, cx)); |
|||
idx0 = EXTRACT64(cx); |
|||
bx0 = cx; |
|||
|
|||
uint64_t hi, lo, cl, ch; |
|||
cl = ((uint64_t*) &l0[idx0 & 0x1FFFF0])[0]; |
|||
ch = ((uint64_t*) &l0[idx0 & 0x1FFFF0])[1]; |
|||
lo = _umul128(idx0, cl, &hi); |
|||
|
|||
al0 += hi; |
|||
ah0 += lo; |
|||
|
|||
((uint64_t*)&l0[idx0 & 0x1FFFF0])[0] = al0; |
|||
((uint64_t*)&l0[idx0 & 0x1FFFF0])[1] = ah0; |
|||
|
|||
ah0 ^= ch; |
|||
al0 ^= cl; |
|||
idx0 = al0; |
|||
} |
|||
|
|||
cn_implode_scratchpad((__m128i*) ctx[0]->memory, (__m128i*) ctx[0]->state); |
|||
|
|||
keccakf(h0, 24); |
|||
extra_hashes[ctx[0]->state[0] & 3](ctx[0]->state, 200, output); |
|||
} |
|||
|
|||
|
|||
void cryptonight_av1_v1(const uint8_t *restrict input, size_t size, uint8_t *restrict output, struct cryptonight_ctx **restrict ctx) |
|||
{ |
|||
if (size < 43) { |
|||
memset(output, 0, 32); |
|||
return; |
|||
} |
|||
|
|||
keccak(input, size, ctx[0]->state, 200); |
|||
|
|||
VARIANT1_INIT(0); |
|||
|
|||
cn_explode_scratchpad((__m128i*) ctx[0]->state, (__m128i*) ctx[0]->memory); |
|||
|
|||
const uint8_t* l0 = ctx[0]->memory; |
|||
uint64_t* h0 = (uint64_t*) ctx[0]->state; |
|||
|
|||
uint64_t al0 = h0[0] ^ h0[4]; |
|||
uint64_t ah0 = h0[1] ^ h0[5]; |
|||
__m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]); |
|||
|
|||
uint64_t idx0 = h0[0] ^ h0[4]; |
|||
|
|||
for (size_t i = 0; __builtin_expect(i < 0x80000, 1); i++) { |
|||
__m128i cx; |
|||
cx = _mm_load_si128((__m128i *) &l0[idx0 & 0x1FFFF0]); |
|||
cx = _mm_aesenc_si128(cx, _mm_set_epi64x(ah0, al0)); |
|||
|
|||
cryptonight_monero_tweak((uint64_t*)&l0[idx0 & 0x1FFFF0], _mm_xor_si128(bx0, cx)); |
|||
|
|||
idx0 = EXTRACT64(cx); |
|||
bx0 = cx; |
|||
|
|||
uint64_t hi, lo, cl, ch; |
|||
cl = ((uint64_t*) &l0[idx0 & 0x1FFFF0])[0]; |
|||
ch = ((uint64_t*) &l0[idx0 & 0x1FFFF0])[1]; |
|||
lo = _umul128(idx0, cl, &hi); |
|||
|
|||
al0 += hi; |
|||
ah0 += lo; |
|||
|
|||
((uint64_t*)&l0[idx0 & 0x1FFFF0])[0] = al0; |
|||
((uint64_t*)&l0[idx0 & 0x1FFFF0])[1] = ah0 ^ tweak1_2_0; |
|||
|
|||
ah0 ^= ch; |
|||
al0 ^= cl; |
|||
idx0 = al0; |
|||
} |
|||
|
|||
cn_implode_scratchpad((__m128i*) ctx[0]->memory, (__m128i*) ctx[0]->state); |
|||
|
|||
keccakf(h0, 24); |
|||
extra_hashes[ctx[0]->state[0] & 3](ctx[0]->state, 200, output); |
|||
} |
|||
|
|||
|
|||
void cryptonight_av1_v2(const uint8_t *restrict input, size_t size, uint8_t *restrict output, struct cryptonight_ctx **restrict ctx) |
|||
{ |
|||
keccak(input, size, ctx[0]->state, 200); |
|||
|
|||
cn_explode_scratchpad((__m128i*) ctx[0]->state, (__m128i*) ctx[0]->memory); |
|||
|
|||
const uint8_t* l0 = ctx[0]->memory; |
|||
uint64_t* h0 = (uint64_t*) ctx[0]->state; |
|||
|
|||
VARIANT2_INIT(0); |
|||
VARIANT2_SET_ROUNDING_MODE(); |
|||
|
|||
uint64_t al0 = h0[0] ^ h0[4]; |
|||
uint64_t ah0 = h0[1] ^ h0[5]; |
|||
__m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]); |
|||
__m128i bx1 = _mm_set_epi64x(h0[9] ^ h0[11], h0[8] ^ h0[10]); |
|||
|
|||
uint64_t idx0 = al0; |
|||
|
|||
for (size_t i = 0; __builtin_expect(i < 0x80000, 1); i++) { |
|||
__m128i cx = _mm_load_si128((__m128i *) &l0[idx0 & 0x1FFFF0]); |
|||
const __m128i ax0 = _mm_set_epi64x(ah0, al0); |
|||
|
|||
cx = _mm_aesenc_si128(cx, ax0); |
|||
|
|||
VARIANT2_SHUFFLE(l0, idx0 & 0x1FFFF0, ax0, bx0, bx1); |
|||
_mm_store_si128((__m128i *) &l0[idx0 & 0x1FFFF0], _mm_xor_si128(bx0, cx)); |
|||
|
|||
idx0 = _mm_cvtsi128_si64(cx); |
|||
|
|||
uint64_t hi, lo, cl, ch; |
|||
cl = ((uint64_t*) &l0[idx0 & 0x1FFFF0])[0]; |
|||
ch = ((uint64_t*) &l0[idx0 & 0x1FFFF0])[1]; |
|||
|
|||
VARIANT2_INTEGER_MATH(0, cl, cx); |
|||
lo = _umul128(idx0, cl, &hi); |
|||
VARIANT2_SHUFFLE2(l0, idx0 & 0x1FFFF0, ax0, bx0, bx1, hi, lo); |
|||
|
|||
al0 += hi; |
|||
ah0 += lo; |
|||
|
|||
((uint64_t*)&l0[idx0 & 0x1FFFF0])[0] = al0; |
|||
((uint64_t*)&l0[idx0 & 0x1FFFF0])[1] = ah0; |
|||
|
|||
al0 ^= cl; |
|||
ah0 ^= ch; |
|||
idx0 = al0; |
|||
|
|||
bx1 = bx0; |
|||
bx0 = cx; |
|||
} |
|||
|
|||
cn_implode_scratchpad((__m128i*) ctx[0]->memory, (__m128i*) ctx[0]->state); |
|||
|
|||
keccakf(h0, 24); |
|||
extra_hashes[ctx[0]->state[0] & 3](ctx[0]->state, 200, output); |
|||
} |
|||
|
|||
|
|||
#ifndef XMRIG_NO_ASM |
|||
extern void cnv2_mainloop_ivybridge_asm(struct cryptonight_ctx *ctx); |
|||
extern void cnv2_mainloop_ryzen_asm(struct cryptonight_ctx *ctx); |
|||
extern void cnv2_double_mainloop_sandybridge_asm(struct cryptonight_ctx* ctx0, struct cryptonight_ctx* ctx1); |
|||
|
|||
|
|||
void cryptonight_single_hash_asm_intel(const uint8_t *restrict input, size_t size, uint8_t *restrict output, struct cryptonight_ctx **restrict ctx) |
|||
{ |
|||
keccak(input, size, ctx[0]->state, 200); |
|||
cn_explode_scratchpad((__m128i*) ctx[0]->state, (__m128i*) ctx[0]->memory); |
|||
|
|||
cnv2_mainloop_ivybridge_asm(ctx[0]); |
|||
|
|||
cn_implode_scratchpad((__m128i*) ctx[0]->memory, (__m128i*) ctx[0]->state); |
|||
keccakf((uint64_t*) ctx[0]->state, 24); |
|||
extra_hashes[ctx[0]->state[0] & 3](ctx[0]->state, 200, output); |
|||
} |
|||
|
|||
|
|||
void cryptonight_single_hash_asm_ryzen(const uint8_t *restrict input, size_t size, uint8_t *restrict output, struct cryptonight_ctx **restrict ctx) |
|||
{ |
|||
keccak(input, size, ctx[0]->state, 200); |
|||
cn_explode_scratchpad((__m128i*) ctx[0]->state, (__m128i*) ctx[0]->memory); |
|||
|
|||
cnv2_mainloop_ryzen_asm(ctx[0]); |
|||
|
|||
cn_implode_scratchpad((__m128i*) ctx[0]->memory, (__m128i*) ctx[0]->state); |
|||
keccakf((uint64_t*) ctx[0]->state, 24); |
|||
extra_hashes[ctx[0]->state[0] & 3](ctx[0]->state, 200, output); |
|||
} |
|||
|
|||
|
|||
void cryptonight_double_hash_asm(const uint8_t *restrict input, size_t size, uint8_t *restrict output, struct cryptonight_ctx **restrict ctx) |
|||
{ |
|||
keccak(input, size, ctx[0]->state, 200); |
|||
keccak(input + size, size, ctx[1]->state, 200); |
|||
|
|||
cn_explode_scratchpad((__m128i*) ctx[0]->state, (__m128i*) ctx[0]->memory); |
|||
cn_explode_scratchpad((__m128i*) ctx[1]->state, (__m128i*) ctx[1]->memory); |
|||
|
|||
cnv2_double_mainloop_sandybridge_asm(ctx[0], ctx[1]); |
|||
|
|||
cn_implode_scratchpad((__m128i*) ctx[0]->memory, (__m128i*) ctx[0]->state); |
|||
cn_implode_scratchpad((__m128i*) ctx[1]->memory, (__m128i*) ctx[1]->state); |
|||
|
|||
keccakf((uint64_t*) ctx[0]->state, 24); |
|||
keccakf((uint64_t*) ctx[1]->state, 24); |
|||
|
|||
extra_hashes[ctx[0]->state[0] & 3](ctx[0]->state, 200, output); |
|||
extra_hashes[ctx[1]->state[0] & 3](ctx[1]->state, 200, output + 32); |
|||
} |
|||
#endif |
@ -1,84 +0,0 @@ |
|||
/* XMRig
|
|||
* Copyright 2010 Jeff Garzik <jgarzik@pobox.com> |
|||
* Copyright 2012-2014 pooler <pooler@litecoinpool.org> |
|||
* Copyright 2014 Lucas Jones <https://github.com/lucasjones>
|
|||
* Copyright 2014-2016 Wolf9466 <https://github.com/OhGodAPet>
|
|||
* Copyright 2016 Jay D Dee <jayddee246@gmail.com> |
|||
* Copyright 2017 fireice-uk <https://github.com/fireice-uk>
|
|||
* Copyright 2017-2018 XMR-Stak <https://github.com/fireice-uk>, <https://github.com/psychocrypt>
|
|||
* Copyright 2018 Lee Clagett <https://github.com/vtnerd>
|
|||
* Copyright 2016-2018 XMRig <https://github.com/xmrig>, <support@xmrig.com>
|
|||
* |
|||
* This program is free software: you can redistribute it and/or modify |
|||
* it under the terms of the GNU General Public License as published by |
|||
* the Free Software Foundation, either version 3 of the License, or |
|||
* (at your option) any later version. |
|||
* |
|||
* This program is distributed in the hope that it will be useful, |
|||
* but WITHOUT ANY WARRANTY; without even the implied warranty of |
|||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
|||
* GNU General Public License for more details. |
|||
* |
|||
* You should have received a copy of the GNU General Public License |
|||
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|||
*/ |
|||
|
|||
#include <x86intrin.h> |
|||
#include <string.h> |
|||
|
|||
#include "crypto/c_keccak.h" |
|||
#include "cryptonight.h" |
|||
#include "cryptonight_aesni.h" |
|||
#include "cryptonight_monero.h" |
|||
|
|||
|
|||
void cryptonight_av1_aesni(const void *restrict input, size_t size, void *restrict output, struct cryptonight_ctx *restrict ctx, uint8_t version) |
|||
{ |
|||
keccak((const uint8_t *) input, size, ctx->state0, 200); |
|||
|
|||
VARIANT1_INIT(0); |
|||
|
|||
cn_explode_scratchpad((__m128i*) ctx->state0, (__m128i*) ctx->memory); |
|||
|
|||
const uint8_t* l0 = ctx->memory; |
|||
uint64_t* h0 = (uint64_t*) ctx->state0; |
|||
|
|||
uint64_t al0 = h0[0] ^ h0[4]; |
|||
uint64_t ah0 = h0[1] ^ h0[5]; |
|||
__m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]); |
|||
|
|||
uint64_t idx0 = h0[0] ^ h0[4]; |
|||
|
|||
for (size_t i = 0; __builtin_expect(i < 0x80000, 1); i++) { |
|||
__m128i cx; |
|||
cx = _mm_load_si128((__m128i *) &l0[idx0 & 0x1FFFF0]); |
|||
cx = _mm_aesenc_si128(cx, _mm_set_epi64x(ah0, al0)); |
|||
|
|||
_mm_store_si128((__m128i *) &l0[idx0 & 0x1FFFF0], _mm_xor_si128(bx0, cx)); |
|||
VARIANT1_1(&l0[idx0 & 0x1FFFF0]); |
|||
idx0 = EXTRACT64(cx); |
|||
bx0 = cx; |
|||
|
|||
uint64_t hi, lo, cl, ch; |
|||
cl = ((uint64_t*) &l0[idx0 & 0x1FFFF0])[0]; |
|||
ch = ((uint64_t*) &l0[idx0 & 0x1FFFF0])[1]; |
|||
lo = _umul128(idx0, cl, &hi); |
|||
|
|||
al0 += hi; |
|||
ah0 += lo; |
|||
|
|||
VARIANT1_2(ah0, 0); |
|||
((uint64_t*)&l0[idx0 & 0x1FFFF0])[0] = al0; |
|||
((uint64_t*)&l0[idx0 & 0x1FFFF0])[1] = ah0; |
|||
VARIANT1_2(ah0, 0); |
|||
|
|||
ah0 ^= ch; |
|||
al0 ^= cl; |
|||
idx0 = al0; |
|||
} |
|||
|
|||
cn_implode_scratchpad((__m128i*) ctx->memory, (__m128i*) ctx->state0); |
|||
|
|||
keccakf(h0, 24); |
|||
extra_hashes[ctx->state0[0] & 3](ctx->state0, 200, output); |
|||
} |
@ -0,0 +1,304 @@ |
|||
/* XMRig
|
|||
* Copyright 2010 Jeff Garzik <jgarzik@pobox.com> |
|||
* Copyright 2012-2014 pooler <pooler@litecoinpool.org> |
|||
* Copyright 2014 Lucas Jones <https://github.com/lucasjones>
|
|||
* Copyright 2014-2016 Wolf9466 <https://github.com/OhGodAPet>
|
|||
* Copyright 2016 Jay D Dee <jayddee246@gmail.com> |
|||
* Copyright 2017 fireice-uk <https://github.com/fireice-uk>
|
|||
* Copyright 2017-2018 XMR-Stak <https://github.com/fireice-uk>, <https://github.com/psychocrypt>
|
|||
* Copyright 2018 Lee Clagett <https://github.com/vtnerd>
|
|||
* Copyright 2016-2018 XMRig <https://github.com/xmrig>, <support@xmrig.com>
|
|||
* |
|||
* This program is free software: you can redistribute it and/or modify |
|||
* it under the terms of the GNU General Public License as published by |
|||
* the Free Software Foundation, either version 3 of the License, or |
|||
* (at your option) any later version. |
|||
* |
|||
* This program is distributed in the hope that it will be useful, |
|||
* but WITHOUT ANY WARRANTY; without even the implied warranty of |
|||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
|||
* GNU General Public License for more details. |
|||
* |
|||
* You should have received a copy of the GNU General Public License |
|||
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|||
*/ |
|||
|
|||
#include <x86intrin.h> |
|||
#include <string.h> |
|||
|
|||
#include "crypto/c_keccak.h" |
|||
#include "cryptonight.h" |
|||
#include "cryptonight_aesni.h" |
|||
#include "cryptonight_monero.h" |
|||
|
|||
|
|||
void cryptonight_av2_v0(const uint8_t *restrict input, size_t size, uint8_t *restrict output, struct cryptonight_ctx **restrict ctx) |
|||
{ |
|||
keccak(input, size, ctx[0]->state, 200); |
|||
keccak(input + size, size, ctx[1]->state, 200); |
|||
|
|||
const uint8_t* l0 = ctx[0]->memory; |
|||
const uint8_t* l1 = ctx[1]->memory; |
|||
uint64_t* h0 = (uint64_t*) ctx[0]->state; |
|||
uint64_t* h1 = (uint64_t*) ctx[1]->state; |
|||
|
|||
cn_explode_scratchpad((__m128i*) h0, (__m128i*) l0); |
|||
cn_explode_scratchpad((__m128i*) h1, (__m128i*) l1); |
|||
|
|||
uint64_t al0 = h0[0] ^ h0[4]; |
|||
uint64_t al1 = h1[0] ^ h1[4]; |
|||
uint64_t ah0 = h0[1] ^ h0[5]; |
|||
uint64_t ah1 = h1[1] ^ h1[5]; |
|||
|
|||
__m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]); |
|||
__m128i bx1 = _mm_set_epi64x(h1[3] ^ h1[7], h1[2] ^ h1[6]); |
|||
|
|||
uint64_t idx0 = h0[0] ^ h0[4]; |
|||
uint64_t idx1 = h1[0] ^ h1[4]; |
|||
|
|||
for (size_t i = 0; __builtin_expect(i < 0x80000, 1); i++) { |
|||
__m128i cx0 = _mm_load_si128((__m128i *) &l0[idx0 & 0x1FFFF0]); |
|||
__m128i cx1 = _mm_load_si128((__m128i *) &l1[idx1 & 0x1FFFF0]); |
|||
|
|||
cx0 = _mm_aesenc_si128(cx0, _mm_set_epi64x(ah0, al0)); |
|||
cx1 = _mm_aesenc_si128(cx1, _mm_set_epi64x(ah1, al1)); |
|||
|
|||
_mm_store_si128((__m128i *) &l0[idx0 & 0x1FFFF0], _mm_xor_si128(bx0, cx0)); |
|||
_mm_store_si128((__m128i *) &l1[idx1 & 0x1FFFF0], _mm_xor_si128(bx1, cx1)); |
|||
|
|||
idx0 = EXTRACT64(cx0); |
|||
idx1 = EXTRACT64(cx1); |
|||
|
|||
bx0 = cx0; |
|||
bx1 = cx1; |
|||
|
|||
uint64_t hi, lo, cl, ch; |
|||
cl = ((uint64_t*) &l0[idx0 & 0x1FFFF0])[0]; |
|||
ch = ((uint64_t*) &l0[idx0 & 0x1FFFF0])[1]; |
|||
lo = _umul128(idx0, cl, &hi); |
|||
|
|||
al0 += hi; |
|||
ah0 += lo; |
|||
|
|||
((uint64_t*) &l0[idx0 & 0x1FFFF0])[0] = al0; |
|||
((uint64_t*) &l0[idx0 & 0x1FFFF0])[1] = ah0; |
|||
|
|||
ah0 ^= ch; |
|||
al0 ^= cl; |
|||
idx0 = al0; |
|||
|
|||
cl = ((uint64_t*) &l1[idx1 & 0x1FFFF0])[0]; |
|||
ch = ((uint64_t*) &l1[idx1 & 0x1FFFF0])[1]; |
|||
lo = _umul128(idx1, cl, &hi); |
|||
|
|||
al1 += hi; |
|||
ah1 += lo; |
|||
|
|||
((uint64_t*) &l1[idx1 & 0x1FFFF0])[0] = al1; |
|||
((uint64_t*) &l1[idx1 & 0x1FFFF0])[1] = ah1; |
|||
|
|||
ah1 ^= ch; |
|||
al1 ^= cl; |
|||
idx1 = al1; |
|||
} |
|||
|
|||
cn_implode_scratchpad((__m128i*) l0, (__m128i*) h0); |
|||
cn_implode_scratchpad((__m128i*) l1, (__m128i*) h1); |
|||
|
|||
keccakf(h0, 24); |
|||
keccakf(h1, 24); |
|||
|
|||
extra_hashes[ctx[0]->state[0] & 3](ctx[0]->state, 200, output); |
|||
extra_hashes[ctx[1]->state[0] & 3](ctx[1]->state, 200, output + 32); |
|||
} |
|||
|
|||
|
|||
void cryptonight_av2_v1(const uint8_t *restrict input, size_t size, uint8_t *restrict output, struct cryptonight_ctx **restrict ctx) |
|||
{ |
|||
if (size < 43) { |
|||
memset(output, 0, 64); |
|||
return; |
|||
} |
|||
|
|||
keccak(input, size, ctx[0]->state, 200); |
|||
keccak(input + size, size, ctx[1]->state, 200); |
|||
|
|||
VARIANT1_INIT(0); |
|||
VARIANT1_INIT(1); |
|||
|
|||
const uint8_t* l0 = ctx[0]->memory; |
|||
const uint8_t* l1 = ctx[1]->memory; |
|||
uint64_t* h0 = (uint64_t*) ctx[0]->state; |
|||
uint64_t* h1 = (uint64_t*) ctx[1]->state; |
|||
|
|||
cn_explode_scratchpad((__m128i*) h0, (__m128i*) l0); |
|||
cn_explode_scratchpad((__m128i*) h1, (__m128i*) l1); |
|||
|
|||
uint64_t al0 = h0[0] ^ h0[4]; |
|||
uint64_t al1 = h1[0] ^ h1[4]; |
|||
uint64_t ah0 = h0[1] ^ h0[5]; |
|||
uint64_t ah1 = h1[1] ^ h1[5]; |
|||
|
|||
__m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]); |
|||
__m128i bx1 = _mm_set_epi64x(h1[3] ^ h1[7], h1[2] ^ h1[6]); |
|||
|
|||
uint64_t idx0 = h0[0] ^ h0[4]; |
|||
uint64_t idx1 = h1[0] ^ h1[4]; |
|||
|
|||
for (size_t i = 0; __builtin_expect(i < 0x80000, 1); i++) { |
|||
__m128i cx0 = _mm_load_si128((__m128i *) &l0[idx0 & 0x1FFFF0]); |
|||
__m128i cx1 = _mm_load_si128((__m128i *) &l1[idx1 & 0x1FFFF0]); |
|||
|
|||
cx0 = _mm_aesenc_si128(cx0, _mm_set_epi64x(ah0, al0)); |
|||
cx1 = _mm_aesenc_si128(cx1, _mm_set_epi64x(ah1, al1)); |
|||
|
|||
cryptonight_monero_tweak((uint64_t*)&l0[idx0 & 0x1FFFF0], _mm_xor_si128(bx0, cx0)); |
|||
cryptonight_monero_tweak((uint64_t*)&l1[idx1 & 0x1FFFF0], _mm_xor_si128(bx1, cx1)); |
|||
|
|||
idx0 = EXTRACT64(cx0); |
|||
idx1 = EXTRACT64(cx1); |
|||
|
|||
bx0 = cx0; |
|||
bx1 = cx1; |
|||
|
|||
uint64_t hi, lo, cl, ch; |
|||
cl = ((uint64_t*) &l0[idx0 & 0x1FFFF0])[0]; |
|||
ch = ((uint64_t*) &l0[idx0 & 0x1FFFF0])[1]; |
|||
lo = _umul128(idx0, cl, &hi); |
|||
|
|||
al0 += hi; |
|||
ah0 += lo; |
|||
|
|||
((uint64_t*) &l0[idx0 & 0x1FFFF0])[0] = al0; |
|||
((uint64_t*) &l0[idx0 & 0x1FFFF0])[1] = ah0 ^ tweak1_2_0; |
|||
|
|||
ah0 ^= ch; |
|||
al0 ^= cl; |
|||
idx0 = al0; |
|||
|
|||
cl = ((uint64_t*) &l1[idx1 & 0x1FFFF0])[0]; |
|||
ch = ((uint64_t*) &l1[idx1 & 0x1FFFF0])[1]; |
|||
lo = _umul128(idx1, cl, &hi); |
|||
|
|||
al1 += hi; |
|||
ah1 += lo; |
|||
|
|||
((uint64_t*) &l1[idx1 & 0x1FFFF0])[0] = al1; |
|||
((uint64_t*) &l1[idx1 & 0x1FFFF0])[1] = ah1 ^ tweak1_2_1; |
|||
|
|||
ah1 ^= ch; |
|||
al1 ^= cl; |
|||
idx1 = al1; |
|||
} |
|||
|
|||
cn_implode_scratchpad((__m128i*) l0, (__m128i*) h0); |
|||
cn_implode_scratchpad((__m128i*) l1, (__m128i*) h1); |
|||
|
|||
keccakf(h0, 24); |
|||
keccakf(h1, 24); |
|||
|
|||
extra_hashes[ctx[0]->state[0] & 3](ctx[0]->state, 200, output); |
|||
extra_hashes[ctx[1]->state[0] & 3](ctx[1]->state, 200, output + 32); |
|||
} |
|||
|
|||
|
|||
void cryptonight_av2_v2(const uint8_t *restrict input, size_t size, uint8_t *restrict output, struct cryptonight_ctx **restrict ctx) |
|||
{ |
|||
keccak(input, size, ctx[0]->state, 200); |
|||
keccak(input + size, size, ctx[1]->state, 200); |
|||
|
|||
const uint8_t* l0 = ctx[0]->memory; |
|||
const uint8_t* l1 = ctx[1]->memory; |
|||
uint64_t* h0 = (uint64_t*) ctx[0]->state; |
|||
uint64_t* h1 = (uint64_t*) ctx[1]->state; |
|||
|
|||
VARIANT2_INIT(0); |
|||
VARIANT2_INIT(1); |
|||
VARIANT2_SET_ROUNDING_MODE(); |
|||
|
|||
cn_explode_scratchpad((__m128i*) h0, (__m128i*) l0); |
|||
cn_explode_scratchpad((__m128i*) h1, (__m128i*) l1); |
|||
|
|||
uint64_t al0 = h0[0] ^ h0[4]; |
|||
uint64_t al1 = h1[0] ^ h1[4]; |
|||
uint64_t ah0 = h0[1] ^ h0[5]; |
|||
uint64_t ah1 = h1[1] ^ h1[5]; |
|||
|
|||
__m128i bx00 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]); |
|||
__m128i bx01 = _mm_set_epi64x(h0[9] ^ h0[11], h0[8] ^ h0[10]); |
|||
__m128i bx10 = _mm_set_epi64x(h1[3] ^ h1[7], h1[2] ^ h1[6]); |
|||
__m128i bx11 = _mm_set_epi64x(h1[9] ^ h1[11], h1[8] ^ h1[10]); |
|||
|
|||
uint64_t idx0 = al0; |
|||
uint64_t idx1 = al1; |
|||
|
|||
for (size_t i = 0; __builtin_expect(i < 0x80000, 1); i++) { |
|||
__m128i cx0 = _mm_load_si128((__m128i *) &l0[idx0 & 0x1FFFF0]); |
|||
__m128i cx1 = _mm_load_si128((__m128i *) &l1[idx1 & 0x1FFFF0]); |
|||
|
|||
const __m128i ax0 = _mm_set_epi64x(ah0, al0); |
|||
const __m128i ax1 = _mm_set_epi64x(ah1, al1); |
|||
|
|||
cx0 = _mm_aesenc_si128(cx0, ax0); |
|||
cx1 = _mm_aesenc_si128(cx1, ax1); |
|||
|
|||
VARIANT2_SHUFFLE(l0, idx0 & 0x1FFFF0, ax0, bx00, bx01); |
|||
_mm_store_si128((__m128i *) &l0[idx0 & 0x1FFFF0], _mm_xor_si128(bx00, cx0)); |
|||
|
|||
VARIANT2_SHUFFLE(l1, idx1 & 0x1FFFF0, ax1, bx10, bx11); |
|||
_mm_store_si128((__m128i *) &l1[idx1 & 0x1FFFF0], _mm_xor_si128(bx10, cx1)); |
|||
|
|||
idx0 = _mm_cvtsi128_si64(cx0); |
|||
idx1 = _mm_cvtsi128_si64(cx1); |
|||
|
|||
uint64_t hi, lo, cl, ch; |
|||
cl = ((uint64_t*) &l0[idx0 & 0x1FFFF0])[0]; |
|||
ch = ((uint64_t*) &l0[idx0 & 0x1FFFF0])[1]; |
|||
|
|||
VARIANT2_INTEGER_MATH(0, cl, cx0); |
|||
lo = _umul128(idx0, cl, &hi); |
|||
VARIANT2_SHUFFLE2(l0, idx0 & 0x1FFFF0, ax0, bx00, bx01, hi, lo); |
|||
|
|||
al0 += hi; |
|||
ah0 += lo; |
|||
|
|||
((uint64_t*)&l0[idx0 & 0x1FFFF0])[0] = al0; |
|||
((uint64_t*)&l0[idx0 & 0x1FFFF0])[1] = ah0; |
|||
|
|||
al0 ^= cl; |
|||
ah0 ^= ch; |
|||
idx0 = al0; |
|||
|
|||
cl = ((uint64_t*) &l1[idx1 & 0x1FFFF0])[0]; |
|||
ch = ((uint64_t*) &l1[idx1 & 0x1FFFF0])[1]; |
|||
|
|||
VARIANT2_INTEGER_MATH(1, cl, cx1); |
|||
lo = _umul128(idx1, cl, &hi); |
|||
VARIANT2_SHUFFLE2(l1, idx1 & 0x1FFFF0, ax1, bx10, bx11, hi, lo); |
|||
|
|||
al1 += hi; |
|||
ah1 += lo; |
|||
|
|||
((uint64_t*)&l1[idx1 & 0x1FFFF0])[0] = al1; |
|||
((uint64_t*)&l1[idx1 & 0x1FFFF0])[1] = ah1; |
|||
|
|||
al1 ^= cl; |
|||
ah1 ^= ch; |
|||
idx1 = al1; |
|||
|
|||
bx01 = bx00; |
|||
bx11 = bx10; |
|||
|
|||
bx00 = cx0; |
|||
bx10 = cx1; |
|||
} |
|||
|
|||
cn_implode_scratchpad((__m128i*) l0, (__m128i*) h0); |
|||
cn_implode_scratchpad((__m128i*) l1, (__m128i*) h1); |
|||
|
|||
keccakf(h0, 24); |
|||
keccakf(h1, 24); |
|||
|
|||
extra_hashes[ctx[0]->state[0] & 3](ctx[0]->state, 200, output); |
|||
extra_hashes[ctx[1]->state[0] & 3](ctx[1]->state, 200, output + 32); |
|||
} |
@ -1,123 +0,0 @@ |
|||
/* XMRig
|
|||
* Copyright 2010 Jeff Garzik <jgarzik@pobox.com> |
|||
* Copyright 2012-2014 pooler <pooler@litecoinpool.org> |
|||
* Copyright 2014 Lucas Jones <https://github.com/lucasjones>
|
|||
* Copyright 2014-2016 Wolf9466 <https://github.com/OhGodAPet>
|
|||
* Copyright 2016 Jay D Dee <jayddee246@gmail.com> |
|||
* Copyright 2017 fireice-uk <https://github.com/fireice-uk>
|
|||
* Copyright 2017-2018 XMR-Stak <https://github.com/fireice-uk>, <https://github.com/psychocrypt>
|
|||
* Copyright 2018 Lee Clagett <https://github.com/vtnerd>
|
|||
* Copyright 2016-2018 XMRig <https://github.com/xmrig>, <support@xmrig.com>
|
|||
* |
|||
* This program is free software: you can redistribute it and/or modify |
|||
* it under the terms of the GNU General Public License as published by |
|||
* the Free Software Foundation, either version 3 of the License, or |
|||
* (at your option) any later version. |
|||
* |
|||
* This program is distributed in the hope that it will be useful, |
|||
* but WITHOUT ANY WARRANTY; without even the implied warranty of |
|||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
|||
* GNU General Public License for more details. |
|||
* |
|||
* You should have received a copy of the GNU General Public License |
|||
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|||
*/ |
|||
|
|||
#include <x86intrin.h> |
|||
#include <string.h> |
|||
|
|||
#include "crypto/c_keccak.h" |
|||
#include "cryptonight.h" |
|||
#include "cryptonight_aesni.h" |
|||
#include "cryptonight_monero.h" |
|||
|
|||
|
|||
void cryptonight_av2_aesni_double(const void *restrict input, size_t size, void *restrict output, struct cryptonight_ctx *restrict ctx, uint8_t version) |
|||
{ |
|||
keccak((const uint8_t *) input, size, ctx->state0, 200); |
|||
keccak((const uint8_t *) input + size, size, ctx->state1, 200); |
|||
|
|||
VARIANT1_INIT(0); |
|||
VARIANT1_INIT(1); |
|||
|
|||
const uint8_t* l0 = ctx->memory; |
|||
const uint8_t* l1 = ctx->memory + MEMORY; |
|||
uint64_t* h0 = (uint64_t*) ctx->state0; |
|||
uint64_t* h1 = (uint64_t*) ctx->state1; |
|||
|
|||
cn_explode_scratchpad((__m128i*) h0, (__m128i*) l0); |
|||
cn_explode_scratchpad((__m128i*) h1, (__m128i*) l1); |
|||
|
|||
uint64_t al0 = h0[0] ^ h0[4]; |
|||
uint64_t al1 = h1[0] ^ h1[4]; |
|||
uint64_t ah0 = h0[1] ^ h0[5]; |
|||
uint64_t ah1 = h1[1] ^ h1[5]; |
|||
|
|||
__m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]); |
|||
__m128i bx1 = _mm_set_epi64x(h1[3] ^ h1[7], h1[2] ^ h1[6]); |
|||
|
|||
uint64_t idx0 = h0[0] ^ h0[4]; |
|||
uint64_t idx1 = h1[0] ^ h1[4]; |
|||
|
|||
for (size_t i = 0; __builtin_expect(i < 0x80000, 1); i++) { |
|||
__m128i cx0 = _mm_load_si128((__m128i *) &l0[idx0 & 0x1FFFF0]); |
|||
__m128i cx1 = _mm_load_si128((__m128i *) &l1[idx1 & 0x1FFFF0]); |
|||
|
|||
cx0 = _mm_aesenc_si128(cx0, _mm_set_epi64x(ah0, al0)); |
|||
cx1 = _mm_aesenc_si128(cx1, _mm_set_epi64x(ah1, al1)); |
|||
|
|||
_mm_store_si128((__m128i *) &l0[idx0 & 0x1FFFF0], _mm_xor_si128(bx0, cx0)); |
|||
_mm_store_si128((__m128i *) &l1[idx1 & 0x1FFFF0], _mm_xor_si128(bx1, cx1)); |
|||
|
|||
VARIANT1_1(&l0[idx0 & 0x1FFFF0]); |
|||
VARIANT1_1(&l1[idx1 & 0x1FFFF0]); |
|||
|
|||
idx0 = EXTRACT64(cx0); |
|||
idx1 = EXTRACT64(cx1); |
|||
|
|||
bx0 = cx0; |
|||
bx1 = cx1; |
|||
|
|||
uint64_t hi, lo, cl, ch; |
|||
cl = ((uint64_t*) &l0[idx0 & 0x1FFFF0])[0]; |
|||
ch = ((uint64_t*) &l0[idx0 & 0x1FFFF0])[1]; |
|||
lo = _umul128(idx0, cl, &hi); |
|||
|
|||
al0 += hi; |
|||
ah0 += lo; |
|||
|
|||
VARIANT1_2(ah0, 0); |
|||
((uint64_t*) &l0[idx0 & 0x1FFFF0])[0] = al0; |
|||
((uint64_t*) &l0[idx0 & 0x1FFFF0])[1] = ah0; |
|||
VARIANT1_2(ah0, 0); |
|||
|
|||
ah0 ^= ch; |
|||
al0 ^= cl; |
|||
idx0 = al0; |
|||
|
|||
cl = ((uint64_t*) &l1[idx1 & 0x1FFFF0])[0]; |
|||
ch = ((uint64_t*) &l1[idx1 & 0x1FFFF0])[1]; |
|||
lo = _umul128(idx1, cl, &hi); |
|||
|
|||
al1 += hi; |
|||
ah1 += lo; |
|||
|
|||
VARIANT1_2(ah1, 1); |
|||
((uint64_t*) &l1[idx1 & 0x1FFFF0])[0] = al1; |
|||
((uint64_t*) &l1[idx1 & 0x1FFFF0])[1] = ah1; |
|||
VARIANT1_2(ah1, 1); |
|||
|
|||
ah1 ^= ch; |
|||
al1 ^= cl; |
|||
idx1 = al1; |
|||
} |
|||
|
|||
cn_implode_scratchpad((__m128i*) l0, (__m128i*) h0); |
|||
cn_implode_scratchpad((__m128i*) l1, (__m128i*) h1); |
|||
|
|||
keccakf(h0, 24); |
|||
keccakf(h1, 24); |
|||
|
|||
extra_hashes[ctx->state0[0] & 3](ctx->state0, 200, output); |
|||
extra_hashes[ctx->state1[0] & 3](ctx->state1, 200, (char*) output + 32); |
|||
} |
@ -0,0 +1,193 @@ |
|||
/* XMRig
|
|||
* Copyright 2010 Jeff Garzik <jgarzik@pobox.com> |
|||
* Copyright 2012-2014 pooler <pooler@litecoinpool.org> |
|||
* Copyright 2014 Lucas Jones <https://github.com/lucasjones>
|
|||
* Copyright 2014-2016 Wolf9466 <https://github.com/OhGodAPet>
|
|||
* Copyright 2016 Jay D Dee <jayddee246@gmail.com> |
|||
* Copyright 2017 fireice-uk <https://github.com/fireice-uk>
|
|||
* Copyright 2017-2018 XMR-Stak <https://github.com/fireice-uk>, <https://github.com/psychocrypt>
|
|||
* Copyright 2018 Lee Clagett <https://github.com/vtnerd>
|
|||
* Copyright 2016-2018 XMRig <https://github.com/xmrig>, <support@xmrig.com>
|
|||
* |
|||
* This program is free software: you can redistribute it and/or modify |
|||
* it under the terms of the GNU General Public License as published by |
|||
* the Free Software Foundation, either version 3 of the License, or |
|||
* (at your option) any later version. |
|||
* |
|||
* This program is distributed in the hope that it will be useful, |
|||
* but WITHOUT ANY WARRANTY; without even the implied warranty of |
|||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
|||
* GNU General Public License for more details. |
|||
* |
|||
* You should have received a copy of the GNU General Public License |
|||
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|||
*/ |
|||
|
|||
#include <x86intrin.h> |
|||
#include <string.h> |
|||
|
|||
#include "crypto/c_keccak.h" |
|||
#include "cryptonight.h" |
|||
#include "cryptonight_monero.h" |
|||
#include "cryptonight_softaes.h" |
|||
|
|||
|
|||
void cryptonight_av3_v0(const uint8_t *restrict input, size_t size, uint8_t *restrict output, struct cryptonight_ctx **restrict ctx) |
|||
{ |
|||
keccak(input, size, ctx[0]->state, 200); |
|||
|
|||
cn_explode_scratchpad((__m128i*) ctx[0]->state, (__m128i*) ctx[0]->memory); |
|||
|
|||
const uint8_t* l0 = ctx[0]->memory; |
|||
uint64_t* h0 = (uint64_t*) ctx[0]->state; |
|||
|
|||
uint64_t al0 = h0[0] ^ h0[4]; |
|||
uint64_t ah0 = h0[1] ^ h0[5]; |
|||
__m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]); |
|||
|
|||
uint64_t idx0 = h0[0] ^ h0[4]; |
|||
|
|||
for (size_t i = 0; __builtin_expect(i < 0x80000, 1); i++) { |
|||
__m128i cx; |
|||
cx = _mm_load_si128((__m128i *) &l0[idx0 & 0x1FFFF0]); |
|||
cx = soft_aesenc(cx, _mm_set_epi64x(ah0, al0)); |
|||
|
|||
_mm_store_si128((__m128i *) &l0[idx0 & 0x1FFFF0], _mm_xor_si128(bx0, cx)); |
|||
idx0 = EXTRACT64(cx); |
|||
bx0 = cx; |
|||
|
|||
uint64_t hi, lo, cl, ch; |
|||
cl = ((uint64_t*) &l0[idx0 & 0x1FFFF0])[0]; |
|||
ch = ((uint64_t*) &l0[idx0 & 0x1FFFF0])[1]; |
|||
lo = _umul128(idx0, cl, &hi); |
|||
|
|||
al0 += hi; |
|||
ah0 += lo; |
|||
|
|||
((uint64_t*)&l0[idx0 & 0x1FFFF0])[0] = al0; |
|||
((uint64_t*)&l0[idx0 & 0x1FFFF0])[1] = ah0; |
|||
|
|||
ah0 ^= ch; |
|||
al0 ^= cl; |
|||
idx0 = al0; |
|||
} |
|||
|
|||
cn_implode_scratchpad((__m128i*) ctx[0]->memory, (__m128i*) ctx[0]->state); |
|||
|
|||
keccakf(h0, 24); |
|||
extra_hashes[ctx[0]->state[0] & 3](ctx[0]->state, 200, output); |
|||
} |
|||
|
|||
|
|||
void cryptonight_av3_v1(const uint8_t *restrict input, size_t size, uint8_t *restrict output, struct cryptonight_ctx **restrict ctx) |
|||
{ |
|||
if (size < 43) { |
|||
memset(output, 0, 32); |
|||
return; |
|||
} |
|||
|
|||
keccak(input, size, ctx[0]->state, 200); |
|||
|
|||
VARIANT1_INIT(0); |
|||
|
|||
cn_explode_scratchpad((__m128i*) ctx[0]->state, (__m128i*) ctx[0]->memory); |
|||
|
|||
const uint8_t* l0 = ctx[0]->memory; |
|||
uint64_t* h0 = (uint64_t*) ctx[0]->state; |
|||
|
|||
uint64_t al0 = h0[0] ^ h0[4]; |
|||
uint64_t ah0 = h0[1] ^ h0[5]; |
|||
__m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]); |
|||
|
|||
uint64_t idx0 = h0[0] ^ h0[4]; |
|||
|
|||
for (size_t i = 0; __builtin_expect(i < 0x80000, 1); i++) { |
|||
__m128i cx; |
|||
cx = _mm_load_si128((__m128i *) &l0[idx0 & 0x1FFFF0]); |
|||
cx = soft_aesenc(cx, _mm_set_epi64x(ah0, al0)); |
|||
|
|||
cryptonight_monero_tweak((uint64_t*)&l0[idx0 & 0x1FFFF0], _mm_xor_si128(bx0, cx)); |
|||
|
|||
idx0 = EXTRACT64(cx); |
|||
bx0 = cx; |
|||
|
|||
uint64_t hi, lo, cl, ch; |
|||
cl = ((uint64_t*) &l0[idx0 & 0x1FFFF0])[0]; |
|||
ch = ((uint64_t*) &l0[idx0 & 0x1FFFF0])[1]; |
|||
lo = _umul128(idx0, cl, &hi); |
|||
|
|||
al0 += hi; |
|||
ah0 += lo; |
|||
|
|||
((uint64_t*)&l0[idx0 & 0x1FFFF0])[0] = al0; |
|||
((uint64_t*)&l0[idx0 & 0x1FFFF0])[1] = ah0 ^ tweak1_2_0; |
|||
|
|||
ah0 ^= ch; |
|||
al0 ^= cl; |
|||
idx0 = al0; |
|||
} |
|||
|
|||
cn_implode_scratchpad((__m128i*) ctx[0]->memory, (__m128i*) ctx[0]->state); |
|||
|
|||
keccakf(h0, 24); |
|||
extra_hashes[ctx[0]->state[0] & 3](ctx[0]->state, 200, output); |
|||
} |
|||
|
|||
|
|||
void cryptonight_av3_v2(const uint8_t *restrict input, size_t size, uint8_t *restrict output, struct cryptonight_ctx **restrict ctx) |
|||
{ |
|||
keccak(input, size, ctx[0]->state, 200); |
|||
|
|||
cn_explode_scratchpad((__m128i*) ctx[0]->state, (__m128i*) ctx[0]->memory); |
|||
|
|||
const uint8_t* l0 = ctx[0]->memory; |
|||
uint64_t* h0 = (uint64_t*) ctx[0]->state; |
|||
|
|||
VARIANT2_INIT(0); |
|||
VARIANT2_SET_ROUNDING_MODE(); |
|||
|
|||
uint64_t al0 = h0[0] ^ h0[4]; |
|||
uint64_t ah0 = h0[1] ^ h0[5]; |
|||
__m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]); |
|||
__m128i bx1 = _mm_set_epi64x(h0[9] ^ h0[11], h0[8] ^ h0[10]); |
|||
|
|||
uint64_t idx0 = al0; |
|||
|
|||
for (size_t i = 0; __builtin_expect(i < 0x80000, 1); i++) { |
|||
__m128i cx = _mm_load_si128((__m128i *) &l0[idx0 & 0x1FFFF0]); |
|||
const __m128i ax0 = _mm_set_epi64x(ah0, al0); |
|||
|
|||
cx = soft_aesenc(cx, ax0); |
|||
|
|||
VARIANT2_SHUFFLE(l0, idx0 & 0x1FFFF0, ax0, bx0, bx1); |
|||
_mm_store_si128((__m128i *) &l0[idx0 & 0x1FFFF0], _mm_xor_si128(bx0, cx)); |
|||
|
|||
idx0 = _mm_cvtsi128_si64(cx); |
|||
|
|||
uint64_t hi, lo, cl, ch; |
|||
cl = ((uint64_t*) &l0[idx0 & 0x1FFFF0])[0]; |
|||
ch = ((uint64_t*) &l0[idx0 & 0x1FFFF0])[1]; |
|||
|
|||
VARIANT2_INTEGER_MATH(0, cl, cx); |
|||
lo = _umul128(idx0, cl, &hi); |
|||
VARIANT2_SHUFFLE2(l0, idx0 & 0x1FFFF0, ax0, bx0, bx1, hi, lo); |
|||
|
|||
al0 += hi; |
|||
ah0 += lo; |
|||
|
|||
((uint64_t*)&l0[idx0 & 0x1FFFF0])[0] = al0; |
|||
((uint64_t*)&l0[idx0 & 0x1FFFF0])[1] = ah0; |
|||
|
|||
al0 ^= cl; |
|||
ah0 ^= ch; |
|||
idx0 = al0; |
|||
|
|||
bx1 = bx0; |
|||
bx0 = cx; |
|||
} |
|||
|
|||
cn_implode_scratchpad((__m128i*) ctx[0]->memory, (__m128i*) ctx[0]->state); |
|||
|
|||
keccakf(h0, 24); |
|||
extra_hashes[ctx[0]->state[0] & 3](ctx[0]->state, 200, output); |
|||
} |
@ -1,84 +0,0 @@ |
|||
/* XMRig
|
|||
* Copyright 2010 Jeff Garzik <jgarzik@pobox.com> |
|||
* Copyright 2012-2014 pooler <pooler@litecoinpool.org> |
|||
* Copyright 2014 Lucas Jones <https://github.com/lucasjones>
|
|||
* Copyright 2014-2016 Wolf9466 <https://github.com/OhGodAPet>
|
|||
* Copyright 2016 Jay D Dee <jayddee246@gmail.com> |
|||
* Copyright 2017 fireice-uk <https://github.com/fireice-uk>
|
|||
* Copyright 2017-2018 XMR-Stak <https://github.com/fireice-uk>, <https://github.com/psychocrypt>
|
|||
* Copyright 2018 Lee Clagett <https://github.com/vtnerd>
|
|||
* Copyright 2016-2018 XMRig <https://github.com/xmrig>, <support@xmrig.com>
|
|||
* |
|||
* This program is free software: you can redistribute it and/or modify |
|||
* it under the terms of the GNU General Public License as published by |
|||
* the Free Software Foundation, either version 3 of the License, or |
|||
* (at your option) any later version. |
|||
* |
|||
* This program is distributed in the hope that it will be useful, |
|||
* but WITHOUT ANY WARRANTY; without even the implied warranty of |
|||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
|||
* GNU General Public License for more details. |
|||
* |
|||
* You should have received a copy of the GNU General Public License |
|||
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|||
*/ |
|||
|
|||
#include <x86intrin.h> |
|||
#include <string.h> |
|||
|
|||
#include "crypto/c_keccak.h" |
|||
#include "cryptonight.h" |
|||
#include "cryptonight_monero.h" |
|||
#include "cryptonight_softaes.h" |
|||
|
|||
|
|||
void cryptonight_av3_softaes(const void *restrict input, size_t size, void *restrict output, struct cryptonight_ctx *restrict ctx, uint8_t version) |
|||
{ |
|||
keccak((const uint8_t *) input, size, ctx->state0, 200); |
|||
|
|||
VARIANT1_INIT(0); |
|||
|
|||
cn_explode_scratchpad((__m128i*) ctx->state0, (__m128i*) ctx->memory); |
|||
|
|||
const uint8_t* l0 = ctx->memory; |
|||
uint64_t* h0 = (uint64_t*) ctx->state0; |
|||
|
|||
uint64_t al0 = h0[0] ^ h0[4]; |
|||
uint64_t ah0 = h0[1] ^ h0[5]; |
|||
__m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]); |
|||
|
|||
uint64_t idx0 = h0[0] ^ h0[4]; |
|||
|
|||
for (size_t i = 0; __builtin_expect(i < 0x80000, 1); i++) { |
|||
__m128i cx; |
|||
cx = _mm_load_si128((__m128i *)&l0[idx0 & 0x1FFFF0]); |
|||
cx = soft_aesenc(cx, _mm_set_epi64x(ah0, al0)); |
|||
|
|||
_mm_store_si128((__m128i *)&l0[idx0 & 0x1FFFF0], _mm_xor_si128(bx0, cx)); |
|||
VARIANT1_1(&l0[idx0 & 0x1FFFF0]); |
|||
idx0 = EXTRACT64(cx); |
|||
bx0 = cx; |
|||
|
|||
uint64_t hi, lo, cl, ch; |
|||
cl = ((uint64_t*)&l0[idx0 & 0x1FFFF0])[0]; |
|||
ch = ((uint64_t*)&l0[idx0 & 0x1FFFF0])[1]; |
|||
lo = _umul128(idx0, cl, &hi); |
|||
|
|||
al0 += hi; |
|||
ah0 += lo; |
|||
|
|||
VARIANT1_2(ah0, 0); |
|||
((uint64_t*)&l0[idx0 & 0x1FFFF0])[0] = al0; |
|||
((uint64_t*)&l0[idx0 & 0x1FFFF0])[1] = ah0; |
|||
VARIANT1_2(ah0, 0); |
|||
|
|||
ah0 ^= ch; |
|||
al0 ^= cl; |
|||
idx0 = al0; |
|||
} |
|||
|
|||
cn_implode_scratchpad((__m128i*) ctx->memory, (__m128i*) ctx->state0); |
|||
|
|||
keccakf(h0, 24); |
|||
extra_hashes[ctx->state0[0] & 3](ctx->state0, 200, output); |
|||
} |
@ -0,0 +1,304 @@ |
|||
/* XMRig
|
|||
* Copyright 2010 Jeff Garzik <jgarzik@pobox.com> |
|||
* Copyright 2012-2014 pooler <pooler@litecoinpool.org> |
|||
* Copyright 2014 Lucas Jones <https://github.com/lucasjones>
|
|||
* Copyright 2014-2016 Wolf9466 <https://github.com/OhGodAPet>
|
|||
* Copyright 2016 Jay D Dee <jayddee246@gmail.com> |
|||
* Copyright 2017 fireice-uk <https://github.com/fireice-uk>
|
|||
* Copyright 2017-2018 XMR-Stak <https://github.com/fireice-uk>, <https://github.com/psychocrypt>
|
|||
* Copyright 2018 Lee Clagett <https://github.com/vtnerd>
|
|||
* Copyright 2016-2018 XMRig <https://github.com/xmrig>, <support@xmrig.com>
|
|||
* |
|||
* This program is free software: you can redistribute it and/or modify |
|||
* it under the terms of the GNU General Public License as published by |
|||
* the Free Software Foundation, either version 3 of the License, or |
|||
* (at your option) any later version. |
|||
* |
|||
* This program is distributed in the hope that it will be useful, |
|||
* but WITHOUT ANY WARRANTY; without even the implied warranty of |
|||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
|||
* GNU General Public License for more details. |
|||
* |
|||
* You should have received a copy of the GNU General Public License |
|||
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|||
*/ |
|||
|
|||
#include <x86intrin.h> |
|||
#include <string.h> |
|||
|
|||
#include "crypto/c_keccak.h" |
|||
#include "cryptonight.h" |
|||
#include "cryptonight_monero.h" |
|||
#include "cryptonight_softaes.h" |
|||
|
|||
|
|||
void cryptonight_av4_v0(const uint8_t *restrict input, size_t size, uint8_t *restrict output, struct cryptonight_ctx **restrict ctx) |
|||
{ |
|||
keccak(input, size, ctx[0]->state, 200); |
|||
keccak(input + size, size, ctx[1]->state, 200); |
|||
|
|||
const uint8_t* l0 = ctx[0]->memory; |
|||
const uint8_t* l1 = ctx[1]->memory; |
|||
uint64_t* h0 = (uint64_t*) ctx[0]->state; |
|||
uint64_t* h1 = (uint64_t*) ctx[1]->state; |
|||
|
|||
cn_explode_scratchpad((__m128i*) h0, (__m128i*) l0); |
|||
cn_explode_scratchpad((__m128i*) h1, (__m128i*) l1); |
|||
|
|||
uint64_t al0 = h0[0] ^ h0[4]; |
|||
uint64_t al1 = h1[0] ^ h1[4]; |
|||
uint64_t ah0 = h0[1] ^ h0[5]; |
|||
uint64_t ah1 = h1[1] ^ h1[5]; |
|||
|
|||
__m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]); |
|||
__m128i bx1 = _mm_set_epi64x(h1[3] ^ h1[7], h1[2] ^ h1[6]); |
|||
|
|||
uint64_t idx0 = h0[0] ^ h0[4]; |
|||
uint64_t idx1 = h1[0] ^ h1[4]; |
|||
|
|||
for (size_t i = 0; __builtin_expect(i < 0x80000, 1); i++) { |
|||
__m128i cx0 = _mm_load_si128((__m128i *) &l0[idx0 & 0x1FFFF0]); |
|||
__m128i cx1 = _mm_load_si128((__m128i *) &l1[idx1 & 0x1FFFF0]); |
|||
|
|||
cx0 = soft_aesenc(cx0, _mm_set_epi64x(ah0, al0)); |
|||
cx1 = soft_aesenc(cx1, _mm_set_epi64x(ah1, al1)); |
|||
|
|||
_mm_store_si128((__m128i *) &l0[idx0 & 0x1FFFF0], _mm_xor_si128(bx0, cx0)); |
|||
_mm_store_si128((__m128i *) &l1[idx1 & 0x1FFFF0], _mm_xor_si128(bx1, cx1)); |
|||
|
|||
idx0 = EXTRACT64(cx0); |
|||
idx1 = EXTRACT64(cx1); |
|||
|
|||
bx0 = cx0; |
|||
bx1 = cx1; |
|||
|
|||
uint64_t hi, lo, cl, ch; |
|||
cl = ((uint64_t*) &l0[idx0 & 0x1FFFF0])[0]; |
|||
ch = ((uint64_t*) &l0[idx0 & 0x1FFFF0])[1]; |
|||
lo = _umul128(idx0, cl, &hi); |
|||
|
|||
al0 += hi; |
|||
ah0 += lo; |
|||
|
|||
((uint64_t*) &l0[idx0 & 0x1FFFF0])[0] = al0; |
|||
((uint64_t*) &l0[idx0 & 0x1FFFF0])[1] = ah0; |
|||
|
|||
ah0 ^= ch; |
|||
al0 ^= cl; |
|||
idx0 = al0; |
|||
|
|||
cl = ((uint64_t*) &l1[idx1 & 0x1FFFF0])[0]; |
|||
ch = ((uint64_t*) &l1[idx1 & 0x1FFFF0])[1]; |
|||
lo = _umul128(idx1, cl, &hi); |
|||
|
|||
al1 += hi; |
|||
ah1 += lo; |
|||
|
|||
((uint64_t*) &l1[idx1 & 0x1FFFF0])[0] = al1; |
|||
((uint64_t*) &l1[idx1 & 0x1FFFF0])[1] = ah1; |
|||
|
|||
ah1 ^= ch; |
|||
al1 ^= cl; |
|||
idx1 = al1; |
|||
} |
|||
|
|||
cn_implode_scratchpad((__m128i*) l0, (__m128i*) h0); |
|||
cn_implode_scratchpad((__m128i*) l1, (__m128i*) h1); |
|||
|
|||
keccakf(h0, 24); |
|||
keccakf(h1, 24); |
|||
|
|||
extra_hashes[ctx[0]->state[0] & 3](ctx[0]->state, 200, output); |
|||
extra_hashes[ctx[1]->state[0] & 3](ctx[1]->state, 200, output + 32); |
|||
} |
|||
|
|||
|
|||
void cryptonight_av4_v1(const uint8_t *restrict input, size_t size, uint8_t *restrict output, struct cryptonight_ctx **restrict ctx) |
|||
{ |
|||
if (size < 43) { |
|||
memset(output, 0, 64); |
|||
return; |
|||
} |
|||
|
|||
keccak(input, size, ctx[0]->state, 200); |
|||
keccak(input + size, size, ctx[1]->state, 200); |
|||
|
|||
VARIANT1_INIT(0); |
|||
VARIANT1_INIT(1); |
|||
|
|||
const uint8_t* l0 = ctx[0]->memory; |
|||
const uint8_t* l1 = ctx[1]->memory; |
|||
uint64_t* h0 = (uint64_t*) ctx[0]->state; |
|||
uint64_t* h1 = (uint64_t*) ctx[1]->state; |
|||
|
|||
cn_explode_scratchpad((__m128i*) h0, (__m128i*) l0); |
|||
cn_explode_scratchpad((__m128i*) h1, (__m128i*) l1); |
|||
|
|||
uint64_t al0 = h0[0] ^ h0[4]; |
|||
uint64_t al1 = h1[0] ^ h1[4]; |
|||
uint64_t ah0 = h0[1] ^ h0[5]; |
|||
uint64_t ah1 = h1[1] ^ h1[5]; |
|||
|
|||
__m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]); |
|||
__m128i bx1 = _mm_set_epi64x(h1[3] ^ h1[7], h1[2] ^ h1[6]); |
|||
|
|||
uint64_t idx0 = h0[0] ^ h0[4]; |
|||
uint64_t idx1 = h1[0] ^ h1[4]; |
|||
|
|||
for (size_t i = 0; __builtin_expect(i < 0x80000, 1); i++) { |
|||
__m128i cx0 = _mm_load_si128((__m128i *) &l0[idx0 & 0x1FFFF0]); |
|||
__m128i cx1 = _mm_load_si128((__m128i *) &l1[idx1 & 0x1FFFF0]); |
|||
|
|||
cx0 = soft_aesenc(cx0, _mm_set_epi64x(ah0, al0)); |
|||
cx1 = soft_aesenc(cx1, _mm_set_epi64x(ah1, al1)); |
|||
|
|||
cryptonight_monero_tweak((uint64_t*)&l0[idx0 & 0x1FFFF0], _mm_xor_si128(bx0, cx0)); |
|||
cryptonight_monero_tweak((uint64_t*)&l1[idx1 & 0x1FFFF0], _mm_xor_si128(bx1, cx1)); |
|||
|
|||
idx0 = EXTRACT64(cx0); |
|||
idx1 = EXTRACT64(cx1); |
|||
|
|||
bx0 = cx0; |
|||
bx1 = cx1; |
|||
|
|||
uint64_t hi, lo, cl, ch; |
|||
cl = ((uint64_t*) &l0[idx0 & 0x1FFFF0])[0]; |
|||
ch = ((uint64_t*) &l0[idx0 & 0x1FFFF0])[1]; |
|||
lo = _umul128(idx0, cl, &hi); |
|||
|
|||
al0 += hi; |
|||
ah0 += lo; |
|||
|
|||
((uint64_t*) &l0[idx0 & 0x1FFFF0])[0] = al0; |
|||
((uint64_t*) &l0[idx0 & 0x1FFFF0])[1] = ah0 ^ tweak1_2_0; |
|||
|
|||
ah0 ^= ch; |
|||
al0 ^= cl; |
|||
idx0 = al0; |
|||
|
|||
cl = ((uint64_t*) &l1[idx1 & 0x1FFFF0])[0]; |
|||
ch = ((uint64_t*) &l1[idx1 & 0x1FFFF0])[1]; |
|||
lo = _umul128(idx1, cl, &hi); |
|||
|
|||
al1 += hi; |
|||
ah1 += lo; |
|||
|
|||
((uint64_t*) &l1[idx1 & 0x1FFFF0])[0] = al1; |
|||
((uint64_t*) &l1[idx1 & 0x1FFFF0])[1] = ah1 ^ tweak1_2_1; |
|||
|
|||
ah1 ^= ch; |
|||
al1 ^= cl; |
|||
idx1 = al1; |
|||
} |
|||
|
|||
cn_implode_scratchpad((__m128i*) l0, (__m128i*) h0); |
|||
cn_implode_scratchpad((__m128i*) l1, (__m128i*) h1); |
|||
|
|||
keccakf(h0, 24); |
|||
keccakf(h1, 24); |
|||
|
|||
extra_hashes[ctx[0]->state[0] & 3](ctx[0]->state, 200, output); |
|||
extra_hashes[ctx[1]->state[0] & 3](ctx[1]->state, 200, output + 32); |
|||
} |
|||
|
|||
|
|||
void cryptonight_av4_v2(const uint8_t *restrict input, size_t size, uint8_t *restrict output, struct cryptonight_ctx **restrict ctx) |
|||
{ |
|||
keccak(input, size, ctx[0]->state, 200); |
|||
keccak(input + size, size, ctx[1]->state, 200); |
|||
|
|||
const uint8_t* l0 = ctx[0]->memory; |
|||
const uint8_t* l1 = ctx[1]->memory; |
|||
uint64_t* h0 = (uint64_t*) ctx[0]->state; |
|||
uint64_t* h1 = (uint64_t*) ctx[1]->state; |
|||
|
|||
VARIANT2_INIT(0); |
|||
VARIANT2_INIT(1); |
|||
VARIANT2_SET_ROUNDING_MODE(); |
|||
|
|||
cn_explode_scratchpad((__m128i*) h0, (__m128i*) l0); |
|||
cn_explode_scratchpad((__m128i*) h1, (__m128i*) l1); |
|||
|
|||
uint64_t al0 = h0[0] ^ h0[4]; |
|||
uint64_t al1 = h1[0] ^ h1[4]; |
|||
uint64_t ah0 = h0[1] ^ h0[5]; |
|||
uint64_t ah1 = h1[1] ^ h1[5]; |
|||
|
|||
__m128i bx00 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]); |
|||
__m128i bx01 = _mm_set_epi64x(h0[9] ^ h0[11], h0[8] ^ h0[10]); |
|||
__m128i bx10 = _mm_set_epi64x(h1[3] ^ h1[7], h1[2] ^ h1[6]); |
|||
__m128i bx11 = _mm_set_epi64x(h1[9] ^ h1[11], h1[8] ^ h1[10]); |
|||
|
|||
uint64_t idx0 = al0; |
|||
uint64_t idx1 = al1; |
|||
|
|||
for (size_t i = 0; __builtin_expect(i < 0x80000, 1); i++) { |
|||
__m128i cx0 = _mm_load_si128((__m128i *) &l0[idx0 & 0x1FFFF0]); |
|||
__m128i cx1 = _mm_load_si128((__m128i *) &l1[idx1 & 0x1FFFF0]); |
|||
|
|||
const __m128i ax0 = _mm_set_epi64x(ah0, al0); |
|||
const __m128i ax1 = _mm_set_epi64x(ah1, al1); |
|||
|
|||
cx0 = soft_aesenc(cx0, ax0); |
|||
cx1 = soft_aesenc(cx1, ax1); |
|||
|
|||
VARIANT2_SHUFFLE(l0, idx0 & 0x1FFFF0, ax0, bx00, bx01); |
|||
_mm_store_si128((__m128i *) &l0[idx0 & 0x1FFFF0], _mm_xor_si128(bx00, cx0)); |
|||
|
|||
VARIANT2_SHUFFLE(l1, idx1 & 0x1FFFF0, ax1, bx10, bx11); |
|||
_mm_store_si128((__m128i *) &l1[idx1 & 0x1FFFF0], _mm_xor_si128(bx10, cx1)); |
|||
|
|||
idx0 = _mm_cvtsi128_si64(cx0); |
|||
idx1 = _mm_cvtsi128_si64(cx1); |
|||
|
|||
uint64_t hi, lo, cl, ch; |
|||
cl = ((uint64_t*) &l0[idx0 & 0x1FFFF0])[0]; |
|||
ch = ((uint64_t*) &l0[idx0 & 0x1FFFF0])[1]; |
|||
|
|||
VARIANT2_INTEGER_MATH(0, cl, cx0); |
|||
lo = _umul128(idx0, cl, &hi); |
|||
VARIANT2_SHUFFLE2(l0, idx0 & 0x1FFFF0, ax0, bx00, bx01, hi, lo); |
|||
|
|||
al0 += hi; |
|||
ah0 += lo; |
|||
|
|||
((uint64_t*)&l0[idx0 & 0x1FFFF0])[0] = al0; |
|||
((uint64_t*)&l0[idx0 & 0x1FFFF0])[1] = ah0; |
|||
|
|||
al0 ^= cl; |
|||
ah0 ^= ch; |
|||
idx0 = al0; |
|||
|
|||
cl = ((uint64_t*) &l1[idx1 & 0x1FFFF0])[0]; |
|||
ch = ((uint64_t*) &l1[idx1 & 0x1FFFF0])[1]; |
|||
|
|||
VARIANT2_INTEGER_MATH(1, cl, cx1); |
|||
lo = _umul128(idx1, cl, &hi); |
|||
VARIANT2_SHUFFLE2(l1, idx1 & 0x1FFFF0, ax1, bx10, bx11, hi, lo); |
|||
|
|||
al1 += hi; |
|||
ah1 += lo; |
|||
|
|||
((uint64_t*)&l1[idx1 & 0x1FFFF0])[0] = al1; |
|||
((uint64_t*)&l1[idx1 & 0x1FFFF0])[1] = ah1; |
|||
|
|||
al1 ^= cl; |
|||
ah1 ^= ch; |
|||
idx1 = al1; |
|||
|
|||
bx01 = bx00; |
|||
bx11 = bx10; |
|||
|
|||
bx00 = cx0; |
|||
bx10 = cx1; |
|||
} |
|||
|
|||
cn_implode_scratchpad((__m128i*) l0, (__m128i*) h0); |
|||
cn_implode_scratchpad((__m128i*) l1, (__m128i*) h1); |
|||
|
|||
keccakf(h0, 24); |
|||
keccakf(h1, 24); |
|||
|
|||
extra_hashes[ctx[0]->state[0] & 3](ctx[0]->state, 200, output); |
|||
extra_hashes[ctx[1]->state[0] & 3](ctx[1]->state, 200, output + 32); |
|||
} |
@ -1,123 +0,0 @@ |
|||
/* XMRig
|
|||
* Copyright 2010 Jeff Garzik <jgarzik@pobox.com> |
|||
* Copyright 2012-2014 pooler <pooler@litecoinpool.org> |
|||
* Copyright 2014 Lucas Jones <https://github.com/lucasjones>
|
|||
* Copyright 2014-2016 Wolf9466 <https://github.com/OhGodAPet>
|
|||
* Copyright 2016 Jay D Dee <jayddee246@gmail.com> |
|||
* Copyright 2017 fireice-uk <https://github.com/fireice-uk>
|
|||
* Copyright 2017-2018 XMR-Stak <https://github.com/fireice-uk>, <https://github.com/psychocrypt>
|
|||
* Copyright 2018 Lee Clagett <https://github.com/vtnerd>
|
|||
* Copyright 2016-2018 XMRig <https://github.com/xmrig>, <support@xmrig.com>
|
|||
* |
|||
* This program is free software: you can redistribute it and/or modify |
|||
* it under the terms of the GNU General Public License as published by |
|||
* the Free Software Foundation, either version 3 of the License, or |
|||
* (at your option) any later version. |
|||
* |
|||
* This program is distributed in the hope that it will be useful, |
|||
* but WITHOUT ANY WARRANTY; without even the implied warranty of |
|||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
|||
* GNU General Public License for more details. |
|||
* |
|||
* You should have received a copy of the GNU General Public License |
|||
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|||
*/ |
|||
|
|||
#include <x86intrin.h> |
|||
#include <string.h> |
|||
|
|||
#include "crypto/c_keccak.h" |
|||
#include "cryptonight.h" |
|||
#include "cryptonight_monero.h" |
|||
#include "cryptonight_softaes.h" |
|||
|
|||
|
|||
void cryptonight_av4_softaes_double(const void *restrict input, size_t size, void *restrict output, struct cryptonight_ctx *restrict ctx, uint8_t version) |
|||
{ |
|||
keccak((const uint8_t *) input, size, ctx->state0, 200); |
|||
keccak((const uint8_t *) input + size, size, ctx->state1, 200); |
|||
|
|||
VARIANT1_INIT(0); |
|||
VARIANT1_INIT(1); |
|||
|
|||
const uint8_t* l0 = ctx->memory; |
|||
const uint8_t* l1 = ctx->memory + MEMORY; |
|||
uint64_t* h0 = (uint64_t*) ctx->state0; |
|||
uint64_t* h1 = (uint64_t*) ctx->state1; |
|||
|
|||
cn_explode_scratchpad((__m128i*) h0, (__m128i*) l0); |
|||
cn_explode_scratchpad((__m128i*) h1, (__m128i*) l1); |
|||
|
|||
uint64_t al0 = h0[0] ^ h0[4]; |
|||
uint64_t al1 = h1[0] ^ h1[4]; |
|||
uint64_t ah0 = h0[1] ^ h0[5]; |
|||
uint64_t ah1 = h1[1] ^ h1[5]; |
|||
|
|||
__m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]); |
|||
__m128i bx1 = _mm_set_epi64x(h1[3] ^ h1[7], h1[2] ^ h1[6]); |
|||
|
|||
uint64_t idx0 = h0[0] ^ h0[4]; |
|||
uint64_t idx1 = h1[0] ^ h1[4]; |
|||
|
|||
for (size_t i = 0; __builtin_expect(i < 0x80000, 1); i++) { |
|||
__m128i cx0 = _mm_load_si128((__m128i *) &l0[idx0 & 0x1FFFF0]); |
|||
__m128i cx1 = _mm_load_si128((__m128i *) &l1[idx1 & 0x1FFFF0]); |
|||
|
|||
cx0 = soft_aesenc(cx0, _mm_set_epi64x(ah0, al0)); |
|||
cx1 = soft_aesenc(cx1, _mm_set_epi64x(ah1, al1)); |
|||
|
|||
_mm_store_si128((__m128i *) &l0[idx0 & 0x1FFFF0], _mm_xor_si128(bx0, cx0)); |
|||
_mm_store_si128((__m128i *) &l1[idx1 & 0x1FFFF0], _mm_xor_si128(bx1, cx1)); |
|||
|
|||
VARIANT1_1(&l0[idx0 & 0x1FFFF0]); |
|||
VARIANT1_1(&l1[idx1 & 0x1FFFF0]); |
|||
|
|||
idx0 = EXTRACT64(cx0); |
|||
idx1 = EXTRACT64(cx1); |
|||
|
|||
bx0 = cx0; |
|||
bx1 = cx1; |
|||
|
|||
uint64_t hi, lo, cl, ch; |
|||
cl = ((uint64_t*) &l0[idx0 & 0x1FFFF0])[0]; |
|||
ch = ((uint64_t*) &l0[idx0 & 0x1FFFF0])[1]; |
|||
lo = _umul128(idx0, cl, &hi); |
|||
|
|||
al0 += hi; |
|||
ah0 += lo; |
|||
|
|||
VARIANT1_2(ah0, 0); |
|||
((uint64_t*) &l0[idx0 & 0x1FFFF0])[0] = al0; |
|||
((uint64_t*) &l0[idx0 & 0x1FFFF0])[1] = ah0; |
|||
VARIANT1_2(ah0, 0); |
|||
|
|||
ah0 ^= ch; |
|||
al0 ^= cl; |
|||
idx0 = al0; |
|||
|
|||
cl = ((uint64_t*) &l1[idx1 & 0x1FFFF0])[0]; |
|||
ch = ((uint64_t*) &l1[idx1 & 0x1FFFF0])[1]; |
|||
lo = _umul128(idx1, cl, &hi); |
|||
|
|||
al1 += hi; |
|||
ah1 += lo; |
|||
|
|||
VARIANT1_2(ah1, 1); |
|||
((uint64_t*) &l1[idx1 & 0x1FFFF0])[0] = al1; |
|||
((uint64_t*) &l1[idx1 & 0x1FFFF0])[1] = ah1; |
|||
VARIANT1_2(ah1, 1); |
|||
|
|||
ah1 ^= ch; |
|||
al1 ^= cl; |
|||
idx1 = al1; |
|||
} |
|||
|
|||
cn_implode_scratchpad((__m128i*) l0, (__m128i*) h0); |
|||
cn_implode_scratchpad((__m128i*) l1, (__m128i*) h1); |
|||
|
|||
keccakf(h0, 24); |
|||
keccakf(h1, 24); |
|||
|
|||
extra_hashes[ctx->state0[0] & 3](ctx->state0, 200, output); |
|||
extra_hashes[ctx->state1[0] & 3](ctx->state1, 200, (char*) output + 32); |
|||
} |
@ -0,0 +1,33 @@ |
|||
# Optional assembly backend.
#
# When WITH_ASM is on, the target is not ARM and pointers are 8 bytes wide, the
# cnv2_main_loop assembly source is built into the static library "xmrig-asm".
# Otherwise XMRIG_ASM_LIBRARY is left empty and XMRIG_NO_ASM is defined.
#
# Outputs: XMRIG_ASM_LIBRARY, XMRIG_ASM_SOURCES (always empty), and, when the
# library is built, XMRIG_ASM_FILE.
set(XMRIG_ASM_SOURCES "")

if (WITH_ASM AND NOT XMRIG_ARM AND CMAKE_SIZEOF_VOID_P EQUAL 8)
    set(XMRIG_ASM_LIBRARY "xmrig-asm")

    if (CMAKE_C_COMPILER_ID MATCHES MSVC)
        # MSVC: assemble with MASM; toolsets older than 141 get the win64/ copy.
        enable_language(ASM_MASM)

        if (MSVC_TOOLSET_VERSION GREATER_EQUAL 141)
            set(XMRIG_ASM_FILE "crypto/asm/cnv2_main_loop.asm")
        else()
            set(XMRIG_ASM_FILE "crypto/asm/win64/cnv2_main_loop.asm")
        endif()

        # NOTE(review): this sets a source property literally named ASM_MASM
        # with no value; `PROPERTY LANGUAGE ASM_MASM` may have been intended --
        # confirm before changing, since that would alter how the file is built.
        set_property(SOURCE ${XMRIG_ASM_FILE} PROPERTY ASM_MASM)
    else()
        # Non-MSVC: use the .S source; GNU compilers on Windows get the win64/ copy.
        enable_language(ASM)

        if (WIN32 AND CMAKE_C_COMPILER_ID MATCHES GNU)
            set(XMRIG_ASM_FILE "crypto/asm/win64/cnv2_main_loop.S")
        else()
            set(XMRIG_ASM_FILE "crypto/asm/cnv2_main_loop.S")
        endif()

        # NOTE(review): as above, this sets a valueless property named C;
        # `PROPERTY LANGUAGE C` may have been intended -- confirm.
        set_property(SOURCE ${XMRIG_ASM_FILE} PROPERTY C)
    endif()

    add_library(${XMRIG_ASM_LIBRARY} STATIC ${XMRIG_ASM_FILE})
    set_property(TARGET ${XMRIG_ASM_LIBRARY} PROPERTY LINKER_LANGUAGE C)
else()
    set(XMRIG_ASM_LIBRARY "")

    add_definitions(/DXMRIG_NO_ASM)
endif()
@ -0,0 +1,410 @@ |
|||
;# Body of the two-lane cnv2 main loop (label main_loop_double_sandybridge).
;#
;# Comment convention: every comment starts with a semicolon followed by a
;# hash sign. This fragment is textually included by a MASM file and by GNU
;# assembler files. MASM starts a comment at the semicolon; GNU as treats the
;# semicolon as a statement separator and the hash sign as the comment start.
;# Keep quotes and apostrophes out of comments, the C preprocessor sees them.
;#
;# In:   rcx = first context pointer, rdx = second context pointer.
;#       From each context the code reads qwords at offsets 0..104 and a
;#       base pointer at offset 224 (presumably hash state and scratchpad,
;#       confirm against the caller).
;# Uses: 24 bytes above the return-address slot (entry rsp + 8 .. + 31) as
;#       spill space. The including wrappers provide that area (Win64 shadow
;#       space, or the 48 bytes reserved by the GNU wrapper).
;# Exit: falls through to the label at the very end with all pushed
;#       registers, xmm6..xmm15 and MXCSR restored; the wrapper executes ret.
;#
;# Lane 0: base r13, index edx, 128-bit value in r10 (low) and r11 (high),
;#         kept at [rsp+256] and [rsp+264] while those registers are reused.
;# Lane 1: base rsi, index ecx, 128-bit value in rdi (low) and rbp (high).
;# r14d  : remaining iterations (starts at 524288).
;# xmm12 : 1023 shl 52 in both qwords, xmm14: 1 shl 52 in both qwords,
;# xmm13 : zero (copied before cvtsi2sd to break the false dependency).
	mov	rax, rsp
	push	rbx
	push	rbp
	push	rsi
	push	rdi
	push	r12
	push	r13
	push	r14
	push	r15
	sub	rsp, 184

;# Save MXCSR at entry rsp + 24 and load 24448 (0x5F80): exceptions masked,
;# rounding-control field = 10b (presumably round-up, confirm in the SDM).
	stmxcsr	DWORD PTR [rsp+272]
	mov	DWORD PTR [rsp+276], 24448
	ldmxcsr	DWORD PTR [rsp+276]

;# Load both contexts. The stores of xmm6..xmm15 are interleaved with the
;# loads; [rax-88] is the same address as [rsp+160].
	mov	r13, QWORD PTR [rcx+224]
	mov	r9, rdx
	mov	r10, QWORD PTR [rcx+32]
	mov	r8, rcx
	xor	r10, QWORD PTR [rcx]
	mov	r14d, 524288
	mov	r11, QWORD PTR [rcx+40]
	xor	r11, QWORD PTR [rcx+8]
	mov	rsi, QWORD PTR [rdx+224]
	mov	rdx, QWORD PTR [rcx+56]
	xor	rdx, QWORD PTR [rcx+24]
	mov	rdi, QWORD PTR [r9+32]
	xor	rdi, QWORD PTR [r9]
	mov	rbp, QWORD PTR [r9+40]
	xor	rbp, QWORD PTR [r9+8]
	movq	xmm0, rdx
	movaps	XMMWORD PTR [rax-88], xmm6
	movaps	XMMWORD PTR [rax-104], xmm7
	movaps	XMMWORD PTR [rax-120], xmm8
	movaps	XMMWORD PTR [rsp+112], xmm9
	movaps	XMMWORD PTR [rsp+96], xmm10
	movaps	XMMWORD PTR [rsp+80], xmm11
	movaps	XMMWORD PTR [rsp+64], xmm12
	movaps	XMMWORD PTR [rsp+48], xmm13
	movaps	XMMWORD PTR [rsp+32], xmm14
	movaps	XMMWORD PTR [rsp+16], xmm15
	mov	rdx, r10
	movq	xmm4, QWORD PTR [r8+96]
;# 2097136 = 0x1FFFF0: keeps the index 16-byte aligned and below 2 MiB.
	and	edx, 2097136
	mov	rax, QWORD PTR [rcx+48]
	xorps	xmm13, xmm13
	xor	rax, QWORD PTR [rcx+16]
	mov	rcx, QWORD PTR [rcx+88]
	xor	rcx, QWORD PTR [r8+72]
	movq	xmm5, QWORD PTR [r8+104]
	movq	xmm7, rax

;# xmm14 = 1 shl 52 in both qwords.
	mov	eax, 1
	shl	rax, 52
	movq	xmm14, rax
	punpcklqdq	xmm14, xmm14

;# xmm12 = 1023 shl 52 in both qwords.
	mov	eax, 1023
	shl	rax, 52
	movq	xmm12, rax
	punpcklqdq	xmm12, xmm12

	mov	rax, QWORD PTR [r8+80]
	xor	rax, QWORD PTR [r8+64]
	punpcklqdq	xmm7, xmm0
	movq	xmm0, rcx
	mov	rcx, QWORD PTR [r9+56]
	xor	rcx, QWORD PTR [r9+24]
	movq	xmm3, rax
	mov	rax, QWORD PTR [r9+48]
	xor	rax, QWORD PTR [r9+16]
	punpcklqdq	xmm3, xmm0
	movq	xmm0, rcx
;# [rsp] keeps the lane 0 base, r13 is temporarily reused inside the loop.
	mov	QWORD PTR [rsp], r13
	mov	rcx, QWORD PTR [r9+88]
	xor	rcx, QWORD PTR [r9+72]
	movq	xmm6, rax
	mov	rax, QWORD PTR [r9+80]
	xor	rax, QWORD PTR [r9+64]
	punpcklqdq	xmm6, xmm0
	movq	xmm0, rcx
	mov	QWORD PTR [rsp+256], r10
	mov	rcx, rdi
	mov	QWORD PTR [rsp+264], r11
	movq	xmm8, rax
	and	ecx, 2097136
	punpcklqdq	xmm8, xmm0
;# xmm4 and xmm5 pack the offset-96 and offset-104 qwords of both contexts
;# (low qword = first context, high qword = second context).
	movq	xmm0, QWORD PTR [r9+96]
	punpcklqdq	xmm4, xmm0
	movq	xmm0, QWORD PTR [r9+104]
	lea	r8, [rcx+rsi]
	movdqu	xmm11, XMMWORD PTR [r8]
	punpcklqdq	xmm5, xmm0
	lea	r9, [rdx+r13]
	movdqu	xmm15, XMMWORD PTR [r9]

	ALIGN 16
main_loop_double_sandybridge:
;# ---- lane 0: AES round and shuffle of the three neighbouring blocks ----
	movdqu	xmm9, xmm15
	mov	eax, edx
	mov	ebx, edx
	xor	eax, 16
	xor	ebx, 32
	xor	edx, 48

	movq	xmm0, r11
	movq	xmm2, r10
	punpcklqdq	xmm2, xmm0
	aesenc	xmm9, xmm2

	movdqu	xmm0, XMMWORD PTR [rax+r13]
	movdqu	xmm1, XMMWORD PTR [rbx+r13]
	paddq	xmm0, xmm7
	paddq	xmm1, xmm2
	movdqu	XMMWORD PTR [rbx+r13], xmm0
	movdqu	xmm0, XMMWORD PTR [rdx+r13]
	movdqu	XMMWORD PTR [rdx+r13], xmm1
	paddq	xmm0, xmm3
	movdqu	XMMWORD PTR [rax+r13], xmm0

;# Next lane 0 index comes from the low dword of the AES result.
	movq	r11, xmm9
	mov	edx, r11d
	and	edx, 2097136
	movdqa	xmm0, xmm9
	pxor	xmm0, xmm7
	movdqu	XMMWORD PTR [r9], xmm0

	lea	rbx, [rdx+r13]
	mov	r10, QWORD PTR [rdx+r13]

;# ---- lane 1: AES round and shuffle of the three neighbouring blocks ----
	movdqu	xmm10, xmm11
	movq	xmm0, rbp
	movq	xmm11, rdi
	punpcklqdq	xmm11, xmm0
	aesenc	xmm10, xmm11

	mov	eax, ecx
	mov	r12d, ecx
	xor	eax, 16
	xor	r12d, 32
	xor	ecx, 48

	movdqu	xmm0, XMMWORD PTR [rax+rsi]
	paddq	xmm0, xmm6
	movdqu	xmm1, XMMWORD PTR [r12+rsi]
	movdqu	XMMWORD PTR [r12+rsi], xmm0
	paddq	xmm1, xmm11
	movdqu	xmm0, XMMWORD PTR [rcx+rsi]
	movdqu	XMMWORD PTR [rcx+rsi], xmm1
	paddq	xmm0, xmm8
	movdqu	XMMWORD PTR [rax+rsi], xmm0

	movq	rcx, xmm10
	and	ecx, 2097136

	movdqa	xmm0, xmm10
	pxor	xmm0, xmm6
	movdqu	XMMWORD PTR [r8], xmm0
	mov	r12, QWORD PTR [rcx+rsi]

	mov	r9, QWORD PTR [rbx+8]

;# ---- lane 0: 64x64 multiply and second shuffle ----
	xor	edx, 16
	mov	r8d, edx
	mov	r15d, edx

;# Mix the previous sqrt (shifted left by 32) and division results into the
;# loaded qword before it is used as the multiplier.
	movq	rdx, xmm5
	shl	rdx, 32
	movq	rax, xmm4
	xor	rdx, rax
	xor	r10, rdx
	mov	rax, r10
	mul	r11
	mov	r11d, r8d
	xor	r11d, 48
	movq	xmm0, rdx
	xor	rdx, [r11+r13]
	movq	xmm1, rax
	xor	rax, [r11+r13+8]
	punpcklqdq	xmm0, xmm1

;# pxor with a memory operand needs a 16-byte aligned address here.
	pxor	xmm0, XMMWORD PTR [r8+r13]
	xor	r8d, 32
	movdqu	xmm1, XMMWORD PTR [r11+r13]
	paddq	xmm0, xmm7
	paddq	xmm1, xmm2
	movdqu	XMMWORD PTR [r11+r13], xmm0
	movdqu	xmm0, XMMWORD PTR [r8+r13]
	movdqu	XMMWORD PTR [r8+r13], xmm1
	paddq	xmm0, xmm3
	movdqu	XMMWORD PTR [r15+r13], xmm0

;# Update the lane 0 value, write it back and derive the next index, which
;# is parked in [rsp+8] until the end of the iteration.
	mov	r11, QWORD PTR [rsp+256]
	add	r11, rdx
	mov	rdx, QWORD PTR [rsp+264]
	add	rdx, rax
	mov	QWORD PTR [rbx], r11
	xor	r11, r10
	mov	QWORD PTR [rbx+8], rdx
	xor	rdx, r9
	mov	QWORD PTR [rsp+256], r11
	and	r11d, 2097136
	mov	QWORD PTR [rsp+264], rdx
	mov	QWORD PTR [rsp+8], r11
	lea	r15, [r11+r13]
	movdqu	xmm15, XMMWORD PTR [r11+r13]
;# From here r13 addresses the lane 1 block until it is reloaded from [rsp].
	lea	r13, [rsi+rcx]

;# ---- both lanes: integer division done with divpd, then fixed up ----
	movdqa	xmm0, xmm5
	psrldq	xmm0, 8
	movaps	xmm2, xmm13
	movq	r10, xmm0
	psllq	xmm5, 1
	shl	r10, 32
	movdqa	xmm0, xmm9
	psrldq	xmm0, 8
	movdqa	xmm1, xmm10
	movq	r11, xmm0
	psrldq	xmm1, 8
	movq	r8, xmm1
	psrldq	xmm4, 8
	movaps	xmm0, xmm13
	movq	rax, xmm4
	xor	r10, rax
	movaps	xmm1, xmm13
	xor	r10, r12
	lea	rax, [r11+1]
	shr	rax, 1
	movdqa	xmm3, xmm9
	punpcklqdq	xmm3, xmm10
	paddq	xmm5, xmm3
	movq	rdx, xmm5
	psrldq	xmm5, 8
	cvtsi2sd	xmm2, rax
;# -2147483647 = 0x80000001: forces the 32-bit divisor odd with bit 31 set.
	or	edx, -2147483647
	lea	rax, [r8+1]
	shr	rax, 1
	movq	r9, xmm5
	cvtsi2sd	xmm0, rax
	or	r9d, -2147483647
	cvtsi2sd	xmm1, rdx
	unpcklpd	xmm2, xmm0
	movaps	xmm0, xmm13
	cvtsi2sd	xmm0, r9
	unpcklpd	xmm1, xmm0
	divpd	xmm2, xmm1
	paddq	xmm2, xmm14
	cvttsd2si	rax, xmm2
	psrldq	xmm2, 8
	mov	rbx, rax
	imul	rax, rdx
	sub	r11, rax
;# A negative remainder means the quotient estimate is one too large.
	js	div_fix_1_sandybridge
div_fix_1_ret_sandybridge:

	cvttsd2si	rdx, xmm2
	mov	rax, rdx
	imul	rax, r9
	movd	xmm2, r11d
	movd	xmm4, ebx
	sub	r8, rax
	js	div_fix_2_sandybridge
div_fix_2_ret_sandybridge:

;# xmm4 = quotient and remainder dwords of both lanes, interleaved.
	movd	xmm1, r8d
	movd	xmm0, edx
	punpckldq	xmm2, xmm1
	punpckldq	xmm4, xmm0
	punpckldq	xmm4, xmm2

;# ---- both lanes: square root done with sqrtpd, then fixed up ----
	paddq	xmm3, xmm4
	movdqa	xmm0, xmm3
	psrlq	xmm0, 12
	paddq	xmm0, xmm12
	sqrtpd	xmm1, xmm0
	movq	r9, xmm1
	movdqa	xmm5, xmm1
	psrlq	xmm5, 19
;# 524287 = 0x7FFFF: when the low 19 bits are all zero the result takes the
;# exact-correction path below.
	test	r9, 524287
	je	sqrt_fix_1_sandybridge
sqrt_fix_1_ret_sandybridge:

	movq	r9, xmm10
	psrldq	xmm1, 8
	movq	r8, xmm1
	test	r8, 524287
	je	sqrt_fix_2_sandybridge
sqrt_fix_2_ret_sandybridge:

;# ---- lane 1: 64x64 multiply and second shuffle ----
	mov	r12d, ecx
	mov	r8d, ecx
	xor	r12d, 16
	xor	r8d, 32
	xor	ecx, 48
	mov	rax, r10
	mul	r9
	movq	xmm0, rax
	movq	xmm3, rdx
	punpcklqdq	xmm3, xmm0

	movdqu	xmm0, XMMWORD PTR [r12+rsi]
	pxor	xmm0, xmm3
	movdqu	xmm1, XMMWORD PTR [r8+rsi]
	xor	rdx, [r8+rsi]
	xor	rax, [r8+rsi+8]
	movdqu	xmm3, XMMWORD PTR [rcx+rsi]
	paddq	xmm0, xmm6
	paddq	xmm1, xmm11
	paddq	xmm3, xmm8
	movdqu	XMMWORD PTR [r8+rsi], xmm0
	movdqu	XMMWORD PTR [rcx+rsi], xmm1
	movdqu	XMMWORD PTR [r12+rsi], xmm3

;# Update the lane 1 value, write it back and derive the next index.
	add	rdi, rdx
	mov	QWORD PTR [r13], rdi
	xor	rdi, r10
	mov	ecx, edi
	and	ecx, 2097136
	lea	r8, [rcx+rsi]

	mov	rdx, QWORD PTR [r13+8]
	add	rbp, rax
	mov	QWORD PTR [r13+8], rbp
	movdqu	xmm11, XMMWORD PTR [rcx+rsi]
	xor	rbp, rdx

;# Restore the lane 0 registers and rotate the per-lane history registers
;# (xmm7 to xmm3, xmm9 to xmm7, xmm6 to xmm8, xmm10 to xmm6).
	mov	r13, QWORD PTR [rsp]
	movdqa	xmm3, xmm7
	mov	rdx, QWORD PTR [rsp+8]
	movdqa	xmm8, xmm6
	mov	r10, QWORD PTR [rsp+256]
	movdqa	xmm7, xmm9
	mov	r11, QWORD PTR [rsp+264]
	movdqa	xmm6, xmm10
	mov	r9, r15
	dec	r14d
	jne	main_loop_double_sandybridge

;# ---- epilogue: restore MXCSR, xmm6..xmm15 and the pushed registers ----
	ldmxcsr	DWORD PTR [rsp+272]
	movaps	xmm13, XMMWORD PTR [rsp+48]
	lea	r11, [rsp+184]
	movaps	xmm6, XMMWORD PTR [r11-24]
	movaps	xmm7, XMMWORD PTR [r11-40]
	movaps	xmm8, XMMWORD PTR [r11-56]
	movaps	xmm9, XMMWORD PTR [r11-72]
	movaps	xmm10, XMMWORD PTR [r11-88]
	movaps	xmm11, XMMWORD PTR [r11-104]
	movaps	xmm12, XMMWORD PTR [r11-120]
	movaps	xmm14, XMMWORD PTR [rsp+32]
	movaps	xmm15, XMMWORD PTR [rsp+16]
	mov	rsp, r11
	pop	r15
	pop	r14
	pop	r13
	pop	r12
	pop	rdi
	pop	rsi
	pop	rbp
	pop	rbx
;# Skip the out-of-line fix-up code; the wrapper continues after the end label.
	jmp	cnv2_double_mainloop_asm_sandybridge_endp

;# Lane 0 quotient was one too large: decrement it and add the divisor back.
div_fix_1_sandybridge:
	dec	rbx
	add	r11, rdx
	jmp	div_fix_1_ret_sandybridge

;# Same correction for lane 1.
div_fix_2_sandybridge:
	dec	rdx
	add	r8, r9
	jmp	div_fix_2_ret_sandybridge

;# Lane 0 sqrt correction: recompute the result from the raw double bits in
;# r9 with integer arithmetic and adjust it by the carry of the comparison
;# against the sqrt input (low qword of xmm3). The lane 1 half of xmm5 is
;# preserved through xmm0.
sqrt_fix_1_sandybridge:
	movq	r8, xmm3
	movdqa	xmm0, xmm5
	psrldq	xmm0, 8
	dec	r9
	mov	r11d, -1022
	shl	r11, 32
	mov	rax, r9
	shr	r9, 19
	shr	rax, 20
	mov	rdx, r9
	sub	rdx, rax
	lea	rdx, [rdx+r11+1]
	add	rax, r11
	imul	rdx, rax
	sub	rdx, r8
	adc	r9, 0
	movq	xmm5, r9
	punpcklqdq	xmm5, xmm0
	jmp	sqrt_fix_1_ret_sandybridge

;# Lane 1 sqrt correction, same computation on r8 against the high qword of
;# xmm3; the result replaces the high qword of xmm5.
sqrt_fix_2_sandybridge:
	psrldq	xmm3, 8
	movq	r11, xmm3
	dec	r8
	mov	ebx, -1022
	shl	rbx, 32
	mov	rax, r8
	shr	r8, 19
	shr	rax, 20
	mov	rdx, r8
	sub	rdx, rax
	lea	rdx, [rdx+rbx+1]
	add	rax, rbx
	imul	rdx, rax
	sub	rdx, r11
	adc	r8, 0
	movq	xmm0, r8
	punpcklqdq	xmm5, xmm0
	jmp	sqrt_fix_2_ret_sandybridge

cnv2_double_mainloop_asm_sandybridge_endp:
@ -0,0 +1,37 @@ |
|||
/*
 * GNU assembler (Intel syntax) entry points for the cnv2 main loops on
 * non-Windows x86-64 targets.
 *
 * The loop bodies live in shared .inc fragments that take their arguments
 * in rcx / rdx and use up to 32 bytes above the return-address slot as
 * scratch space. Each wrapper below therefore
 *   - reserves 48 bytes, so the fragment sees the stack layout it expects
 *     (rsp stays at 8 modulo 16, exactly as at function entry), and
 *   - copies the incoming rdi / rsi arguments into rcx / rdx.
 *
 * ALIGN is used as "ALIGN 16" both here and inside the fragments. It maps to
 * .balign, which always counts bytes; plain .align takes a power-of-two
 * exponent on Mach-O, where "ALIGN 16" would request 2^16-byte alignment.
 */
#define ALIGN .balign
.intel_syntax noprefix
#ifdef __APPLE__
#   define FN_PREFIX(fn) _ ## fn
.text
#else
#   define FN_PREFIX(fn) fn
.section .text
#endif
.global FN_PREFIX(cnv2_mainloop_ivybridge_asm)
.global FN_PREFIX(cnv2_mainloop_ryzen_asm)
.global FN_PREFIX(cnv2_double_mainloop_sandybridge_asm)

/* Single-hash loop, Ivy Bridge tuning. In: rdi = context pointer. */
ALIGN 16
FN_PREFIX(cnv2_mainloop_ivybridge_asm):
	sub rsp, 48
	mov rcx, rdi
	#include "cnv2_main_loop_ivybridge.inc"
	add rsp, 48
	ret 0

/* Single-hash loop, Ryzen tuning. In: rdi = context pointer. */
ALIGN 16
FN_PREFIX(cnv2_mainloop_ryzen_asm):
	sub rsp, 48
	mov rcx, rdi
	#include "cnv2_main_loop_ryzen.inc"
	add rsp, 48
	ret 0

/* Double-hash loop. In: rdi, rsi = the two context pointers. */
ALIGN 16
FN_PREFIX(cnv2_double_mainloop_sandybridge_asm):
	sub rsp, 48
	mov rcx, rdi
	mov rdx, rsi
	#include "cnv2_double_main_loop_sandybridge.inc"
	add rsp, 48
	ret 0

/*
 * Mark the object as not needing an executable stack. Without this note the
 * GNU linker may fall back to an executable stack for the whole binary.
 */
#if defined(__linux__) && defined(__ELF__)
.section .note.GNU-stack,"",%progbits
#endif
@ -0,0 +1,25 @@ |
|||
; MASM entry points for the cnv2 main loops.
;
; Each procedure is just the matching .inc fragment followed by ret. The
; fragments expect their arguments in rcx (and rdx for the double loop) and
; perform their own register save/restore.
_TEXT_CNV2_MAINLOOP SEGMENT PAGE READ EXECUTE

PUBLIC cnv2_mainloop_ivybridge_asm, cnv2_mainloop_ryzen_asm, cnv2_double_mainloop_sandybridge_asm

; Single-hash loop, Ivy Bridge tuning.
ALIGN 64
cnv2_mainloop_ivybridge_asm PROC
	INCLUDE cnv2_main_loop_ivybridge.inc
	ret 0
cnv2_mainloop_ivybridge_asm ENDP

; Single-hash loop, Ryzen tuning.
ALIGN 64
cnv2_mainloop_ryzen_asm PROC
	INCLUDE cnv2_main_loop_ryzen.inc
	ret 0
cnv2_mainloop_ryzen_asm ENDP

; Double-hash loop.
ALIGN 64
cnv2_double_mainloop_sandybridge_asm PROC
	INCLUDE cnv2_double_main_loop_sandybridge.inc
	ret 0
cnv2_double_mainloop_sandybridge_asm ENDP

_TEXT_CNV2_MAINLOOP ENDS
END
@ -0,0 +1,186 @@ |
|||
;# Body of the single-hash cnv2 main loop (label main_loop_ivybridge).
;#
;# Comment convention: every comment starts with a semicolon followed by a
;# hash sign. This fragment is textually included by a MASM file and by GNU
;# assembler files. MASM starts a comment at the semicolon; GNU as treats the
;# semicolon as a statement separator and the hash sign as the comment start.
;# Keep quotes and apostrophes out of comments, the C preprocessor sees them.
;#
;# In:   rcx = context pointer. Qwords at offsets 0..104 and a base pointer
;#       at offset 224 are read (presumably hash state and scratchpad,
;#       confirm against the caller).
;# Uses: the qword at entry rsp + 24 to save rbx. The including wrappers
;#       provide that slot (Win64 shadow space, or the 48 bytes reserved by
;#       the GNU wrapper).
;# Exit: falls through to the label at the very end with all saved
;#       registers, xmm6..xmm8 and MXCSR restored; the wrapper executes ret.
;#
;# rbx      : base pointer from [rcx+224]
;# rsi      : remaining iterations (starts at 524288)
;# r8, r11  : low and high qword of the running 128-bit value
;# r10d     : current index, always masked with 0x1FFFF0
;# xmm6     : 16-byte block at the current index
;# xmm4/5   : results of the previous two AES rounds (initially from context)
;# r15      : previous division result, xmm3: previous sqrt result
;# xmm8     : constant 1023 shl 52
;# r13d     : constant 0x80000001 (restored after the sqrt fix-up path)
	mov	QWORD PTR [rsp+24], rbx
	push	rbp
	push	rsi
	push	rdi
	push	r12
	push	r13
	push	r14
	push	r15
	sub	rsp, 80

;# Save MXCSR at [rsp] and load 24448 (0x5F80): exceptions masked,
;# rounding-control field = 10b (presumably round-up, confirm in the SDM).
	stmxcsr	DWORD PTR [rsp]
	mov	DWORD PTR [rsp+4], 24448
	ldmxcsr	DWORD PTR [rsp+4]

	mov	rax, QWORD PTR [rcx+48]
	mov	r9, rcx
	xor	rax, QWORD PTR [rcx+16]
	mov	esi, 524288
	mov	r8, QWORD PTR [rcx+32]
	mov	r13d, -2147483647
	xor	r8, QWORD PTR [rcx]
	mov	r11, QWORD PTR [rcx+40]
	mov	r10, r8
	mov	rdx, QWORD PTR [rcx+56]
	movq	xmm4, rax
	xor	rdx, QWORD PTR [rcx+24]
	xor	r11, QWORD PTR [rcx+8]
	mov	rbx, QWORD PTR [rcx+224]
	mov	rax, QWORD PTR [r9+80]
	xor	rax, QWORD PTR [r9+64]
	movq	xmm0, rdx
	mov	rcx, QWORD PTR [rcx+88]
	xor	rcx, QWORD PTR [r9+72]
	movq	xmm3, QWORD PTR [r9+104]
	movaps	XMMWORD PTR [rsp+64], xmm6
	movaps	XMMWORD PTR [rsp+48], xmm7
	movaps	XMMWORD PTR [rsp+32], xmm8
;# 2097136 = 0x1FFFF0: keeps the index 16-byte aligned and below 2 MiB.
	and	r10d, 2097136
	movq	xmm5, rax

;# Zero qword at [rsp+16], subtracted from the sqrt result inside the loop.
	xor	eax, eax
	mov	QWORD PTR [rsp+16], rax

;# xmm8 = 1023 shl 52. Full 32-bit write instead of the former 16-bit
;# "mov ax": no operand-size prefix and no dependence on the old rax value.
	mov	eax, 1023
	shl	rax, 52
	movq	xmm8, rax
	mov	r15, QWORD PTR [r9+96]
	punpcklqdq	xmm4, xmm0
	movq	xmm0, rcx
	punpcklqdq	xmm5, xmm0
	movdqu	xmm6, XMMWORD PTR [r10+rbx]

	ALIGN 16
main_loop_ivybridge:
;# ---- AES round and shuffle of the three neighbouring 16-byte blocks ----
	lea	rdx, QWORD PTR [r10+rbx]
	mov	ecx, r10d
	mov	eax, r10d
	mov	rdi, r15
	xor	ecx, 16
	xor	eax, 32
	xor	r10d, 48
	movq	xmm0, r11
	movq	xmm7, r8
	punpcklqdq	xmm7, xmm0
	aesenc	xmm6, xmm7
	movq	rbp, xmm6
	mov	r9, rbp
	and	r9d, 2097136
	movdqu	xmm2, XMMWORD PTR [rcx+rbx]
	movdqu	xmm1, XMMWORD PTR [rax+rbx]
	movdqu	xmm0, XMMWORD PTR [r10+rbx]
	paddq	xmm1, xmm7
	paddq	xmm0, xmm5
	paddq	xmm2, xmm4
	movdqu	XMMWORD PTR [rcx+rbx], xmm0
	movdqu	XMMWORD PTR [rax+rbx], xmm2
	movdqu	XMMWORD PTR [r10+rbx], xmm1
	mov	r10, r9
	xor	r10d, 32

;# rdi = previous division result xor (previous sqrt shl 32) xor loaded qword.
	movq	rcx, xmm3
	mov	rax, rcx
	shl	rax, 32
	xor	rdi, rax
	movdqa	xmm0, xmm6
	pxor	xmm0, xmm4
	movdqu	XMMWORD PTR [rdx], xmm0
	xor	rdi, QWORD PTR [r9+rbx]
	lea	r14, QWORD PTR [r9+rbx]
	mov	r12, QWORD PTR [r14+8]

;# ---- integer division: high qword of xmm6 divided by a 32-bit divisor ----
	xor	edx, edx
;# r9d = 2 * previous sqrt (low 32 bits). 64-bit address registers replace
;# the former [ecx+ecx] form; the low 32 bits of the sum are identical.
	lea	r9d, [rcx+rcx]
	add	r9d, ebp
	movdqa	xmm0, xmm6
	psrldq	xmm0, 8
;# r13d = 0x80000001 makes the divisor odd with bit 31 set, so never zero.
	or	r9d, r13d
	movq	rax, xmm0
	div	r9
	xorps	xmm3, xmm3
;# rdx = (remainder shl 32) + low 32 bits of the quotient.
	mov	eax, eax
	shl	rdx, 32
	add	rdx, rax
	lea	r9, QWORD PTR [rdx+rbp]
	mov	r15, rdx

;# ---- square root of r9, computed with sqrtsd on a biased exponent ----
	mov	rax, r9
	shr	rax, 12
	movq	xmm0, rax
	paddq	xmm0, xmm8
	sqrtsd	xmm3, xmm0
;# Only the low qword at [rsp+16] was zeroed above; the high qword of this
;# operand is whatever the stack holds. Only the low qword of xmm3 is read
;# afterwards in this fragment (movq below and at the top of the loop).
	psubq	xmm3, XMMWORD PTR [rsp+16]
	movq	rdx, xmm3
;# 524287 = 0x7FFFF: when the low 19 bits are all zero take the exact
;# correction path, otherwise the result is simply shifted right by 19.
	test	edx, 524287
	je	sqrt_fixup_ivybridge
	psrlq	xmm3, 19
sqrt_fixup_ivybridge_ret:

;# ---- 64x64 multiply, accumulate into r8 / r11 and write back ----
	mov	ecx, r10d
	mov	rax, rdi
	mul	rbp
	movq	xmm2, rdx
	xor	rdx, [rcx+rbx]
	add	r8, rdx
	mov	QWORD PTR [r14], r8
	xor	r8, rdi
	mov	edi, r8d
	and	edi, 2097136
	movq	xmm0, rax
	xor	rax, [rcx+rbx+8]
	add	r11, rax
	mov	QWORD PTR [r14+8], r11
	punpcklqdq	xmm2, xmm0

;# ---- second shuffle of the neighbouring blocks ----
	mov	r9d, r10d
	xor	r9d, 48
	xor	r10d, 16
;# pxor with a memory operand needs a 16-byte aligned address here.
	pxor	xmm2, XMMWORD PTR [r9+rbx]
	movdqu	xmm0, XMMWORD PTR [r10+rbx]
	paddq	xmm0, xmm5
	movdqu	xmm1, XMMWORD PTR [rcx+rbx]
	paddq	xmm2, xmm4
	paddq	xmm1, xmm7
	movdqa	xmm5, xmm4
	movdqu	XMMWORD PTR [r9+rbx], xmm0
	movdqa	xmm4, xmm6
	movdqu	XMMWORD PTR [rcx+rbx], xmm2
	movdqu	XMMWORD PTR [r10+rbx], xmm1
;# Load the block for the next iteration and finish the r11 update.
	movdqu	xmm6, [rdi+rbx]
	mov	r10d, edi
	xor	r11, r12
	dec	rsi
	jne	main_loop_ivybridge

;# ---- epilogue: [rsp+160] is the entry rsp + 24 slot that holds rbx ----
	ldmxcsr	DWORD PTR [rsp]
	mov	rbx, QWORD PTR [rsp+160]
	movaps	xmm6, XMMWORD PTR [rsp+64]
	movaps	xmm7, XMMWORD PTR [rsp+48]
	movaps	xmm8, XMMWORD PTR [rsp+32]
	add	rsp, 80
	pop	r15
	pop	r14
	pop	r13
	pop	r12
	pop	rdi
	pop	rsi
	pop	rbp
;# Skip the out-of-line fix-up code; the wrapper continues after the end label.
	jmp	cnv2_main_loop_ivybridge_endp

;# Sqrt correction: recompute the result from the raw double bits in rdx
;# with integer arithmetic and adjust it by the carry of the comparison
;# against the sqrt input in r9. r13 is borrowed as a temporary and reloaded
;# with 0x80000001 before returning to the loop.
sqrt_fixup_ivybridge:
	dec	rdx
	mov	r13d, -1022
	shl	r13, 32
	mov	rax, rdx
	shr	rdx, 19
	shr	rax, 20
	mov	rcx, rdx
	sub	rcx, rax
	add	rax, r13
	not	r13
	sub	rcx, r13
	mov	r13d, -2147483647
	imul	rcx, rax
	sub	rcx, r9
	adc	rdx, 0
	movq	xmm3, rdx
	jmp	sqrt_fixup_ivybridge_ret

cnv2_main_loop_ivybridge_endp:
@ -0,0 +1,179 @@ |
|||
;# Body of the single-hash cnv2 main loop (label main_loop_ryzen).
;#
;# Comment convention: every comment starts with a semicolon followed by a
;# hash sign. This fragment is textually included by a MASM file and by GNU
;# assembler files. MASM starts a comment at the semicolon; GNU as treats the
;# semicolon as a statement separator and the hash sign as the comment start.
;# Keep quotes and apostrophes out of comments, the C preprocessor sees them.
;#
;# In:   rcx = context pointer. Qwords at offsets 0..104 and a base pointer
;#       at offset 224 are read (presumably hash state and scratchpad,
;#       confirm against the caller).
;# Uses: the qwords at entry rsp + 16, + 24 and + 32 to save rbx, rbp and
;#       rsi. The including wrappers provide those slots (Win64 shadow
;#       space, or the 48 bytes reserved by the GNU wrapper).
;# Exit: falls through to the label at the very end with all saved
;#       registers, xmm6..xmm8 and MXCSR restored; the wrapper executes ret.
;# Note: all block accesses use movdqa, so the base pointer must be 16-byte
;#       aligned (indices are always masked to a multiple of 16).
;#
;# rbx      : base pointer from [rcx+224]
;# ebp      : remaining iterations (starts at 524288)
;# r8, r11  : low and high qword of the running 128-bit value
;# r10d     : current index, always masked with 0x1FFFF0
;# xmm3/4   : results of the previous two AES rounds (initially from context)
;# r15      : previous division result, rdi: previous sqrt result
;# xmm7     : constant 1023 shl 52, xmm8: zero
	mov	QWORD PTR [rsp+16], rbx
	mov	QWORD PTR [rsp+24], rbp
	mov	QWORD PTR [rsp+32], rsi
	push	rdi
	push	r12
	push	r13
	push	r14
	push	r15
	sub	rsp, 64

;# Save MXCSR at [rsp] and load 24448 (0x5F80): exceptions masked,
;# rounding-control field = 10b (presumably round-up, confirm in the SDM).
	stmxcsr	DWORD PTR [rsp]
	mov	DWORD PTR [rsp+4], 24448
	ldmxcsr	DWORD PTR [rsp+4]

	mov	rax, QWORD PTR [rcx+48]
	mov	r9, rcx
	xor	rax, QWORD PTR [rcx+16]
	mov	ebp, 524288
	mov	r8, QWORD PTR [rcx+32]
	xor	r8, QWORD PTR [rcx]
	mov	r11, QWORD PTR [rcx+40]
	mov	r10, r8
	mov	rdx, QWORD PTR [rcx+56]
	movq	xmm3, rax
	xor	rdx, QWORD PTR [rcx+24]
	xor	r11, QWORD PTR [rcx+8]
	mov	rbx, QWORD PTR [rcx+224]
	mov	rax, QWORD PTR [r9+80]
	xor	rax, QWORD PTR [r9+64]
	movq	xmm0, rdx
	mov	rcx, QWORD PTR [rcx+88]
	xor	rcx, QWORD PTR [r9+72]
	mov	rdi, QWORD PTR [r9+104]
;# 2097136 = 0x1FFFF0: keeps the index 16-byte aligned and below 2 MiB.
	and	r10d, 2097136
	movaps	XMMWORD PTR [rsp+48], xmm6
	movq	xmm4, rax
	movaps	XMMWORD PTR [rsp+32], xmm7
	movaps	XMMWORD PTR [rsp+16], xmm8
	xorps	xmm8, xmm8
;# xmm7 = 1023 shl 52. rax still holds context data at this point; the
;# former 16-bit "mov ax" only worked because the shift discards every bit
;# above bit 11. The 32-bit write clears the stale bits explicitly.
	mov	eax, 1023
	shl	rax, 52
	movq	xmm7, rax
	mov	r15, QWORD PTR [r9+96]
	punpcklqdq	xmm3, xmm0
	movq	xmm0, rcx
	punpcklqdq	xmm4, xmm0

	ALIGN 16
main_loop_ryzen:
;# ---- AES round and shuffle of the three neighbouring 16-byte blocks ----
	movdqa	xmm5, XMMWORD PTR [r10+rbx]
	movq	xmm0, r11
	movq	xmm6, r8
	punpcklqdq	xmm6, xmm0
	lea	rdx, QWORD PTR [r10+rbx]
;# r9 = 2 * previous sqrt, rdi = previous sqrt shl 32.
	lea	r9, QWORD PTR [rdi+rdi]
	shl	rdi, 32

	mov	ecx, r10d
	mov	eax, r10d
	xor	ecx, 16
	xor	eax, 32
	xor	r10d, 48
	aesenc	xmm5, xmm6
	movdqa	xmm2, XMMWORD PTR [rcx+rbx]
	movdqa	xmm1, XMMWORD PTR [rax+rbx]
	movdqa	xmm0, XMMWORD PTR [r10+rbx]
	paddq	xmm2, xmm3
	paddq	xmm1, xmm6
	paddq	xmm0, xmm4
	movdqa	XMMWORD PTR [rcx+rbx], xmm0
	movdqa	XMMWORD PTR [rax+rbx], xmm2
	movdqa	XMMWORD PTR [r10+rbx], xmm1

;# rsi = previous division result xor (previous sqrt shl 32) xor loaded qword.
	movaps	xmm1, xmm8
	mov	rsi, r15
	xor	rsi, rdi
	movq	r14, xmm5
	movdqa	xmm0, xmm5
	pxor	xmm0, xmm3
	mov	r10, r14
	and	r10d, 2097136
	movdqa	XMMWORD PTR [rdx], xmm0
	xor	rsi, QWORD PTR [r10+rbx]
	lea	r12, QWORD PTR [r10+rbx]
	mov	r13, QWORD PTR [r10+rbx+8]

;# ---- integer division: high qword of xmm5 divided by a 32-bit divisor ----
	add	r9d, r14d
;# -2147483647 = 0x80000001: divisor is odd with bit 31 set, so never zero.
	or	r9d, -2147483647
	xor	edx, edx
	movdqa	xmm0, xmm5
	psrldq	xmm0, 8
	movq	rax, xmm0

	div	r9
;# r15 = low dword of the quotient with the low dword of the remainder above.
	movq	xmm0, rax
	movq	xmm1, rdx
	punpckldq	xmm0, xmm1
	movq	r15, xmm0

;# ---- square root, computed with sqrtsd on a biased exponent ----
	paddq	xmm0, xmm5
	movdqa	xmm2, xmm0
	psrlq	xmm0, 12
	paddq	xmm0, xmm7
	sqrtsd	xmm1, xmm0
	movq	rdi, xmm1
;# 524287 = 0x7FFFF: when the low 19 bits are all zero take the exact
;# correction path, otherwise the result is simply shifted right by 19.
	test	rdi, 524287
	je	sqrt_fixup_ryzen
	shr	rdi, 19

sqrt_fixup_ryzen_ret:
;# ---- 64x64 multiply and second shuffle of the neighbouring blocks ----
	mov	rax, rsi
	mul	r14
	movq	xmm1, rax
	movq	xmm0, rdx
	punpcklqdq	xmm0, xmm1

	mov	r9d, r10d
	mov	ecx, r10d
	xor	r9d, 16
	xor	ecx, 32
	xor	r10d, 48
	movdqa	xmm1, XMMWORD PTR [rcx+rbx]
	xor	rdx, [rcx+rbx]
	xor	rax, [rcx+rbx+8]
	movdqa	xmm2, XMMWORD PTR [r9+rbx]
	pxor	xmm2, xmm0
	paddq	xmm4, XMMWORD PTR [r10+rbx]
	paddq	xmm2, xmm3
	paddq	xmm1, xmm6
	movdqa	XMMWORD PTR [r9+rbx], xmm4
	movdqa	XMMWORD PTR [rcx+rbx], xmm2
	movdqa	XMMWORD PTR [r10+rbx], xmm1

;# Accumulate into r8 / r11, write back and derive the next index.
	movdqa	xmm4, xmm3
	add	r8, rdx
	add	r11, rax
	mov	QWORD PTR [r12], r8
	xor	r8, rsi
	mov	QWORD PTR [r12+8], r11
	mov	r10, r8
	xor	r11, r13
	and	r10d, 2097136
	movdqa	xmm3, xmm5
	dec	ebp
	jne	main_loop_ryzen

;# ---- epilogue: r11 = rsp after the pushes; r11 + 56, + 64 and + 72 are
;# the entry rsp + 16, + 24 and + 32 slots holding rbx, rbp and rsi ----
	ldmxcsr	DWORD PTR [rsp]
	movaps	xmm6, XMMWORD PTR [rsp+48]
	lea	r11, QWORD PTR [rsp+64]
	mov	rbx, QWORD PTR [r11+56]
	mov	rbp, QWORD PTR [r11+64]
	mov	rsi, QWORD PTR [r11+72]
	movaps	xmm8, XMMWORD PTR [r11-48]
	movaps	xmm7, XMMWORD PTR [rsp+32]
	mov	rsp, r11
	pop	r15
	pop	r14
	pop	r13
	pop	r12
	pop	rdi
;# Skip the out-of-line fix-up code; the wrapper continues after the end label.
	jmp	cnv2_main_loop_ryzen_endp

;# Sqrt correction: recompute the result from the raw double bits in rdi
;# with integer arithmetic and adjust it by the carry of the comparison
;# against the sqrt input (low qword of xmm2).
sqrt_fixup_ryzen:
	movq	r9, xmm2
	dec	rdi
	mov	edx, -1022
	shl	rdx, 32
	mov	rax, rdi
	shr	rdi, 19
	shr	rax, 20
	mov	rcx, rdi
	sub	rcx, rax
	lea	rcx, [rcx+rdx+1]
	add	rax, rdx
	imul	rcx, rax
	sub	rcx, r9
	adc	rdi, 0
	jmp	sqrt_fixup_ryzen_ret

cnv2_main_loop_ryzen_endp:
@ -0,0 +1,21 @@ |
|||
/*
 * GNU assembler (Intel syntax) entry points for the cnv2 main loops, win64
 * variant. The shared .inc fragments from the parent directory already
 * take their arguments in rcx / rdx, so each entry point is just the
 * fragment followed by ret.
 */
#define ALIGN .align
.intel_syntax noprefix
.section .text

.globl cnv2_mainloop_ivybridge_asm
.globl cnv2_mainloop_ryzen_asm
.globl cnv2_double_mainloop_sandybridge_asm

/* Single-hash loop, Ivy Bridge tuning. */
ALIGN 16
cnv2_mainloop_ivybridge_asm:
	#include "../cnv2_main_loop_ivybridge.inc"
	ret 0

/* Single-hash loop, Ryzen tuning. */
ALIGN 16
cnv2_mainloop_ryzen_asm:
	#include "../cnv2_main_loop_ryzen.inc"
	ret 0

/* Double-hash loop. */
ALIGN 16
cnv2_double_mainloop_sandybridge_asm:
	#include "../cnv2_double_main_loop_sandybridge.inc"
	ret 0
Loading…
Reference in new issue