Compare commits
28 Commits
master
...
classic-de
Author | SHA1 | Date |
---|---|---|
![]() |
d92c1a54de | 5 years ago |
![]() |
aa474fa51b | 5 years ago |
![]() |
7976059367 | 5 years ago |
![]() |
c5cbd9d8fe | 5 years ago |
![]() |
ef2e8bed6e | 5 years ago |
![]() |
7574bfab60 | 5 years ago |
![]() |
27980f24f8 | 5 years ago |
![]() |
5e6a69e16f | 5 years ago |
![]() |
69513e7049 | 5 years ago |
![]() |
b834c50aba | 6 years ago |
![]() |
302ebe5a5b | 6 years ago |
![]() |
b9096f2392 | 6 years ago |
![]() |
b02f4ff163 | 6 years ago |
![]() |
11748fad78 | 6 years ago |
![]() |
e0dc51edf9 | 6 years ago |
![]() |
779238fc85 | 6 years ago |
![]() |
a06a224c0a | 6 years ago |
![]() |
bf2eb1a685 | 6 years ago |
![]() |
0bba8849f0 | 6 years ago |
![]() |
1e22a984af | 6 years ago |
![]() |
61b49137c7 | 6 years ago |
![]() |
93d072ff6e | 6 years ago |
![]() |
f0b293f650 | 6 years ago |
![]() |
b93e7d9daa | 6 years ago |
![]() |
0b4b07fcd6 | 6 years ago |
![]() |
af62621169 | 6 years ago |
![]() |
ed7260449a | 6 years ago |
![]() |
33944595a2 | 6 years ago |
62 changed files with 8677 additions and 1261 deletions
@ -0,0 +1,134 @@ |
|||
/* XMRig
|
|||
* Copyright 2010 Jeff Garzik <jgarzik@pobox.com> |
|||
* Copyright 2012-2014 pooler <pooler@litecoinpool.org> |
|||
* Copyright 2014 Lucas Jones <https://github.com/lucasjones>
|
|||
* Copyright 2014-2016 Wolf9466 <https://github.com/OhGodAPet>
|
|||
* Copyright 2016 Jay D Dee <jayddee246@gmail.com> |
|||
* Copyright 2017 fireice-uk <https://github.com/fireice-uk>
|
|||
* Copyright 2017-2018 XMR-Stak <https://github.com/fireice-uk>, <https://github.com/psychocrypt>
|
|||
* Copyright 2018 Lee Clagett <https://github.com/vtnerd>
|
|||
* Copyright 2016-2018 XMRig <https://github.com/xmrig>, <support@xmrig.com>
|
|||
* |
|||
* This program is free software: you can redistribute it and/or modify |
|||
* it under the terms of the GNU General Public License as published by |
|||
* the Free Software Foundation, either version 3 of the License, or |
|||
* (at your option) any later version. |
|||
* |
|||
* This program is distributed in the hope that it will be useful, |
|||
* but WITHOUT ANY WARRANTY; without even the implied warranty of |
|||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
|||
* GNU General Public License for more details. |
|||
* |
|||
* You should have received a copy of the GNU General Public License |
|||
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|||
*/ |
|||
|
|||
#include <x86intrin.h> |
|||
#include <string.h> |
|||
|
|||
#include "algo/cryptonight/cryptonight.h" |
|||
#include "algo/cryptonight/cryptonight_monero.h" |
|||
#include "crypto/c_keccak.h" |
|||
#include "cryptonight_lite_aesni.h" |
|||
|
|||
|
|||
void cryptonight_lite_av1_v0(const uint8_t *restrict input, size_t size, uint8_t *restrict output, struct cryptonight_ctx **restrict ctx) |
|||
{ |
|||
keccak(input, size, ctx[0]->state, 200); |
|||
|
|||
cn_explode_scratchpad((__m128i*) ctx[0]->state, (__m128i*) ctx[0]->memory); |
|||
|
|||
const uint8_t* l0 = ctx[0]->memory; |
|||
uint64_t* h0 = (uint64_t*) ctx[0]->state; |
|||
|
|||
uint64_t al0 = h0[0] ^ h0[4]; |
|||
uint64_t ah0 = h0[1] ^ h0[5]; |
|||
__m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]); |
|||
|
|||
uint64_t idx0 = h0[0] ^ h0[4]; |
|||
|
|||
for (size_t i = 0; __builtin_expect(i < 0x40000, 1); i++) { |
|||
__m128i cx; |
|||
cx = _mm_load_si128((__m128i *) &l0[idx0 & 0xFFFF0]); |
|||
cx = _mm_aesenc_si128(cx, _mm_set_epi64x(ah0, al0)); |
|||
|
|||
_mm_store_si128((__m128i *) &l0[idx0 & 0xFFFF0], _mm_xor_si128(bx0, cx)); |
|||
idx0 = EXTRACT64(cx); |
|||
bx0 = cx; |
|||
|
|||
uint64_t hi, lo, cl, ch; |
|||
cl = ((uint64_t*) &l0[idx0 & 0xFFFF0])[0]; |
|||
ch = ((uint64_t*) &l0[idx0 & 0xFFFF0])[1]; |
|||
lo = _umul128(idx0, cl, &hi); |
|||
|
|||
al0 += hi; |
|||
ah0 += lo; |
|||
|
|||
((uint64_t*)&l0[idx0 & 0xFFFF0])[0] = al0; |
|||
((uint64_t*)&l0[idx0 & 0xFFFF0])[1] = ah0; |
|||
|
|||
ah0 ^= ch; |
|||
al0 ^= cl; |
|||
idx0 = al0; |
|||
} |
|||
|
|||
cn_implode_scratchpad((__m128i*) ctx[0]->memory, (__m128i*) ctx[0]->state); |
|||
|
|||
keccakf(h0, 24); |
|||
extra_hashes[ctx[0]->state[0] & 3](ctx[0]->state, 200, output); |
|||
} |
|||
|
|||
|
|||
void cryptonight_lite_av1_v1(const uint8_t *restrict input, size_t size, uint8_t *restrict output, struct cryptonight_ctx **restrict ctx) |
|||
{ |
|||
if (size < 43) { |
|||
memset(output, 0, 32); |
|||
return; |
|||
} |
|||
|
|||
keccak(input, size, ctx[0]->state, 200); |
|||
|
|||
VARIANT1_INIT(0); |
|||
|
|||
cn_explode_scratchpad((__m128i*) ctx[0]->state, (__m128i*) ctx[0]->memory); |
|||
|
|||
const uint8_t* l0 = ctx[0]->memory; |
|||
uint64_t* h0 = (uint64_t*) ctx[0]->state; |
|||
|
|||
uint64_t al0 = h0[0] ^ h0[4]; |
|||
uint64_t ah0 = h0[1] ^ h0[5]; |
|||
__m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]); |
|||
|
|||
uint64_t idx0 = h0[0] ^ h0[4]; |
|||
|
|||
for (size_t i = 0; __builtin_expect(i < 0x40000, 1); i++) { |
|||
__m128i cx; |
|||
cx = _mm_load_si128((__m128i *) &l0[idx0 & 0xFFFF0]); |
|||
cx = _mm_aesenc_si128(cx, _mm_set_epi64x(ah0, al0)); |
|||
|
|||
cryptonight_monero_tweak((uint64_t*)&l0[idx0 & 0xFFFF0], _mm_xor_si128(bx0, cx)); |
|||
|
|||
idx0 = EXTRACT64(cx); |
|||
bx0 = cx; |
|||
|
|||
uint64_t hi, lo, cl, ch; |
|||
cl = ((uint64_t*) &l0[idx0 & 0xFFFF0])[0]; |
|||
ch = ((uint64_t*) &l0[idx0 & 0xFFFF0])[1]; |
|||
lo = _umul128(idx0, cl, &hi); |
|||
|
|||
al0 += hi; |
|||
ah0 += lo; |
|||
|
|||
((uint64_t*)&l0[idx0 & 0xFFFF0])[0] = al0; |
|||
((uint64_t*)&l0[idx0 & 0xFFFF0])[1] = ah0 ^ tweak1_2_0; |
|||
|
|||
ah0 ^= ch; |
|||
al0 ^= cl; |
|||
idx0 = al0; |
|||
} |
|||
|
|||
cn_implode_scratchpad((__m128i*) ctx[0]->memory, (__m128i*) ctx[0]->state); |
|||
|
|||
keccakf(h0, 24); |
|||
extra_hashes[ctx[0]->state[0] & 3](ctx[0]->state, 200, output); |
|||
} |
@ -1,77 +0,0 @@ |
|||
/* XMRig
|
|||
* Copyright 2010 Jeff Garzik <jgarzik@pobox.com> |
|||
* Copyright 2012-2014 pooler <pooler@litecoinpool.org> |
|||
* Copyright 2014 Lucas Jones <https://github.com/lucasjones>
|
|||
* Copyright 2014-2016 Wolf9466 <https://github.com/OhGodAPet>
|
|||
* Copyright 2016 Jay D Dee <jayddee246@gmail.com> |
|||
* Copyright 2017 fireice-uk <https://github.com/fireice-uk>
|
|||
* Copyright 2016-2017 XMRig <support@xmrig.com> |
|||
* |
|||
* |
|||
* This program is free software: you can redistribute it and/or modify |
|||
* it under the terms of the GNU General Public License as published by |
|||
* the Free Software Foundation, either version 3 of the License, or |
|||
* (at your option) any later version. |
|||
* |
|||
* This program is distributed in the hope that it will be useful, |
|||
* but WITHOUT ANY WARRANTY; without even the implied warranty of |
|||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
|||
* GNU General Public License for more details. |
|||
* |
|||
* You should have received a copy of the GNU General Public License |
|||
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|||
*/ |
|||
|
|||
#include <x86intrin.h> |
|||
#include <string.h> |
|||
|
|||
#include "algo/cryptonight/cryptonight.h" |
|||
#include "cryptonight_lite_aesni.h" |
|||
#include "crypto/c_keccak.h" |
|||
|
|||
|
|||
void cryptonight_lite_av1_aesni(const void *restrict input, size_t size, void *restrict output, struct cryptonight_ctx *restrict ctx) |
|||
{ |
|||
keccak((const uint8_t *) input, size, ctx->state0, 200); |
|||
|
|||
cn_explode_scratchpad((__m128i*) ctx->state0, (__m128i*) ctx->memory); |
|||
|
|||
const uint8_t* l0 = ctx->memory; |
|||
uint64_t* h0 = (uint64_t*) ctx->state0; |
|||
|
|||
uint64_t al0 = h0[0] ^ h0[4]; |
|||
uint64_t ah0 = h0[1] ^ h0[5]; |
|||
__m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]); |
|||
|
|||
uint64_t idx0 = h0[0] ^ h0[4]; |
|||
|
|||
for (size_t i = 0; __builtin_expect(i < 0x40000, 1); i++) { |
|||
__m128i cx; |
|||
cx = _mm_load_si128((__m128i *) &l0[idx0 & 0xFFFF0]); |
|||
cx = _mm_aesenc_si128(cx, _mm_set_epi64x(ah0, al0)); |
|||
|
|||
_mm_store_si128((__m128i *) &l0[idx0 & 0xFFFF0], _mm_xor_si128(bx0, cx)); |
|||
idx0 = EXTRACT64(cx); |
|||
bx0 = cx; |
|||
|
|||
uint64_t hi, lo, cl, ch; |
|||
cl = ((uint64_t*) &l0[idx0 & 0xFFFF0])[0]; |
|||
ch = ((uint64_t*) &l0[idx0 & 0xFFFF0])[1]; |
|||
lo = _umul128(idx0, cl, &hi); |
|||
|
|||
al0 += hi; |
|||
ah0 += lo; |
|||
|
|||
((uint64_t*)&l0[idx0 & 0xFFFF0])[0] = al0; |
|||
((uint64_t*)&l0[idx0 & 0xFFFF0])[1] = ah0; |
|||
|
|||
ah0 ^= ch; |
|||
al0 ^= cl; |
|||
idx0 = al0; |
|||
} |
|||
|
|||
cn_implode_scratchpad((__m128i*) ctx->memory, (__m128i*) ctx->state0); |
|||
|
|||
keccakf(h0, 24); |
|||
extra_hashes[ctx->state0[0] & 3](ctx->state0, 200, output); |
|||
} |
@ -0,0 +1,202 @@ |
|||
/* XMRig
|
|||
* Copyright 2010 Jeff Garzik <jgarzik@pobox.com> |
|||
* Copyright 2012-2014 pooler <pooler@litecoinpool.org> |
|||
* Copyright 2014 Lucas Jones <https://github.com/lucasjones>
|
|||
* Copyright 2014-2016 Wolf9466 <https://github.com/OhGodAPet>
|
|||
* Copyright 2016 Jay D Dee <jayddee246@gmail.com> |
|||
* Copyright 2017 fireice-uk <https://github.com/fireice-uk>
|
|||
* Copyright 2017-2018 XMR-Stak <https://github.com/fireice-uk>, <https://github.com/psychocrypt>
|
|||
* Copyright 2018 Lee Clagett <https://github.com/vtnerd>
|
|||
* Copyright 2016-2018 XMRig <https://github.com/xmrig>, <support@xmrig.com>
|
|||
* |
|||
* This program is free software: you can redistribute it and/or modify |
|||
* it under the terms of the GNU General Public License as published by |
|||
* the Free Software Foundation, either version 3 of the License, or |
|||
* (at your option) any later version. |
|||
* |
|||
* This program is distributed in the hope that it will be useful, |
|||
* but WITHOUT ANY WARRANTY; without even the implied warranty of |
|||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
|||
* GNU General Public License for more details. |
|||
* |
|||
* You should have received a copy of the GNU General Public License |
|||
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|||
*/ |
|||
|
|||
#include <x86intrin.h> |
|||
#include <string.h> |
|||
|
|||
#include "algo/cryptonight/cryptonight.h" |
|||
#include "algo/cryptonight/cryptonight_monero.h" |
|||
#include "cryptonight_lite_aesni.h" |
|||
#include "crypto/c_keccak.h" |
|||
|
|||
|
|||
void cryptonight_lite_av2_v0(const uint8_t *restrict input, size_t size, uint8_t *restrict output, struct cryptonight_ctx **restrict ctx) |
|||
{ |
|||
keccak(input, size, ctx[0]->state, 200); |
|||
keccak(input + size, size, ctx[1]->state, 200); |
|||
|
|||
const uint8_t* l0 = ctx[0]->memory; |
|||
const uint8_t* l1 = ctx[1]->memory; |
|||
uint64_t* h0 = (uint64_t*) ctx[0]->state; |
|||
uint64_t* h1 = (uint64_t*) ctx[1]->state; |
|||
|
|||
cn_explode_scratchpad((__m128i*) h0, (__m128i*) l0); |
|||
cn_explode_scratchpad((__m128i*) h1, (__m128i*) l1); |
|||
|
|||
uint64_t al0 = h0[0] ^ h0[4]; |
|||
uint64_t al1 = h1[0] ^ h1[4]; |
|||
uint64_t ah0 = h0[1] ^ h0[5]; |
|||
uint64_t ah1 = h1[1] ^ h1[5]; |
|||
|
|||
__m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]); |
|||
__m128i bx1 = _mm_set_epi64x(h1[3] ^ h1[7], h1[2] ^ h1[6]); |
|||
|
|||
uint64_t idx0 = h0[0] ^ h0[4]; |
|||
uint64_t idx1 = h1[0] ^ h1[4]; |
|||
|
|||
for (size_t i = 0; __builtin_expect(i < 0x40000, 1); i++) { |
|||
__m128i cx0 = _mm_load_si128((__m128i *) &l0[idx0 & 0xFFFF0]); |
|||
__m128i cx1 = _mm_load_si128((__m128i *) &l1[idx1 & 0xFFFF0]); |
|||
|
|||
cx0 = _mm_aesenc_si128(cx0, _mm_set_epi64x(ah0, al0)); |
|||
cx1 = _mm_aesenc_si128(cx1, _mm_set_epi64x(ah1, al1)); |
|||
|
|||
_mm_store_si128((__m128i *) &l0[idx0 & 0xFFFF0], _mm_xor_si128(bx0, cx0)); |
|||
_mm_store_si128((__m128i *) &l1[idx1 & 0xFFFF0], _mm_xor_si128(bx1, cx1)); |
|||
|
|||
idx0 = EXTRACT64(cx0); |
|||
idx1 = EXTRACT64(cx1); |
|||
|
|||
bx0 = cx0; |
|||
bx1 = cx1; |
|||
|
|||
uint64_t hi, lo, cl, ch; |
|||
cl = ((uint64_t*) &l0[idx0 & 0xFFFF0])[0]; |
|||
ch = ((uint64_t*) &l0[idx0 & 0xFFFF0])[1]; |
|||
lo = _umul128(idx0, cl, &hi); |
|||
|
|||
al0 += hi; |
|||
ah0 += lo; |
|||
|
|||
((uint64_t*) &l0[idx0 & 0xFFFF0])[0] = al0; |
|||
((uint64_t*) &l0[idx0 & 0xFFFF0])[1] = ah0; |
|||
|
|||
ah0 ^= ch; |
|||
al0 ^= cl; |
|||
idx0 = al0; |
|||
|
|||
cl = ((uint64_t*) &l1[idx1 & 0xFFFF0])[0]; |
|||
ch = ((uint64_t*) &l1[idx1 & 0xFFFF0])[1]; |
|||
lo = _umul128(idx1, cl, &hi); |
|||
|
|||
al1 += hi; |
|||
ah1 += lo; |
|||
|
|||
((uint64_t*) &l1[idx1 & 0xFFFF0])[0] = al1; |
|||
((uint64_t*) &l1[idx1 & 0xFFFF0])[1] = ah1; |
|||
|
|||
ah1 ^= ch; |
|||
al1 ^= cl; |
|||
idx1 = al1; |
|||
} |
|||
|
|||
cn_implode_scratchpad((__m128i*) l0, (__m128i*) h0); |
|||
cn_implode_scratchpad((__m128i*) l1, (__m128i*) h1); |
|||
|
|||
keccakf(h0, 24); |
|||
keccakf(h1, 24); |
|||
|
|||
extra_hashes[ctx[0]->state[0] & 3](ctx[0]->state, 200, output); |
|||
extra_hashes[ctx[1]->state[0] & 3](ctx[1]->state, 200, (char*) output + 32); |
|||
} |
|||
|
|||
|
|||
void cryptonight_lite_av2_v1(const uint8_t *restrict input, size_t size, uint8_t *restrict output, struct cryptonight_ctx **restrict ctx) |
|||
{ |
|||
if (size < 43) { |
|||
memset(output, 0, 64); |
|||
return; |
|||
} |
|||
|
|||
keccak(input, size, ctx[0]->state, 200); |
|||
keccak(input + size, size, ctx[1]->state, 200); |
|||
|
|||
VARIANT1_INIT(0); |
|||
VARIANT1_INIT(1); |
|||
|
|||
const uint8_t* l0 = ctx[0]->memory; |
|||
const uint8_t* l1 = ctx[1]->memory; |
|||
uint64_t* h0 = (uint64_t*) ctx[0]->state; |
|||
uint64_t* h1 = (uint64_t*) ctx[1]->state; |
|||
|
|||
cn_explode_scratchpad((__m128i*) h0, (__m128i*) l0); |
|||
cn_explode_scratchpad((__m128i*) h1, (__m128i*) l1); |
|||
|
|||
uint64_t al0 = h0[0] ^ h0[4]; |
|||
uint64_t al1 = h1[0] ^ h1[4]; |
|||
uint64_t ah0 = h0[1] ^ h0[5]; |
|||
uint64_t ah1 = h1[1] ^ h1[5]; |
|||
|
|||
__m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]); |
|||
__m128i bx1 = _mm_set_epi64x(h1[3] ^ h1[7], h1[2] ^ h1[6]); |
|||
|
|||
uint64_t idx0 = h0[0] ^ h0[4]; |
|||
uint64_t idx1 = h1[0] ^ h1[4]; |
|||
|
|||
for (size_t i = 0; __builtin_expect(i < 0x40000, 1); i++) { |
|||
__m128i cx0 = _mm_load_si128((__m128i *) &l0[idx0 & 0xFFFF0]); |
|||
__m128i cx1 = _mm_load_si128((__m128i *) &l1[idx1 & 0xFFFF0]); |
|||
|
|||
cx0 = _mm_aesenc_si128(cx0, _mm_set_epi64x(ah0, al0)); |
|||
cx1 = _mm_aesenc_si128(cx1, _mm_set_epi64x(ah1, al1)); |
|||
|
|||
cryptonight_monero_tweak((uint64_t*)&l0[idx0 & 0xFFFF0], _mm_xor_si128(bx0, cx0)); |
|||
cryptonight_monero_tweak((uint64_t*)&l1[idx1 & 0xFFFF0], _mm_xor_si128(bx1, cx1)); |
|||
|
|||
idx0 = EXTRACT64(cx0); |
|||
idx1 = EXTRACT64(cx1); |
|||
|
|||
bx0 = cx0; |
|||
bx1 = cx1; |
|||
|
|||
uint64_t hi, lo, cl, ch; |
|||
cl = ((uint64_t*) &l0[idx0 & 0xFFFF0])[0]; |
|||
ch = ((uint64_t*) &l0[idx0 & 0xFFFF0])[1]; |
|||
lo = _umul128(idx0, cl, &hi); |
|||
|
|||
al0 += hi; |
|||
ah0 += lo; |
|||
|
|||
((uint64_t*) &l0[idx0 & 0xFFFF0])[0] = al0; |
|||
((uint64_t*) &l0[idx0 & 0xFFFF0])[1] = ah0 ^ tweak1_2_0; |
|||
|
|||
ah0 ^= ch; |
|||
al0 ^= cl; |
|||
idx0 = al0; |
|||
|
|||
cl = ((uint64_t*) &l1[idx1 & 0xFFFF0])[0]; |
|||
ch = ((uint64_t*) &l1[idx1 & 0xFFFF0])[1]; |
|||
lo = _umul128(idx1, cl, &hi); |
|||
|
|||
al1 += hi; |
|||
ah1 += lo; |
|||
|
|||
((uint64_t*) &l1[idx1 & 0xFFFF0])[0] = al1; |
|||
((uint64_t*) &l1[idx1 & 0xFFFF0])[1] = ah1 ^ tweak1_2_1; |
|||
|
|||
ah1 ^= ch; |
|||
al1 ^= cl; |
|||
idx1 = al1; |
|||
} |
|||
|
|||
cn_implode_scratchpad((__m128i*) l0, (__m128i*) h0); |
|||
cn_implode_scratchpad((__m128i*) l1, (__m128i*) h1); |
|||
|
|||
keccakf(h0, 24); |
|||
keccakf(h1, 24); |
|||
|
|||
extra_hashes[ctx[0]->state[0] & 3](ctx[0]->state, 200, output); |
|||
extra_hashes[ctx[1]->state[0] & 3](ctx[1]->state, 200, (char*) output + 32); |
|||
} |
@ -1,111 +0,0 @@ |
|||
/* XMRig
|
|||
* Copyright 2010 Jeff Garzik <jgarzik@pobox.com> |
|||
* Copyright 2012-2014 pooler <pooler@litecoinpool.org> |
|||
* Copyright 2014 Lucas Jones <https://github.com/lucasjones>
|
|||
* Copyright 2014-2016 Wolf9466 <https://github.com/OhGodAPet>
|
|||
* Copyright 2016 Jay D Dee <jayddee246@gmail.com> |
|||
* Copyright 2017 fireice-uk <https://github.com/fireice-uk>
|
|||
* Copyright 2016-2017 XMRig <support@xmrig.com> |
|||
* |
|||
* |
|||
* This program is free software: you can redistribute it and/or modify |
|||
* it under the terms of the GNU General Public License as published by |
|||
* the Free Software Foundation, either version 3 of the License, or |
|||
* (at your option) any later version. |
|||
* |
|||
* This program is distributed in the hope that it will be useful, |
|||
* but WITHOUT ANY WARRANTY; without even the implied warranty of |
|||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
|||
* GNU General Public License for more details. |
|||
* |
|||
* You should have received a copy of the GNU General Public License |
|||
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|||
*/ |
|||
|
|||
#include <x86intrin.h> |
|||
#include <string.h> |
|||
|
|||
#include "algo/cryptonight/cryptonight.h" |
|||
#include "cryptonight_lite_aesni.h" |
|||
#include "crypto/c_keccak.h" |
|||
|
|||
|
|||
void cryptonight_lite_av2_aesni_double(const void *restrict input, size_t size, void *restrict output, struct cryptonight_ctx *restrict ctx) |
|||
{ |
|||
keccak((const uint8_t *) input, size, ctx->state0, 200); |
|||
keccak((const uint8_t *) input + size, size, ctx->state1, 200); |
|||
|
|||
const uint8_t* l0 = ctx->memory; |
|||
const uint8_t* l1 = ctx->memory + MEMORY_LITE; |
|||
uint64_t* h0 = (uint64_t*) ctx->state0; |
|||
uint64_t* h1 = (uint64_t*) ctx->state1; |
|||
|
|||
cn_explode_scratchpad((__m128i*) h0, (__m128i*) l0); |
|||
cn_explode_scratchpad((__m128i*) h1, (__m128i*) l1); |
|||
|
|||
uint64_t al0 = h0[0] ^ h0[4]; |
|||
uint64_t al1 = h1[0] ^ h1[4]; |
|||
uint64_t ah0 = h0[1] ^ h0[5]; |
|||
uint64_t ah1 = h1[1] ^ h1[5]; |
|||
|
|||
__m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]); |
|||
__m128i bx1 = _mm_set_epi64x(h1[3] ^ h1[7], h1[2] ^ h1[6]); |
|||
|
|||
uint64_t idx0 = h0[0] ^ h0[4]; |
|||
uint64_t idx1 = h1[0] ^ h1[4]; |
|||
|
|||
for (size_t i = 0; __builtin_expect(i < 0x40000, 1); i++) { |
|||
__m128i cx0 = _mm_load_si128((__m128i *) &l0[idx0 & 0xFFFF0]); |
|||
__m128i cx1 = _mm_load_si128((__m128i *) &l1[idx1 & 0xFFFF0]); |
|||
|
|||
cx0 = _mm_aesenc_si128(cx0, _mm_set_epi64x(ah0, al0)); |
|||
cx1 = _mm_aesenc_si128(cx1, _mm_set_epi64x(ah1, al1)); |
|||
|
|||
_mm_store_si128((__m128i *) &l0[idx0 & 0xFFFF0], _mm_xor_si128(bx0, cx0)); |
|||
_mm_store_si128((__m128i *) &l1[idx1 & 0xFFFF0], _mm_xor_si128(bx1, cx1)); |
|||
|
|||
idx0 = EXTRACT64(cx0); |
|||
idx1 = EXTRACT64(cx1); |
|||
|
|||
bx0 = cx0; |
|||
bx1 = cx1; |
|||
|
|||
uint64_t hi, lo, cl, ch; |
|||
cl = ((uint64_t*) &l0[idx0 & 0xFFFF0])[0]; |
|||
ch = ((uint64_t*) &l0[idx0 & 0xFFFF0])[1]; |
|||
lo = _umul128(idx0, cl, &hi); |
|||
|
|||
al0 += hi; |
|||
ah0 += lo; |
|||
|
|||
((uint64_t*) &l0[idx0 & 0xFFFF0])[0] = al0; |
|||
((uint64_t*) &l0[idx0 & 0xFFFF0])[1] = ah0; |
|||
|
|||
ah0 ^= ch; |
|||
al0 ^= cl; |
|||
idx0 = al0; |
|||
|
|||
cl = ((uint64_t*) &l1[idx1 & 0xFFFF0])[0]; |
|||
ch = ((uint64_t*) &l1[idx1 & 0xFFFF0])[1]; |
|||
lo = _umul128(idx1, cl, &hi); |
|||
|
|||
al1 += hi; |
|||
ah1 += lo; |
|||
|
|||
((uint64_t*) &l1[idx1 & 0xFFFF0])[0] = al1; |
|||
((uint64_t*) &l1[idx1 & 0xFFFF0])[1] = ah1; |
|||
|
|||
ah1 ^= ch; |
|||
al1 ^= cl; |
|||
idx1 = al1; |
|||
} |
|||
|
|||
cn_implode_scratchpad((__m128i*) l0, (__m128i*) h0); |
|||
cn_implode_scratchpad((__m128i*) l1, (__m128i*) h1); |
|||
|
|||
keccakf(h0, 24); |
|||
keccakf(h1, 24); |
|||
|
|||
extra_hashes[ctx->state0[0] & 3](ctx->state0, 200, output); |
|||
extra_hashes[ctx->state1[0] & 3](ctx->state1, 200, (char*) output + 32); |
|||
} |
@ -0,0 +1,134 @@ |
|||
/* XMRig
|
|||
* Copyright 2010 Jeff Garzik <jgarzik@pobox.com> |
|||
* Copyright 2012-2014 pooler <pooler@litecoinpool.org> |
|||
* Copyright 2014 Lucas Jones <https://github.com/lucasjones>
|
|||
* Copyright 2014-2016 Wolf9466 <https://github.com/OhGodAPet>
|
|||
* Copyright 2016 Jay D Dee <jayddee246@gmail.com> |
|||
* Copyright 2017 fireice-uk <https://github.com/fireice-uk>
|
|||
* Copyright 2017-2018 XMR-Stak <https://github.com/fireice-uk>, <https://github.com/psychocrypt>
|
|||
* Copyright 2018 Lee Clagett <https://github.com/vtnerd>
|
|||
* Copyright 2016-2018 XMRig <https://github.com/xmrig>, <support@xmrig.com>
|
|||
* |
|||
* This program is free software: you can redistribute it and/or modify |
|||
* it under the terms of the GNU General Public License as published by |
|||
* the Free Software Foundation, either version 3 of the License, or |
|||
* (at your option) any later version. |
|||
* |
|||
* This program is distributed in the hope that it will be useful, |
|||
* but WITHOUT ANY WARRANTY; without even the implied warranty of |
|||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
|||
* GNU General Public License for more details. |
|||
* |
|||
* You should have received a copy of the GNU General Public License |
|||
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|||
*/ |
|||
|
|||
#include <x86intrin.h> |
|||
#include <string.h> |
|||
|
|||
#include "algo/cryptonight/cryptonight.h" |
|||
#include "algo/cryptonight/cryptonight_monero.h" |
|||
#include "cryptonight_lite_softaes.h" |
|||
#include "crypto/c_keccak.h" |
|||
|
|||
|
|||
void cryptonight_lite_av3_v0(const uint8_t *restrict input, size_t size, uint8_t *restrict output, struct cryptonight_ctx **restrict ctx) |
|||
{ |
|||
keccak(input, size, ctx[0]->state, 200); |
|||
|
|||
cn_explode_scratchpad((__m128i*) ctx[0]->state, (__m128i*) ctx[0]->memory); |
|||
|
|||
const uint8_t* l0 = ctx[0]->memory; |
|||
uint64_t* h0 = (uint64_t*) ctx[0]->state; |
|||
|
|||
uint64_t al0 = h0[0] ^ h0[4]; |
|||
uint64_t ah0 = h0[1] ^ h0[5]; |
|||
__m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]); |
|||
|
|||
uint64_t idx0 = h0[0] ^ h0[4]; |
|||
|
|||
for (size_t i = 0; __builtin_expect(i < 0x40000, 1); i++) { |
|||
__m128i cx; |
|||
cx = _mm_load_si128((__m128i *) &l0[idx0 & 0xFFFF0]); |
|||
cx = soft_aesenc(cx, _mm_set_epi64x(ah0, al0)); |
|||
|
|||
_mm_store_si128((__m128i *) &l0[idx0 & 0xFFFF0], _mm_xor_si128(bx0, cx)); |
|||
idx0 = EXTRACT64(cx); |
|||
bx0 = cx; |
|||
|
|||
uint64_t hi, lo, cl, ch; |
|||
cl = ((uint64_t*) &l0[idx0 & 0xFFFF0])[0]; |
|||
ch = ((uint64_t*) &l0[idx0 & 0xFFFF0])[1]; |
|||
lo = _umul128(idx0, cl, &hi); |
|||
|
|||
al0 += hi; |
|||
ah0 += lo; |
|||
|
|||
((uint64_t*)&l0[idx0 & 0xFFFF0])[0] = al0; |
|||
((uint64_t*)&l0[idx0 & 0xFFFF0])[1] = ah0; |
|||
|
|||
ah0 ^= ch; |
|||
al0 ^= cl; |
|||
idx0 = al0; |
|||
} |
|||
|
|||
cn_implode_scratchpad((__m128i*) ctx[0]->memory, (__m128i*) ctx[0]->state); |
|||
|
|||
keccakf(h0, 24); |
|||
extra_hashes[ctx[0]->state[0] & 3](ctx[0]->state, 200, output); |
|||
} |
|||
|
|||
|
|||
void cryptonight_lite_av3_v1(const uint8_t *restrict input, size_t size, uint8_t *restrict output, struct cryptonight_ctx **restrict ctx) |
|||
{ |
|||
if (size < 43) { |
|||
memset(output, 0, 32); |
|||
return; |
|||
} |
|||
|
|||
keccak(input, size, ctx[0]->state, 200); |
|||
|
|||
VARIANT1_INIT(0); |
|||
|
|||
cn_explode_scratchpad((__m128i*) ctx[0]->state, (__m128i*) ctx[0]->memory); |
|||
|
|||
const uint8_t* l0 = ctx[0]->memory; |
|||
uint64_t* h0 = (uint64_t*) ctx[0]->state; |
|||
|
|||
uint64_t al0 = h0[0] ^ h0[4]; |
|||
uint64_t ah0 = h0[1] ^ h0[5]; |
|||
__m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]); |
|||
|
|||
uint64_t idx0 = h0[0] ^ h0[4]; |
|||
|
|||
for (size_t i = 0; __builtin_expect(i < 0x40000, 1); i++) { |
|||
__m128i cx; |
|||
cx = _mm_load_si128((__m128i *) &l0[idx0 & 0xFFFF0]); |
|||
cx = soft_aesenc(cx, _mm_set_epi64x(ah0, al0)); |
|||
|
|||
cryptonight_monero_tweak((uint64_t*)&l0[idx0 & 0xFFFF0], _mm_xor_si128(bx0, cx)); |
|||
|
|||
idx0 = EXTRACT64(cx); |
|||
bx0 = cx; |
|||
|
|||
uint64_t hi, lo, cl, ch; |
|||
cl = ((uint64_t*) &l0[idx0 & 0xFFFF0])[0]; |
|||
ch = ((uint64_t*) &l0[idx0 & 0xFFFF0])[1]; |
|||
lo = _umul128(idx0, cl, &hi); |
|||
|
|||
al0 += hi; |
|||
ah0 += lo; |
|||
|
|||
((uint64_t*)&l0[idx0 & 0xFFFF0])[0] = al0; |
|||
((uint64_t*)&l0[idx0 & 0xFFFF0])[1] = ah0 ^ tweak1_2_0; |
|||
|
|||
ah0 ^= ch; |
|||
al0 ^= cl; |
|||
idx0 = al0; |
|||
} |
|||
|
|||
cn_implode_scratchpad((__m128i*) ctx[0]->memory, (__m128i*) ctx[0]->state); |
|||
|
|||
keccakf(h0, 24); |
|||
extra_hashes[ctx[0]->state[0] & 3](ctx[0]->state, 200, output); |
|||
} |
@ -1,77 +0,0 @@ |
|||
/* XMRig
|
|||
* Copyright 2010 Jeff Garzik <jgarzik@pobox.com> |
|||
* Copyright 2012-2014 pooler <pooler@litecoinpool.org> |
|||
* Copyright 2014 Lucas Jones <https://github.com/lucasjones>
|
|||
* Copyright 2014-2016 Wolf9466 <https://github.com/OhGodAPet>
|
|||
* Copyright 2016 Jay D Dee <jayddee246@gmail.com> |
|||
* Copyright 2017 fireice-uk <https://github.com/fireice-uk>
|
|||
* Copyright 2016-2017 XMRig <support@xmrig.com> |
|||
* |
|||
* |
|||
* This program is free software: you can redistribute it and/or modify |
|||
* it under the terms of the GNU General Public License as published by |
|||
* the Free Software Foundation, either version 3 of the License, or |
|||
* (at your option) any later version. |
|||
* |
|||
* This program is distributed in the hope that it will be useful, |
|||
* but WITHOUT ANY WARRANTY; without even the implied warranty of |
|||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
|||
* GNU General Public License for more details. |
|||
* |
|||
* You should have received a copy of the GNU General Public License |
|||
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|||
*/ |
|||
|
|||
#include <x86intrin.h> |
|||
#include <string.h> |
|||
|
|||
#include "algo/cryptonight/cryptonight.h" |
|||
#include "cryptonight_lite_softaes.h" |
|||
#include "crypto/c_keccak.h" |
|||
|
|||
|
|||
void cryptonight_lite_av3_softaes(const void *restrict input, size_t size, void *restrict output, struct cryptonight_ctx *restrict ctx) |
|||
{ |
|||
keccak((const uint8_t *) input, size, ctx->state0, 200); |
|||
|
|||
cn_explode_scratchpad((__m128i*) ctx->state0, (__m128i*) ctx->memory); |
|||
|
|||
const uint8_t* l0 = ctx->memory; |
|||
uint64_t* h0 = (uint64_t*) ctx->state0; |
|||
|
|||
uint64_t al0 = h0[0] ^ h0[4]; |
|||
uint64_t ah0 = h0[1] ^ h0[5]; |
|||
__m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]); |
|||
|
|||
uint64_t idx0 = h0[0] ^ h0[4]; |
|||
|
|||
for (size_t i = 0; __builtin_expect(i < 0x40000, 1); i++) { |
|||
__m128i cx; |
|||
cx = _mm_load_si128((__m128i *)&l0[idx0 & 0xFFFF0]); |
|||
cx = soft_aesenc(cx, _mm_set_epi64x(ah0, al0)); |
|||
|
|||
_mm_store_si128((__m128i *)&l0[idx0 & 0xFFFF0], _mm_xor_si128(bx0, cx)); |
|||
idx0 = EXTRACT64(cx); |
|||
bx0 = cx; |
|||
|
|||
uint64_t hi, lo, cl, ch; |
|||
cl = ((uint64_t*)&l0[idx0 & 0xFFFF0])[0]; |
|||
ch = ((uint64_t*)&l0[idx0 & 0xFFFF0])[1]; |
|||
lo = _umul128(idx0, cl, &hi); |
|||
|
|||
al0 += hi; |
|||
ah0 += lo; |
|||
|
|||
((uint64_t*)&l0[idx0 & 0xFFFF0])[0] = al0; |
|||
((uint64_t*)&l0[idx0 & 0xFFFF0])[1] = ah0; |
|||
|
|||
ah0 ^= ch; |
|||
al0 ^= cl; |
|||
idx0 = al0; |
|||
} |
|||
|
|||
cn_implode_scratchpad((__m128i*) ctx->memory, (__m128i*) ctx->state0); |
|||
|
|||
keccakf(h0, 24); |
|||
extra_hashes[ctx->state0[0] & 3](ctx->state0, 200, output); |
|||
} |
@ -0,0 +1,202 @@ |
|||
/* XMRig
|
|||
* Copyright 2010 Jeff Garzik <jgarzik@pobox.com> |
|||
* Copyright 2012-2014 pooler <pooler@litecoinpool.org> |
|||
* Copyright 2014 Lucas Jones <https://github.com/lucasjones>
|
|||
* Copyright 2014-2016 Wolf9466 <https://github.com/OhGodAPet>
|
|||
* Copyright 2016 Jay D Dee <jayddee246@gmail.com> |
|||
* Copyright 2017 fireice-uk <https://github.com/fireice-uk>
|
|||
* Copyright 2017-2018 XMR-Stak <https://github.com/fireice-uk>, <https://github.com/psychocrypt>
|
|||
* Copyright 2018 Lee Clagett <https://github.com/vtnerd>
|
|||
* Copyright 2016-2018 XMRig <https://github.com/xmrig>, <support@xmrig.com>
|
|||
* |
|||
* This program is free software: you can redistribute it and/or modify |
|||
* it under the terms of the GNU General Public License as published by |
|||
* the Free Software Foundation, either version 3 of the License, or |
|||
* (at your option) any later version. |
|||
* |
|||
* This program is distributed in the hope that it will be useful, |
|||
* but WITHOUT ANY WARRANTY; without even the implied warranty of |
|||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
|||
* GNU General Public License for more details. |
|||
* |
|||
* You should have received a copy of the GNU General Public License |
|||
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|||
*/ |
|||
|
|||
#include <x86intrin.h> |
|||
#include <string.h> |
|||
|
|||
#include "algo/cryptonight/cryptonight.h" |
|||
#include "algo/cryptonight/cryptonight_monero.h" |
|||
#include "cryptonight_lite_softaes.h" |
|||
#include "crypto/c_keccak.h" |
|||
|
|||
|
|||
void cryptonight_lite_av4_v0(const void *restrict input, size_t size, void *restrict output, struct cryptonight_ctx **restrict ctx) |
|||
{ |
|||
keccak(input, size, ctx[0]->state, 200); |
|||
keccak(input + size, size, ctx[1]->state, 200); |
|||
|
|||
const uint8_t* l0 = ctx[0]->memory; |
|||
const uint8_t* l1 = ctx[1]->memory; |
|||
uint64_t* h0 = (uint64_t*) ctx[0]->state; |
|||
uint64_t* h1 = (uint64_t*) ctx[1]->state; |
|||
|
|||
cn_explode_scratchpad((__m128i*) h0, (__m128i*) l0); |
|||
cn_explode_scratchpad((__m128i*) h1, (__m128i*) l1); |
|||
|
|||
uint64_t al0 = h0[0] ^ h0[4]; |
|||
uint64_t al1 = h1[0] ^ h1[4]; |
|||
uint64_t ah0 = h0[1] ^ h0[5]; |
|||
uint64_t ah1 = h1[1] ^ h1[5]; |
|||
|
|||
__m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]); |
|||
__m128i bx1 = _mm_set_epi64x(h1[3] ^ h1[7], h1[2] ^ h1[6]); |
|||
|
|||
uint64_t idx0 = h0[0] ^ h0[4]; |
|||
uint64_t idx1 = h1[0] ^ h1[4]; |
|||
|
|||
for (size_t i = 0; __builtin_expect(i < 0x40000, 1); i++) { |
|||
__m128i cx0 = _mm_load_si128((__m128i *) &l0[idx0 & 0xFFFF0]); |
|||
__m128i cx1 = _mm_load_si128((__m128i *) &l1[idx1 & 0xFFFF0]); |
|||
|
|||
cx0 = soft_aesenc(cx0, _mm_set_epi64x(ah0, al0)); |
|||
cx1 = soft_aesenc(cx1, _mm_set_epi64x(ah1, al1)); |
|||
|
|||
_mm_store_si128((__m128i *) &l0[idx0 & 0xFFFF0], _mm_xor_si128(bx0, cx0)); |
|||
_mm_store_si128((__m128i *) &l1[idx1 & 0xFFFF0], _mm_xor_si128(bx1, cx1)); |
|||
|
|||
idx0 = EXTRACT64(cx0); |
|||
idx1 = EXTRACT64(cx1); |
|||
|
|||
bx0 = cx0; |
|||
bx1 = cx1; |
|||
|
|||
uint64_t hi, lo, cl, ch; |
|||
cl = ((uint64_t*) &l0[idx0 & 0xFFFF0])[0]; |
|||
ch = ((uint64_t*) &l0[idx0 & 0xFFFF0])[1]; |
|||
lo = _umul128(idx0, cl, &hi); |
|||
|
|||
al0 += hi; |
|||
ah0 += lo; |
|||
|
|||
((uint64_t*) &l0[idx0 & 0xFFFF0])[0] = al0; |
|||
((uint64_t*) &l0[idx0 & 0xFFFF0])[1] = ah0; |
|||
|
|||
ah0 ^= ch; |
|||
al0 ^= cl; |
|||
idx0 = al0; |
|||
|
|||
cl = ((uint64_t*) &l1[idx1 & 0xFFFF0])[0]; |
|||
ch = ((uint64_t*) &l1[idx1 & 0xFFFF0])[1]; |
|||
lo = _umul128(idx1, cl, &hi); |
|||
|
|||
al1 += hi; |
|||
ah1 += lo; |
|||
|
|||
((uint64_t*) &l1[idx1 & 0xFFFF0])[0] = al1; |
|||
((uint64_t*) &l1[idx1 & 0xFFFF0])[1] = ah1; |
|||
|
|||
ah1 ^= ch; |
|||
al1 ^= cl; |
|||
idx1 = al1; |
|||
} |
|||
|
|||
cn_implode_scratchpad((__m128i*) l0, (__m128i*) h0); |
|||
cn_implode_scratchpad((__m128i*) l1, (__m128i*) h1); |
|||
|
|||
keccakf(h0, 24); |
|||
keccakf(h1, 24); |
|||
|
|||
extra_hashes[ctx[0]->state[0] & 3](ctx[0]->state, 200, output); |
|||
extra_hashes[ctx[1]->state[0] & 3](ctx[1]->state, 200, output + 32); |
|||
} |
|||
|
|||
|
|||
void cryptonight_lite_av4_v1(const void *restrict input, size_t size, void *restrict output, struct cryptonight_ctx **restrict ctx) |
|||
{ |
|||
if (size < 43) { |
|||
memset(output, 0, 64); |
|||
return; |
|||
} |
|||
|
|||
keccak(input, size, ctx[0]->state, 200); |
|||
keccak(input + size, size, ctx[1]->state, 200); |
|||
|
|||
VARIANT1_INIT(0); |
|||
VARIANT1_INIT(1); |
|||
|
|||
const uint8_t* l0 = ctx[0]->memory; |
|||
const uint8_t* l1 = ctx[1]->memory; |
|||
uint64_t* h0 = (uint64_t*) ctx[0]->state; |
|||
uint64_t* h1 = (uint64_t*) ctx[1]->state; |
|||
|
|||
cn_explode_scratchpad((__m128i*) h0, (__m128i*) l0); |
|||
cn_explode_scratchpad((__m128i*) h1, (__m128i*) l1); |
|||
|
|||
uint64_t al0 = h0[0] ^ h0[4]; |
|||
uint64_t al1 = h1[0] ^ h1[4]; |
|||
uint64_t ah0 = h0[1] ^ h0[5]; |
|||
uint64_t ah1 = h1[1] ^ h1[5]; |
|||
|
|||
__m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]); |
|||
__m128i bx1 = _mm_set_epi64x(h1[3] ^ h1[7], h1[2] ^ h1[6]); |
|||
|
|||
uint64_t idx0 = h0[0] ^ h0[4]; |
|||
uint64_t idx1 = h1[0] ^ h1[4]; |
|||
|
|||
for (size_t i = 0; __builtin_expect(i < 0x40000, 1); i++) { |
|||
__m128i cx0 = _mm_load_si128((__m128i *) &l0[idx0 & 0xFFFF0]); |
|||
__m128i cx1 = _mm_load_si128((__m128i *) &l1[idx1 & 0xFFFF0]); |
|||
|
|||
cx0 = soft_aesenc(cx0, _mm_set_epi64x(ah0, al0)); |
|||
cx1 = soft_aesenc(cx1, _mm_set_epi64x(ah1, al1)); |
|||
|
|||
cryptonight_monero_tweak((uint64_t*)&l0[idx0 & 0xFFFF0], _mm_xor_si128(bx0, cx0)); |
|||
cryptonight_monero_tweak((uint64_t*)&l1[idx1 & 0xFFFF0], _mm_xor_si128(bx1, cx1)); |
|||
|
|||
idx0 = EXTRACT64(cx0); |
|||
idx1 = EXTRACT64(cx1); |
|||
|
|||
bx0 = cx0; |
|||
bx1 = cx1; |
|||
|
|||
uint64_t hi, lo, cl, ch; |
|||
cl = ((uint64_t*) &l0[idx0 & 0xFFFF0])[0]; |
|||
ch = ((uint64_t*) &l0[idx0 & 0xFFFF0])[1]; |
|||
lo = _umul128(idx0, cl, &hi); |
|||
|
|||
al0 += hi; |
|||
ah0 += lo; |
|||
|
|||
((uint64_t*) &l0[idx0 & 0xFFFF0])[0] = al0; |
|||
((uint64_t*) &l0[idx0 & 0xFFFF0])[1] = ah0 ^ tweak1_2_0; |
|||
|
|||
ah0 ^= ch; |
|||
al0 ^= cl; |
|||
idx0 = al0; |
|||
|
|||
cl = ((uint64_t*) &l1[idx1 & 0xFFFF0])[0]; |
|||
ch = ((uint64_t*) &l1[idx1 & 0xFFFF0])[1]; |
|||
lo = _umul128(idx1, cl, &hi); |
|||
|
|||
al1 += hi; |
|||
ah1 += lo; |
|||
|
|||
((uint64_t*) &l1[idx1 & 0xFFFF0])[0] = al1; |
|||
((uint64_t*) &l1[idx1 & 0xFFFF0])[1] = ah1 ^ tweak1_2_1; |
|||
|
|||
ah1 ^= ch; |
|||
al1 ^= cl; |
|||
idx1 = al1; |
|||
} |
|||
|
|||
cn_implode_scratchpad((__m128i*) l0, (__m128i*) h0); |
|||
cn_implode_scratchpad((__m128i*) l1, (__m128i*) h1); |
|||
|
|||
keccakf(h0, 24); |
|||
keccakf(h1, 24); |
|||
|
|||
extra_hashes[ctx[0]->state[0] & 3](ctx[0]->state, 200, output); |
|||
extra_hashes[ctx[1]->state[0] & 3](ctx[1]->state, 200, (char*) output + 32); |
|||
} |
@ -1,111 +0,0 @@ |
|||
/* XMRig
|
|||
* Copyright 2010 Jeff Garzik <jgarzik@pobox.com> |
|||
* Copyright 2012-2014 pooler <pooler@litecoinpool.org> |
|||
* Copyright 2014 Lucas Jones <https://github.com/lucasjones>
|
|||
* Copyright 2014-2016 Wolf9466 <https://github.com/OhGodAPet>
|
|||
* Copyright 2016 Jay D Dee <jayddee246@gmail.com> |
|||
* Copyright 2017 fireice-uk <https://github.com/fireice-uk>
|
|||
* Copyright 2016-2017 XMRig <support@xmrig.com> |
|||
* |
|||
* |
|||
* This program is free software: you can redistribute it and/or modify |
|||
* it under the terms of the GNU General Public License as published by |
|||
* the Free Software Foundation, either version 3 of the License, or |
|||
* (at your option) any later version. |
|||
* |
|||
* This program is distributed in the hope that it will be useful, |
|||
* but WITHOUT ANY WARRANTY; without even the implied warranty of |
|||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
|||
* GNU General Public License for more details. |
|||
* |
|||
* You should have received a copy of the GNU General Public License |
|||
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|||
*/ |
|||
|
|||
#include <x86intrin.h> |
|||
#include <string.h> |
|||
|
|||
#include "algo/cryptonight/cryptonight.h" |
|||
#include "cryptonight_lite_softaes.h" |
|||
#include "crypto/c_keccak.h" |
|||
|
|||
|
|||
void cryptonight_lite_av4_softaes_double(const void *restrict input, size_t size, void *restrict output, struct cryptonight_ctx *restrict ctx) |
|||
{ |
|||
keccak((const uint8_t *) input, size, ctx->state0, 200); |
|||
keccak((const uint8_t *) input + size, size, ctx->state1, 200); |
|||
|
|||
const uint8_t* l0 = ctx->memory; |
|||
const uint8_t* l1 = ctx->memory + MEMORY_LITE; |
|||
uint64_t* h0 = (uint64_t*) ctx->state0; |
|||
uint64_t* h1 = (uint64_t*) ctx->state1; |
|||
|
|||
cn_explode_scratchpad((__m128i*) h0, (__m128i*) l0); |
|||
cn_explode_scratchpad((__m128i*) h1, (__m128i*) l1); |
|||
|
|||
uint64_t al0 = h0[0] ^ h0[4]; |
|||
uint64_t al1 = h1[0] ^ h1[4]; |
|||
uint64_t ah0 = h0[1] ^ h0[5]; |
|||
uint64_t ah1 = h1[1] ^ h1[5]; |
|||
|
|||
__m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]); |
|||
__m128i bx1 = _mm_set_epi64x(h1[3] ^ h1[7], h1[2] ^ h1[6]); |
|||
|
|||
uint64_t idx0 = h0[0] ^ h0[4]; |
|||
uint64_t idx1 = h1[0] ^ h1[4]; |
|||
|
|||
for (size_t i = 0; __builtin_expect(i < 0x40000, 1); i++) { |
|||
__m128i cx0 = _mm_load_si128((__m128i *) &l0[idx0 & 0xFFFF0]); |
|||
__m128i cx1 = _mm_load_si128((__m128i *) &l1[idx1 & 0xFFFF0]); |
|||
|
|||
cx0 = soft_aesenc(cx0, _mm_set_epi64x(ah0, al0)); |
|||
cx1 = soft_aesenc(cx1, _mm_set_epi64x(ah1, al1)); |
|||
|
|||
_mm_store_si128((__m128i *) &l0[idx0 & 0xFFFF0], _mm_xor_si128(bx0, cx0)); |
|||
_mm_store_si128((__m128i *) &l1[idx1 & 0xFFFF0], _mm_xor_si128(bx1, cx1)); |
|||
|
|||
idx0 = EXTRACT64(cx0); |
|||
idx1 = EXTRACT64(cx1); |
|||
|
|||
bx0 = cx0; |
|||
bx1 = cx1; |
|||
|
|||
uint64_t hi, lo, cl, ch; |
|||
cl = ((uint64_t*) &l0[idx0 & 0xFFFF0])[0]; |
|||
ch = ((uint64_t*) &l0[idx0 & 0xFFFF0])[1]; |
|||
lo = _umul128(idx0, cl, &hi); |
|||
|
|||
al0 += hi; |
|||
ah0 += lo; |
|||
|
|||
((uint64_t*) &l0[idx0 & 0xFFFF0])[0] = al0; |
|||
((uint64_t*) &l0[idx0 & 0xFFFF0])[1] = ah0; |
|||
|
|||
ah0 ^= ch; |
|||
al0 ^= cl; |
|||
idx0 = al0; |
|||
|
|||
cl = ((uint64_t*) &l1[idx1 & 0xFFFF0])[0]; |
|||
ch = ((uint64_t*) &l1[idx1 & 0xFFFF0])[1]; |
|||
lo = _umul128(idx1, cl, &hi); |
|||
|
|||
al1 += hi; |
|||
ah1 += lo; |
|||
|
|||
((uint64_t*) &l1[idx1 & 0xFFFF0])[0] = al1; |
|||
((uint64_t*) &l1[idx1 & 0xFFFF0])[1] = ah1; |
|||
|
|||
ah1 ^= ch; |
|||
al1 ^= cl; |
|||
idx1 = al1; |
|||
} |
|||
|
|||
cn_implode_scratchpad((__m128i*) l0, (__m128i*) h0); |
|||
cn_implode_scratchpad((__m128i*) l1, (__m128i*) h1); |
|||
|
|||
keccakf(h0, 24); |
|||
keccakf(h1, 24); |
|||
|
|||
extra_hashes[ctx->state0[0] & 3](ctx->state0, 200, output); |
|||
extra_hashes[ctx->state1[0] & 3](ctx->state1, 200, (char*) output + 32); |
|||
} |
@ -0,0 +1,261 @@ |
|||
/* XMRig
|
|||
* Copyright 2010 Jeff Garzik <jgarzik@pobox.com> |
|||
* Copyright 2012-2014 pooler <pooler@litecoinpool.org> |
|||
* Copyright 2014 Lucas Jones <https://github.com/lucasjones>
|
|||
* Copyright 2014-2016 Wolf9466 <https://github.com/OhGodAPet>
|
|||
* Copyright 2016 Jay D Dee <jayddee246@gmail.com> |
|||
* Copyright 2017 fireice-uk <https://github.com/fireice-uk>
|
|||
* Copyright 2017-2018 XMR-Stak <https://github.com/fireice-uk>, <https://github.com/psychocrypt>
|
|||
* Copyright 2018 Lee Clagett <https://github.com/vtnerd>
|
|||
* Copyright 2016-2018 XMRig <https://github.com/xmrig>, <support@xmrig.com>
|
|||
* |
|||
* This program is free software: you can redistribute it and/or modify |
|||
* it under the terms of the GNU General Public License as published by |
|||
* the Free Software Foundation, either version 3 of the License, or |
|||
* (at your option) any later version. |
|||
* |
|||
* This program is distributed in the hope that it will be useful, |
|||
* but WITHOUT ANY WARRANTY; without even the implied warranty of |
|||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
|||
* GNU General Public License for more details. |
|||
* |
|||
* You should have received a copy of the GNU General Public License |
|||
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|||
*/ |
|||
|
|||
#include <x86intrin.h> |
|||
#include <string.h> |
|||
|
|||
#include "crypto/c_keccak.h" |
|||
#include "cryptonight.h" |
|||
#include "cryptonight_aesni.h" |
|||
#include "cryptonight_monero.h" |
|||
|
|||
|
|||
void cryptonight_av1_v0(const uint8_t *restrict input, size_t size, uint8_t *restrict output, struct cryptonight_ctx **restrict ctx) |
|||
{ |
|||
keccak(input, size, ctx[0]->state, 200); |
|||
|
|||
cn_explode_scratchpad((__m128i*) ctx[0]->state, (__m128i*) ctx[0]->memory); |
|||
|
|||
const uint8_t* l0 = ctx[0]->memory; |
|||
uint64_t* h0 = (uint64_t*) ctx[0]->state; |
|||
|
|||
uint64_t al0 = h0[0] ^ h0[4]; |
|||
uint64_t ah0 = h0[1] ^ h0[5]; |
|||
__m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]); |
|||
|
|||
uint64_t idx0 = h0[0] ^ h0[4]; |
|||
|
|||
for (size_t i = 0; __builtin_expect(i < 0x80000, 1); i++) { |
|||
__m128i cx; |
|||
cx = _mm_load_si128((__m128i *) &l0[idx0 & 0x1FFFF0]); |
|||
cx = _mm_aesenc_si128(cx, _mm_set_epi64x(ah0, al0)); |
|||
|
|||
_mm_store_si128((__m128i *) &l0[idx0 & 0x1FFFF0], _mm_xor_si128(bx0, cx)); |
|||
idx0 = EXTRACT64(cx); |
|||
bx0 = cx; |
|||
|
|||
uint64_t hi, lo, cl, ch; |
|||
cl = ((uint64_t*) &l0[idx0 & 0x1FFFF0])[0]; |
|||
ch = ((uint64_t*) &l0[idx0 & 0x1FFFF0])[1]; |
|||
lo = _umul128(idx0, cl, &hi); |
|||
|
|||
al0 += hi; |
|||
ah0 += lo; |
|||
|
|||
((uint64_t*)&l0[idx0 & 0x1FFFF0])[0] = al0; |
|||
((uint64_t*)&l0[idx0 & 0x1FFFF0])[1] = ah0; |
|||
|
|||
ah0 ^= ch; |
|||
al0 ^= cl; |
|||
idx0 = al0; |
|||
} |
|||
|
|||
cn_implode_scratchpad((__m128i*) ctx[0]->memory, (__m128i*) ctx[0]->state); |
|||
|
|||
keccakf(h0, 24); |
|||
extra_hashes[ctx[0]->state[0] & 3](ctx[0]->state, 200, output); |
|||
} |
|||
|
|||
|
|||
void cryptonight_av1_v1(const uint8_t *restrict input, size_t size, uint8_t *restrict output, struct cryptonight_ctx **restrict ctx) |
|||
{ |
|||
if (size < 43) { |
|||
memset(output, 0, 32); |
|||
return; |
|||
} |
|||
|
|||
keccak(input, size, ctx[0]->state, 200); |
|||
|
|||
VARIANT1_INIT(0); |
|||
|
|||
cn_explode_scratchpad((__m128i*) ctx[0]->state, (__m128i*) ctx[0]->memory); |
|||
|
|||
const uint8_t* l0 = ctx[0]->memory; |
|||
uint64_t* h0 = (uint64_t*) ctx[0]->state; |
|||
|
|||
uint64_t al0 = h0[0] ^ h0[4]; |
|||
uint64_t ah0 = h0[1] ^ h0[5]; |
|||
__m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]); |
|||
|
|||
uint64_t idx0 = h0[0] ^ h0[4]; |
|||
|
|||
for (size_t i = 0; __builtin_expect(i < 0x80000, 1); i++) { |
|||
__m128i cx; |
|||
cx = _mm_load_si128((__m128i *) &l0[idx0 & 0x1FFFF0]); |
|||
cx = _mm_aesenc_si128(cx, _mm_set_epi64x(ah0, al0)); |
|||
|
|||
cryptonight_monero_tweak((uint64_t*)&l0[idx0 & 0x1FFFF0], _mm_xor_si128(bx0, cx)); |
|||
|
|||
idx0 = EXTRACT64(cx); |
|||
bx0 = cx; |
|||
|
|||
uint64_t hi, lo, cl, ch; |
|||
cl = ((uint64_t*) &l0[idx0 & 0x1FFFF0])[0]; |
|||
ch = ((uint64_t*) &l0[idx0 & 0x1FFFF0])[1]; |
|||
lo = _umul128(idx0, cl, &hi); |
|||
|
|||
al0 += hi; |
|||
ah0 += lo; |
|||
|
|||
((uint64_t*)&l0[idx0 & 0x1FFFF0])[0] = al0; |
|||
((uint64_t*)&l0[idx0 & 0x1FFFF0])[1] = ah0 ^ tweak1_2_0; |
|||
|
|||
ah0 ^= ch; |
|||
al0 ^= cl; |
|||
idx0 = al0; |
|||
} |
|||
|
|||
cn_implode_scratchpad((__m128i*) ctx[0]->memory, (__m128i*) ctx[0]->state); |
|||
|
|||
keccakf(h0, 24); |
|||
extra_hashes[ctx[0]->state[0] & 3](ctx[0]->state, 200, output); |
|||
} |
|||
|
|||
|
|||
/*
 * Single-hash CryptoNight using the AES-NI round instruction, variant 2.
 *
 * Hashes `size` bytes from `input` using ctx[0]'s state and scratchpad and
 * writes the result to `output`.
 *
 * NOTE(review): the VARIANT2_* macros are defined outside this function and
 * are invoked after the locals exist, so they may refer to locals by name
 * (h0, al0, ...). Confirm against their definitions before renaming or
 * reordering anything here.
 */
void cryptonight_av1_v2(const uint8_t *restrict input, size_t size, uint8_t *restrict output, struct cryptonight_ctx **restrict ctx) |
|||
{ |
|||
keccak(input, size, ctx[0]->state, 200); |
|||
|
|||
cn_explode_scratchpad((__m128i*) ctx[0]->state, (__m128i*) ctx[0]->memory); |
|||
|
|||
const uint8_t* l0 = ctx[0]->memory; |
|||
uint64_t* h0 = (uint64_t*) ctx[0]->state; |
|||
|
|||
VARIANT2_INIT(0); |
|||
VARIANT2_SET_ROUNDING_MODE(); |
|||
|
|||
/* Running values seeded by XOR-ing pairs of 64-bit state words. */
uint64_t al0 = h0[0] ^ h0[4]; |
|||
uint64_t ah0 = h0[1] ^ h0[5]; |
|||
__m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]); |
|||
/* Variant 2 keeps a second history block, seeded from state words 8..11. */
__m128i bx1 = _mm_set_epi64x(h0[9] ^ h0[11], h0[8] ^ h0[10]); |
|||
|
|||
uint64_t idx0 = al0; |
|||
|
|||
/* 0x80000 rounds; the 0x1FFFF0 mask keeps every scratchpad offset 16-byte aligned and below 0x200000. */
for (size_t i = 0; __builtin_expect(i < 0x80000, 1); i++) { |
|||
__m128i cx = _mm_load_si128((__m128i *) &l0[idx0 & 0x1FFFF0]); |
|||
const __m128i ax0 = _mm_set_epi64x(ah0, al0); |
|||
|
|||
cx = _mm_aesenc_si128(cx, ax0); |
|||
|
|||
/* The shuffle macro runs before the slot is overwritten with bx0 ^ cx. */
VARIANT2_SHUFFLE(l0, idx0 & 0x1FFFF0, ax0, bx0, bx1); |
|||
_mm_store_si128((__m128i *) &l0[idx0 & 0x1FFFF0], _mm_xor_si128(bx0, cx)); |
|||
|
|||
/* Next offset is the low 64 bits of the AES result. */
idx0 = _mm_cvtsi128_si64(cx); |
|||
|
|||
uint64_t hi, lo, cl, ch; |
|||
cl = ((uint64_t*) &l0[idx0 & 0x1FFFF0])[0]; |
|||
ch = ((uint64_t*) &l0[idx0 & 0x1FFFF0])[1]; |
|||
|
|||
/* cl is passed to the macro by name and may be modified by it before the multiply - TODO confirm against the macro definition. */
VARIANT2_INTEGER_MATH(0, cl, cx); |
|||
/* presumably lo/hi are the low/high halves of the 128-bit product idx0 * cl; _umul128 is defined elsewhere. */
lo = _umul128(idx0, cl, &hi); |
|||
VARIANT2_SHUFFLE2(l0, idx0 & 0x1FFFF0, ax0, bx0, bx1, hi, lo); |
|||
|
|||
al0 += hi; |
|||
ah0 += lo; |
|||
|
|||
/* Write the updated values to the slot, then fold the slot's previous contents into them. */
((uint64_t*)&l0[idx0 & 0x1FFFF0])[0] = al0; |
|||
((uint64_t*)&l0[idx0 & 0x1FFFF0])[1] = ah0; |
|||
|
|||
al0 ^= cl; |
|||
ah0 ^= ch; |
|||
idx0 = al0; |
|||
|
|||
/* Shift the history: bx1 takes the previous bx0, bx0 takes this round's AES result. */
bx1 = bx0; |
|||
bx0 = cx; |
|||
} |
|||
|
|||
cn_implode_scratchpad((__m128i*) ctx[0]->memory, (__m128i*) ctx[0]->state); |
|||
|
|||
keccakf(h0, 24); |
|||
/* The low two bits of the first state byte pick the final hash function. */
extra_hashes[ctx[0]->state[0] & 3](ctx[0]->state, 200, output); |
|||
} |
|||
|
|||
|
|||
#ifndef XMRIG_NO_ASM |
|||
extern void cnv2_mainloop_ivybridge_asm(struct cryptonight_ctx *ctx); |
|||
extern void cnv2_mainloop_ryzen_asm(struct cryptonight_ctx *ctx); |
|||
extern void cnv2_mainloop_bulldozer_asm(struct cryptonight_ctx *ctx); |
|||
extern void cnv2_double_mainloop_sandybridge_asm(struct cryptonight_ctx* ctx0, struct cryptonight_ctx* ctx1); |
|||
|
|||
|
|||
void cryptonight_single_hash_asm_intel(const uint8_t *restrict input, size_t size, uint8_t *restrict output, struct cryptonight_ctx **restrict ctx) |
|||
{ |
|||
keccak(input, size, ctx[0]->state, 200); |
|||
cn_explode_scratchpad((__m128i*) ctx[0]->state, (__m128i*) ctx[0]->memory); |
|||
|
|||
cnv2_mainloop_ivybridge_asm(ctx[0]); |
|||
|
|||
cn_implode_scratchpad((__m128i*) ctx[0]->memory, (__m128i*) ctx[0]->state); |
|||
keccakf((uint64_t*) ctx[0]->state, 24); |
|||
extra_hashes[ctx[0]->state[0] & 3](ctx[0]->state, 200, output); |
|||
} |
|||
|
|||
|
|||
void cryptonight_single_hash_asm_ryzen(const uint8_t *restrict input, size_t size, uint8_t *restrict output, struct cryptonight_ctx **restrict ctx) |
|||
{ |
|||
keccak(input, size, ctx[0]->state, 200); |
|||
cn_explode_scratchpad((__m128i*) ctx[0]->state, (__m128i*) ctx[0]->memory); |
|||
|
|||
cnv2_mainloop_ryzen_asm(ctx[0]); |
|||
|
|||
cn_implode_scratchpad((__m128i*) ctx[0]->memory, (__m128i*) ctx[0]->state); |
|||
keccakf((uint64_t*) ctx[0]->state, 24); |
|||
extra_hashes[ctx[0]->state[0] & 3](ctx[0]->state, 200, output); |
|||
} |
|||
|
|||
|
|||
void cryptonight_single_hash_asm_bulldozer(const uint8_t *restrict input, size_t size, uint8_t *restrict output, struct cryptonight_ctx **restrict ctx) |
|||
{ |
|||
keccak(input, size, ctx[0]->state, 200); |
|||
cn_explode_scratchpad((__m128i*) ctx[0]->state, (__m128i*) ctx[0]->memory); |
|||
|
|||
cnv2_mainloop_bulldozer_asm(ctx[0]); |
|||
|
|||
cn_implode_scratchpad((__m128i*) ctx[0]->memory, (__m128i*) ctx[0]->state); |
|||
keccakf((uint64_t*) ctx[0]->state, 24); |
|||
extra_hashes[ctx[0]->state[0] & 3](ctx[0]->state, 200, output); |
|||
} |
|||
|
|||
|
|||
void cryptonight_double_hash_asm(const uint8_t *restrict input, size_t size, uint8_t *restrict output, struct cryptonight_ctx **restrict ctx) |
|||
{ |
|||
keccak(input, size, ctx[0]->state, 200); |
|||
keccak(input + size, size, ctx[1]->state, 200); |
|||
|
|||
cn_explode_scratchpad((__m128i*) ctx[0]->state, (__m128i*) ctx[0]->memory); |
|||
cn_explode_scratchpad((__m128i*) ctx[1]->state, (__m128i*) ctx[1]->memory); |
|||
|
|||
cnv2_double_mainloop_sandybridge_asm(ctx[0], ctx[1]); |
|||
|
|||
cn_implode_scratchpad((__m128i*) ctx[0]->memory, (__m128i*) ctx[0]->state); |
|||
cn_implode_scratchpad((__m128i*) ctx[1]->memory, (__m128i*) ctx[1]->state); |
|||
|
|||
keccakf((uint64_t*) ctx[0]->state, 24); |
|||
keccakf((uint64_t*) ctx[1]->state, 24); |
|||
|
|||
extra_hashes[ctx[0]->state[0] & 3](ctx[0]->state, 200, output); |
|||
extra_hashes[ctx[1]->state[0] & 3](ctx[1]->state, 200, output + 32); |
|||
} |
|||
#endif |
@ -1,77 +0,0 @@ |
|||
/* XMRig
|
|||
* Copyright 2010 Jeff Garzik <jgarzik@pobox.com> |
|||
* Copyright 2012-2014 pooler <pooler@litecoinpool.org> |
|||
* Copyright 2014 Lucas Jones <https://github.com/lucasjones>
|
|||
* Copyright 2014-2016 Wolf9466 <https://github.com/OhGodAPet>
|
|||
* Copyright 2016 Jay D Dee <jayddee246@gmail.com> |
|||
* Copyright 2017 fireice-uk <https://github.com/fireice-uk>
|
|||
* Copyright 2016-2017 XMRig <support@xmrig.com> |
|||
* |
|||
* |
|||
* This program is free software: you can redistribute it and/or modify |
|||
* it under the terms of the GNU General Public License as published by |
|||
* the Free Software Foundation, either version 3 of the License, or |
|||
* (at your option) any later version. |
|||
* |
|||
* This program is distributed in the hope that it will be useful, |
|||
* but WITHOUT ANY WARRANTY; without even the implied warranty of |
|||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
|||
* GNU General Public License for more details. |
|||
* |
|||
* You should have received a copy of the GNU General Public License |
|||
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|||
*/ |
|||
|
|||
#include <x86intrin.h> |
|||
#include <string.h> |
|||
|
|||
#include "cryptonight.h" |
|||
#include "cryptonight_aesni.h" |
|||
#include "crypto/c_keccak.h" |
|||
|
|||
|
|||
void cryptonight_av1_aesni(const void *restrict input, size_t size, void *restrict output, struct cryptonight_ctx *restrict ctx) |
|||
{ |
|||
keccak((const uint8_t *) input, size, ctx->state0, 200); |
|||
|
|||
cn_explode_scratchpad((__m128i*) ctx->state0, (__m128i*) ctx->memory); |
|||
|
|||
const uint8_t* l0 = ctx->memory; |
|||
uint64_t* h0 = (uint64_t*) ctx->state0; |
|||
|
|||
uint64_t al0 = h0[0] ^ h0[4]; |
|||
uint64_t ah0 = h0[1] ^ h0[5]; |
|||
__m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]); |
|||
|
|||
uint64_t idx0 = h0[0] ^ h0[4]; |
|||
|
|||
for (size_t i = 0; __builtin_expect(i < 0x80000, 1); i++) { |
|||
__m128i cx; |
|||
cx = _mm_load_si128((__m128i *) &l0[idx0 & 0x1FFFF0]); |
|||
cx = _mm_aesenc_si128(cx, _mm_set_epi64x(ah0, al0)); |
|||
|
|||
_mm_store_si128((__m128i *) &l0[idx0 & 0x1FFFF0], _mm_xor_si128(bx0, cx)); |
|||
idx0 = EXTRACT64(cx); |
|||
bx0 = cx; |
|||
|
|||
uint64_t hi, lo, cl, ch; |
|||
cl = ((uint64_t*) &l0[idx0 & 0x1FFFF0])[0]; |
|||
ch = ((uint64_t*) &l0[idx0 & 0x1FFFF0])[1]; |
|||
lo = _umul128(idx0, cl, &hi); |
|||
|
|||
al0 += hi; |
|||
ah0 += lo; |
|||
|
|||
((uint64_t*)&l0[idx0 & 0x1FFFF0])[0] = al0; |
|||
((uint64_t*)&l0[idx0 & 0x1FFFF0])[1] = ah0; |
|||
|
|||
ah0 ^= ch; |
|||
al0 ^= cl; |
|||
idx0 = al0; |
|||
} |
|||
|
|||
cn_implode_scratchpad((__m128i*) ctx->memory, (__m128i*) ctx->state0); |
|||
|
|||
keccakf(h0, 24); |
|||
extra_hashes[ctx->state0[0] & 3](ctx->state0, 200, output); |
|||
} |
@ -0,0 +1,304 @@ |
|||
/* XMRig
|
|||
* Copyright 2010 Jeff Garzik <jgarzik@pobox.com> |
|||
* Copyright 2012-2014 pooler <pooler@litecoinpool.org> |
|||
* Copyright 2014 Lucas Jones <https://github.com/lucasjones>
|
|||
* Copyright 2014-2016 Wolf9466 <https://github.com/OhGodAPet>
|
|||
* Copyright 2016 Jay D Dee <jayddee246@gmail.com> |
|||
* Copyright 2017 fireice-uk <https://github.com/fireice-uk>
|
|||
* Copyright 2017-2018 XMR-Stak <https://github.com/fireice-uk>, <https://github.com/psychocrypt>
|
|||
* Copyright 2018 Lee Clagett <https://github.com/vtnerd>
|
|||
* Copyright 2016-2018 XMRig <https://github.com/xmrig>, <support@xmrig.com>
|
|||
* |
|||
* This program is free software: you can redistribute it and/or modify |
|||
* it under the terms of the GNU General Public License as published by |
|||
* the Free Software Foundation, either version 3 of the License, or |
|||
* (at your option) any later version. |
|||
* |
|||
* This program is distributed in the hope that it will be useful, |
|||
* but WITHOUT ANY WARRANTY; without even the implied warranty of |
|||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
|||
* GNU General Public License for more details. |
|||
* |
|||
* You should have received a copy of the GNU General Public License |
|||
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|||
*/ |
|||
|
|||
#include <x86intrin.h> |
|||
#include <string.h> |
|||
|
|||
#include "crypto/c_keccak.h" |
|||
#include "cryptonight.h" |
|||
#include "cryptonight_aesni.h" |
|||
#include "cryptonight_monero.h" |
|||
|
|||
|
|||
void cryptonight_av2_v0(const uint8_t *restrict input, size_t size, uint8_t *restrict output, struct cryptonight_ctx **restrict ctx) |
|||
{ |
|||
keccak(input, size, ctx[0]->state, 200); |
|||
keccak(input + size, size, ctx[1]->state, 200); |
|||
|
|||
const uint8_t* l0 = ctx[0]->memory; |
|||
const uint8_t* l1 = ctx[1]->memory; |
|||
uint64_t* h0 = (uint64_t*) ctx[0]->state; |
|||
uint64_t* h1 = (uint64_t*) ctx[1]->state; |
|||
|
|||
cn_explode_scratchpad((__m128i*) h0, (__m128i*) l0); |
|||
cn_explode_scratchpad((__m128i*) h1, (__m128i*) l1); |
|||
|
|||
uint64_t al0 = h0[0] ^ h0[4]; |
|||
uint64_t al1 = h1[0] ^ h1[4]; |
|||
uint64_t ah0 = h0[1] ^ h0[5]; |
|||
uint64_t ah1 = h1[1] ^ h1[5]; |
|||
|
|||
__m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]); |
|||
__m128i bx1 = _mm_set_epi64x(h1[3] ^ h1[7], h1[2] ^ h1[6]); |
|||
|
|||
uint64_t idx0 = h0[0] ^ h0[4]; |
|||
uint64_t idx1 = h1[0] ^ h1[4]; |
|||
|
|||
for (size_t i = 0; __builtin_expect(i < 0x80000, 1); i++) { |
|||
__m128i cx0 = _mm_load_si128((__m128i *) &l0[idx0 & 0x1FFFF0]); |
|||
__m128i cx1 = _mm_load_si128((__m128i *) &l1[idx1 & 0x1FFFF0]); |
|||
|
|||
cx0 = _mm_aesenc_si128(cx0, _mm_set_epi64x(ah0, al0)); |
|||
cx1 = _mm_aesenc_si128(cx1, _mm_set_epi64x(ah1, al1)); |
|||
|
|||
_mm_store_si128((__m128i *) &l0[idx0 & 0x1FFFF0], _mm_xor_si128(bx0, cx0)); |
|||
_mm_store_si128((__m128i *) &l1[idx1 & 0x1FFFF0], _mm_xor_si128(bx1, cx1)); |
|||
|
|||
idx0 = EXTRACT64(cx0); |
|||
idx1 = EXTRACT64(cx1); |
|||
|
|||
bx0 = cx0; |
|||
bx1 = cx1; |
|||
|
|||
uint64_t hi, lo, cl, ch; |
|||
cl = ((uint64_t*) &l0[idx0 & 0x1FFFF0])[0]; |
|||
ch = ((uint64_t*) &l0[idx0 & 0x1FFFF0])[1]; |
|||
lo = _umul128(idx0, cl, &hi); |
|||
|
|||
al0 += hi; |
|||
ah0 += lo; |
|||
|
|||
((uint64_t*) &l0[idx0 & 0x1FFFF0])[0] = al0; |
|||
((uint64_t*) &l0[idx0 & 0x1FFFF0])[1] = ah0; |
|||
|
|||
ah0 ^= ch; |
|||
al0 ^= cl; |
|||
idx0 = al0; |
|||
|
|||
cl = ((uint64_t*) &l1[idx1 & 0x1FFFF0])[0]; |
|||
ch = ((uint64_t*) &l1[idx1 & 0x1FFFF0])[1]; |
|||
lo = _umul128(idx1, cl, &hi); |
|||
|
|||
al1 += hi; |
|||
ah1 += lo; |
|||
|
|||
((uint64_t*) &l1[idx1 & 0x1FFFF0])[0] = al1; |
|||
((uint64_t*) &l1[idx1 & 0x1FFFF0])[1] = ah1; |
|||
|
|||
ah1 ^= ch; |
|||
al1 ^= cl; |
|||
idx1 = al1; |
|||
} |
|||
|
|||
cn_implode_scratchpad((__m128i*) l0, (__m128i*) h0); |
|||
cn_implode_scratchpad((__m128i*) l1, (__m128i*) h1); |
|||
|
|||
keccakf(h0, 24); |
|||
keccakf(h1, 24); |
|||
|
|||
extra_hashes[ctx[0]->state[0] & 3](ctx[0]->state, 200, output); |
|||
extra_hashes[ctx[1]->state[0] & 3](ctx[1]->state, 200, output + 32); |
|||
} |
|||
|
|||
|
|||
void cryptonight_av2_v1(const uint8_t *restrict input, size_t size, uint8_t *restrict output, struct cryptonight_ctx **restrict ctx) |
|||
{ |
|||
if (size < 43) { |
|||
memset(output, 0, 64); |
|||
return; |
|||
} |
|||
|
|||
keccak(input, size, ctx[0]->state, 200); |
|||
keccak(input + size, size, ctx[1]->state, 200); |
|||
|
|||
VARIANT1_INIT(0); |
|||
VARIANT1_INIT(1); |
|||
|
|||
const uint8_t* l0 = ctx[0]->memory; |
|||
const uint8_t* l1 = ctx[1]->memory; |
|||
uint64_t* h0 = (uint64_t*) ctx[0]->state; |
|||
uint64_t* h1 = (uint64_t*) ctx[1]->state; |
|||
|
|||
cn_explode_scratchpad((__m128i*) h0, (__m128i*) l0); |
|||
cn_explode_scratchpad((__m128i*) h1, (__m128i*) l1); |
|||
|
|||
uint64_t al0 = h0[0] ^ h0[4]; |
|||
uint64_t al1 = h1[0] ^ h1[4]; |
|||
uint64_t ah0 = h0[1] ^ h0[5]; |
|||
uint64_t ah1 = h1[1] ^ h1[5]; |
|||
|
|||
__m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]); |
|||
__m128i bx1 = _mm_set_epi64x(h1[3] ^ h1[7], h1[2] ^ h1[6]); |
|||
|
|||
uint64_t idx0 = h0[0] ^ h0[4]; |
|||
uint64_t idx1 = h1[0] ^ h1[4]; |
|||
|
|||
for (size_t i = 0; __builtin_expect(i < 0x80000, 1); i++) { |
|||
__m128i cx0 = _mm_load_si128((__m128i *) &l0[idx0 & 0x1FFFF0]); |
|||
__m128i cx1 = _mm_load_si128((__m128i *) &l1[idx1 & 0x1FFFF0]); |
|||
|
|||
cx0 = _mm_aesenc_si128(cx0, _mm_set_epi64x(ah0, al0)); |
|||
cx1 = _mm_aesenc_si128(cx1, _mm_set_epi64x(ah1, al1)); |
|||
|
|||
cryptonight_monero_tweak((uint64_t*)&l0[idx0 & 0x1FFFF0], _mm_xor_si128(bx0, cx0)); |
|||
cryptonight_monero_tweak((uint64_t*)&l1[idx1 & 0x1FFFF0], _mm_xor_si128(bx1, cx1)); |
|||
|
|||
idx0 = EXTRACT64(cx0); |
|||
idx1 = EXTRACT64(cx1); |
|||
|
|||
bx0 = cx0; |
|||
bx1 = cx1; |
|||
|
|||
uint64_t hi, lo, cl, ch; |
|||
cl = ((uint64_t*) &l0[idx0 & 0x1FFFF0])[0]; |
|||
ch = ((uint64_t*) &l0[idx0 & 0x1FFFF0])[1]; |
|||
lo = _umul128(idx0, cl, &hi); |
|||
|
|||
al0 += hi; |
|||
ah0 += lo; |
|||
|
|||
((uint64_t*) &l0[idx0 & 0x1FFFF0])[0] = al0; |
|||
((uint64_t*) &l0[idx0 & 0x1FFFF0])[1] = ah0 ^ tweak1_2_0; |
|||
|
|||
ah0 ^= ch; |
|||
al0 ^= cl; |
|||
idx0 = al0; |
|||
|
|||
cl = ((uint64_t*) &l1[idx1 & 0x1FFFF0])[0]; |
|||
ch = ((uint64_t*) &l1[idx1 & 0x1FFFF0])[1]; |
|||
lo = _umul128(idx1, cl, &hi); |
|||
|
|||
al1 += hi; |
|||
ah1 += lo; |
|||
|
|||
((uint64_t*) &l1[idx1 & 0x1FFFF0])[0] = al1; |
|||
((uint64_t*) &l1[idx1 & 0x1FFFF0])[1] = ah1 ^ tweak1_2_1; |
|||
|
|||
ah1 ^= ch; |
|||
al1 ^= cl; |
|||
idx1 = al1; |
|||
} |
|||
|
|||
cn_implode_scratchpad((__m128i*) l0, (__m128i*) h0); |
|||
cn_implode_scratchpad((__m128i*) l1, (__m128i*) h1); |
|||
|
|||
keccakf(h0, 24); |
|||
keccakf(h1, 24); |
|||
|
|||
extra_hashes[ctx[0]->state[0] & 3](ctx[0]->state, 200, output); |
|||
extra_hashes[ctx[1]->state[0] & 3](ctx[1]->state, 200, output + 32); |
|||
} |
|||
|
|||
|
|||
/*
 * Two-lane CryptoNight hash using the AES-NI round instruction, variant 2.
 *
 * Reads two messages of `size` bytes each from `input` (second one at
 * input + size). Lane 0 uses ctx[0], lane 1 uses ctx[1]. Results go to
 * `output` and output + 32.
 *
 * NOTE(review): the VARIANT2_* macros are defined outside this function and
 * are invoked after the locals exist, so they may refer to locals by name
 * (h0, h1, ...). Confirm against their definitions before renaming or
 * reordering anything here.
 */
void cryptonight_av2_v2(const uint8_t *restrict input, size_t size, uint8_t *restrict output, struct cryptonight_ctx **restrict ctx) |
|||
{ |
|||
keccak(input, size, ctx[0]->state, 200); |
|||
keccak(input + size, size, ctx[1]->state, 200); |
|||
|
|||
const uint8_t* l0 = ctx[0]->memory; |
|||
const uint8_t* l1 = ctx[1]->memory; |
|||
uint64_t* h0 = (uint64_t*) ctx[0]->state; |
|||
uint64_t* h1 = (uint64_t*) ctx[1]->state; |
|||
|
|||
VARIANT2_INIT(0); |
|||
VARIANT2_INIT(1); |
|||
VARIANT2_SET_ROUNDING_MODE(); |
|||
|
|||
cn_explode_scratchpad((__m128i*) h0, (__m128i*) l0); |
|||
cn_explode_scratchpad((__m128i*) h1, (__m128i*) l1); |
|||
|
|||
/* Per-lane running values seeded by XOR-ing pairs of 64-bit state words. */
uint64_t al0 = h0[0] ^ h0[4]; |
|||
uint64_t al1 = h1[0] ^ h1[4]; |
|||
uint64_t ah0 = h0[1] ^ h0[5]; |
|||
uint64_t ah1 = h1[1] ^ h1[5]; |
|||
|
|||
/* bxN0 is the most recent history block of lane N, bxN1 the one before it (see the end of the loop). */
__m128i bx00 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]); |
|||
__m128i bx01 = _mm_set_epi64x(h0[9] ^ h0[11], h0[8] ^ h0[10]); |
|||
__m128i bx10 = _mm_set_epi64x(h1[3] ^ h1[7], h1[2] ^ h1[6]); |
|||
__m128i bx11 = _mm_set_epi64x(h1[9] ^ h1[11], h1[8] ^ h1[10]); |
|||
|
|||
uint64_t idx0 = al0; |
|||
uint64_t idx1 = al1; |
|||
|
|||
/* 0x80000 rounds; the 0x1FFFF0 mask keeps every scratchpad offset 16-byte aligned and below 0x200000. */
for (size_t i = 0; __builtin_expect(i < 0x80000, 1); i++) { |
|||
__m128i cx0 = _mm_load_si128((__m128i *) &l0[idx0 & 0x1FFFF0]); |
|||
__m128i cx1 = _mm_load_si128((__m128i *) &l1[idx1 & 0x1FFFF0]); |
|||
|
|||
const __m128i ax0 = _mm_set_epi64x(ah0, al0); |
|||
const __m128i ax1 = _mm_set_epi64x(ah1, al1); |
|||
|
|||
cx0 = _mm_aesenc_si128(cx0, ax0); |
|||
cx1 = _mm_aesenc_si128(cx1, ax1); |
|||
|
|||
/* For each lane the shuffle macro runs before the slot is overwritten with bxN0 ^ cxN. */
VARIANT2_SHUFFLE(l0, idx0 & 0x1FFFF0, ax0, bx00, bx01); |
|||
_mm_store_si128((__m128i *) &l0[idx0 & 0x1FFFF0], _mm_xor_si128(bx00, cx0)); |
|||
|
|||
VARIANT2_SHUFFLE(l1, idx1 & 0x1FFFF0, ax1, bx10, bx11); |
|||
_mm_store_si128((__m128i *) &l1[idx1 & 0x1FFFF0], _mm_xor_si128(bx10, cx1)); |
|||
|
|||
/* Next offsets are the low 64 bits of each AES result. */
idx0 = _mm_cvtsi128_si64(cx0); |
|||
idx1 = _mm_cvtsi128_si64(cx1); |
|||
|
|||
/* hi, lo, cl and ch are scratch values reused by lane 0 and then lane 1. */
uint64_t hi, lo, cl, ch; |
|||
cl = ((uint64_t*) &l0[idx0 & 0x1FFFF0])[0]; |
|||
ch = ((uint64_t*) &l0[idx0 & 0x1FFFF0])[1]; |
|||
|
|||
/* cl is passed to the macro by name and may be modified by it before the multiply - TODO confirm against the macro definition. */
VARIANT2_INTEGER_MATH(0, cl, cx0); |
|||
/* presumably lo/hi are the low/high halves of the 128-bit product idx0 * cl; _umul128 is defined elsewhere. */
lo = _umul128(idx0, cl, &hi); |
|||
VARIANT2_SHUFFLE2(l0, idx0 & 0x1FFFF0, ax0, bx00, bx01, hi, lo); |
|||
|
|||
al0 += hi; |
|||
ah0 += lo; |
|||
|
|||
/* Write the updated values to the slot, then fold the slot's previous contents into them. */
((uint64_t*)&l0[idx0 & 0x1FFFF0])[0] = al0; |
|||
((uint64_t*)&l0[idx0 & 0x1FFFF0])[1] = ah0; |
|||
|
|||
al0 ^= cl; |
|||
ah0 ^= ch; |
|||
idx0 = al0; |
|||
|
|||
/* Lane 1: same sequence on the second scratchpad. */
cl = ((uint64_t*) &l1[idx1 & 0x1FFFF0])[0]; |
|||
ch = ((uint64_t*) &l1[idx1 & 0x1FFFF0])[1]; |
|||
|
|||
VARIANT2_INTEGER_MATH(1, cl, cx1); |
|||
lo = _umul128(idx1, cl, &hi); |
|||
VARIANT2_SHUFFLE2(l1, idx1 & 0x1FFFF0, ax1, bx10, bx11, hi, lo); |
|||
|
|||
al1 += hi; |
|||
ah1 += lo; |
|||
|
|||
((uint64_t*)&l1[idx1 & 0x1FFFF0])[0] = al1; |
|||
((uint64_t*)&l1[idx1 & 0x1FFFF0])[1] = ah1; |
|||
|
|||
al1 ^= cl; |
|||
ah1 ^= ch; |
|||
idx1 = al1; |
|||
|
|||
/* Shift the history of both lanes: bxN1 takes the previous bxN0, bxN0 takes this round's AES result. */
bx01 = bx00; |
|||
bx11 = bx10; |
|||
|
|||
bx00 = cx0; |
|||
bx10 = cx1; |
|||
} |
|||
|
|||
cn_implode_scratchpad((__m128i*) l0, (__m128i*) h0); |
|||
cn_implode_scratchpad((__m128i*) l1, (__m128i*) h1); |
|||
|
|||
keccakf(h0, 24); |
|||
keccakf(h1, 24); |
|||
|
|||
/* The low two bits of each first state byte pick the final hash function. */
extra_hashes[ctx[0]->state[0] & 3](ctx[0]->state, 200, output); |
|||
extra_hashes[ctx[1]->state[0] & 3](ctx[1]->state, 200, output + 32); |
|||
} |
@ -1,111 +0,0 @@ |
|||
/* XMRig
|
|||
* Copyright 2010 Jeff Garzik <jgarzik@pobox.com> |
|||
* Copyright 2012-2014 pooler <pooler@litecoinpool.org> |
|||
* Copyright 2014 Lucas Jones <https://github.com/lucasjones>
|
|||
* Copyright 2014-2016 Wolf9466 <https://github.com/OhGodAPet>
|
|||
* Copyright 2016 Jay D Dee <jayddee246@gmail.com> |
|||
* Copyright 2017 fireice-uk <https://github.com/fireice-uk>
|
|||
* Copyright 2016-2017 XMRig <support@xmrig.com> |
|||
* |
|||
* |
|||
* This program is free software: you can redistribute it and/or modify |
|||
* it under the terms of the GNU General Public License as published by |
|||
* the Free Software Foundation, either version 3 of the License, or |
|||
* (at your option) any later version. |
|||
* |
|||
* This program is distributed in the hope that it will be useful, |
|||
* but WITHOUT ANY WARRANTY; without even the implied warranty of |
|||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
|||
* GNU General Public License for more details. |
|||
* |
|||
* You should have received a copy of the GNU General Public License |
|||
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|||
*/ |
|||
|
|||
#include <x86intrin.h> |
|||
#include <string.h> |
|||
|
|||
#include "cryptonight.h" |
|||
#include "cryptonight_aesni.h" |
|||
#include "crypto/c_keccak.h" |
|||
|
|||
|
|||
void cryptonight_av2_aesni_double(const void *restrict input, size_t size, void *restrict output, struct cryptonight_ctx *restrict ctx) |
|||
{ |
|||
keccak((const uint8_t *) input, size, ctx->state0, 200); |
|||
keccak((const uint8_t *) input + size, size, ctx->state1, 200); |
|||
|
|||
const uint8_t* l0 = ctx->memory; |
|||
const uint8_t* l1 = ctx->memory + MEMORY; |
|||
uint64_t* h0 = (uint64_t*) ctx->state0; |
|||
uint64_t* h1 = (uint64_t*) ctx->state1; |
|||
|
|||
cn_explode_scratchpad((__m128i*) h0, (__m128i*) l0); |
|||
cn_explode_scratchpad((__m128i*) h1, (__m128i*) l1); |
|||
|
|||
uint64_t al0 = h0[0] ^ h0[4]; |
|||
uint64_t al1 = h1[0] ^ h1[4]; |
|||
uint64_t ah0 = h0[1] ^ h0[5]; |
|||
uint64_t ah1 = h1[1] ^ h1[5]; |
|||
|
|||
__m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]); |
|||
__m128i bx1 = _mm_set_epi64x(h1[3] ^ h1[7], h1[2] ^ h1[6]); |
|||
|
|||
uint64_t idx0 = h0[0] ^ h0[4]; |
|||
uint64_t idx1 = h1[0] ^ h1[4]; |
|||
|
|||
for (size_t i = 0; __builtin_expect(i < 0x80000, 1); i++) { |
|||
__m128i cx0 = _mm_load_si128((__m128i *) &l0[idx0 & 0x1FFFF0]); |
|||
__m128i cx1 = _mm_load_si128((__m128i *) &l1[idx1 & 0x1FFFF0]); |
|||
|
|||
cx0 = _mm_aesenc_si128(cx0, _mm_set_epi64x(ah0, al0)); |
|||
cx1 = _mm_aesenc_si128(cx1, _mm_set_epi64x(ah1, al1)); |
|||
|
|||
_mm_store_si128((__m128i *) &l0[idx0 & 0x1FFFF0], _mm_xor_si128(bx0, cx0)); |
|||
_mm_store_si128((__m128i *) &l1[idx1 & 0x1FFFF0], _mm_xor_si128(bx1, cx1)); |
|||
|
|||
idx0 = EXTRACT64(cx0); |
|||
idx1 = EXTRACT64(cx1); |
|||
|
|||
bx0 = cx0; |
|||
bx1 = cx1; |
|||
|
|||
uint64_t hi, lo, cl, ch; |
|||
cl = ((uint64_t*) &l0[idx0 & 0x1FFFF0])[0]; |
|||
ch = ((uint64_t*) &l0[idx0 & 0x1FFFF0])[1]; |
|||
lo = _umul128(idx0, cl, &hi); |
|||
|
|||
al0 += hi; |
|||
ah0 += lo; |
|||
|
|||
((uint64_t*) &l0[idx0 & 0x1FFFF0])[0] = al0; |
|||
((uint64_t*) &l0[idx0 & 0x1FFFF0])[1] = ah0; |
|||
|
|||
ah0 ^= ch; |
|||
al0 ^= cl; |
|||
idx0 = al0; |
|||
|
|||
cl = ((uint64_t*) &l1[idx1 & 0x1FFFF0])[0]; |
|||
ch = ((uint64_t*) &l1[idx1 & 0x1FFFF0])[1]; |
|||
lo = _umul128(idx1, cl, &hi); |
|||
|
|||
al1 += hi; |
|||
ah1 += lo; |
|||
|
|||
((uint64_t*) &l1[idx1 & 0x1FFFF0])[0] = al1; |
|||
((uint64_t*) &l1[idx1 & 0x1FFFF0])[1] = ah1; |
|||
|
|||
ah1 ^= ch; |
|||
al1 ^= cl; |
|||
idx1 = al1; |
|||
} |
|||
|
|||
cn_implode_scratchpad((__m128i*) l0, (__m128i*) h0); |
|||
cn_implode_scratchpad((__m128i*) l1, (__m128i*) h1); |
|||
|
|||
keccakf(h0, 24); |
|||
keccakf(h1, 24); |
|||
|
|||
extra_hashes[ctx->state0[0] & 3](ctx->state0, 200, output); |
|||
extra_hashes[ctx->state1[0] & 3](ctx->state1, 200, (char*) output + 32); |
|||
} |
@ -0,0 +1,193 @@ |
|||
/* XMRig
|
|||
* Copyright 2010 Jeff Garzik <jgarzik@pobox.com> |
|||
* Copyright 2012-2014 pooler <pooler@litecoinpool.org> |
|||
* Copyright 2014 Lucas Jones <https://github.com/lucasjones>
|
|||
* Copyright 2014-2016 Wolf9466 <https://github.com/OhGodAPet>
|
|||
* Copyright 2016 Jay D Dee <jayddee246@gmail.com> |
|||
* Copyright 2017 fireice-uk <https://github.com/fireice-uk>
|
|||
* Copyright 2017-2018 XMR-Stak <https://github.com/fireice-uk>, <https://github.com/psychocrypt>
|
|||
* Copyright 2018 Lee Clagett <https://github.com/vtnerd>
|
|||
* Copyright 2016-2018 XMRig <https://github.com/xmrig>, <support@xmrig.com>
|
|||
* |
|||
* This program is free software: you can redistribute it and/or modify |
|||
* it under the terms of the GNU General Public License as published by |
|||
* the Free Software Foundation, either version 3 of the License, or |
|||
* (at your option) any later version. |
|||
* |
|||
* This program is distributed in the hope that it will be useful, |
|||
* but WITHOUT ANY WARRANTY; without even the implied warranty of |
|||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
|||
* GNU General Public License for more details. |
|||
* |
|||
* You should have received a copy of the GNU General Public License |
|||
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|||
*/ |
|||
|
|||
#include <x86intrin.h> |
|||
#include <string.h> |
|||
|
|||
#include "crypto/c_keccak.h" |
|||
#include "cryptonight.h" |
|||
#include "cryptonight_monero.h" |
|||
#include "cryptonight_softaes.h" |
|||
|
|||
|
|||
void cryptonight_av3_v0(const uint8_t *restrict input, size_t size, uint8_t *restrict output, struct cryptonight_ctx **restrict ctx) |
|||
{ |
|||
keccak(input, size, ctx[0]->state, 200); |
|||
|
|||
cn_explode_scratchpad((__m128i*) ctx[0]->state, (__m128i*) ctx[0]->memory); |
|||
|
|||
const uint8_t* l0 = ctx[0]->memory; |
|||
uint64_t* h0 = (uint64_t*) ctx[0]->state; |
|||
|
|||
uint64_t al0 = h0[0] ^ h0[4]; |
|||
uint64_t ah0 = h0[1] ^ h0[5]; |
|||
__m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]); |
|||
|
|||
uint64_t idx0 = h0[0] ^ h0[4]; |
|||
|
|||
for (size_t i = 0; __builtin_expect(i < 0x80000, 1); i++) { |
|||
__m128i cx; |
|||
cx = _mm_load_si128((__m128i *) &l0[idx0 & 0x1FFFF0]); |
|||
cx = soft_aesenc(cx, _mm_set_epi64x(ah0, al0)); |
|||
|
|||
_mm_store_si128((__m128i *) &l0[idx0 & 0x1FFFF0], _mm_xor_si128(bx0, cx)); |
|||
idx0 = EXTRACT64(cx); |
|||
bx0 = cx; |
|||
|
|||
uint64_t hi, lo, cl, ch; |
|||
cl = ((uint64_t*) &l0[idx0 & 0x1FFFF0])[0]; |
|||
ch = ((uint64_t*) &l0[idx0 & 0x1FFFF0])[1]; |
|||
lo = _umul128(idx0, cl, &hi); |
|||
|
|||
al0 += hi; |
|||
ah0 += lo; |
|||
|
|||
((uint64_t*)&l0[idx0 & 0x1FFFF0])[0] = al0; |
|||
((uint64_t*)&l0[idx0 & 0x1FFFF0])[1] = ah0; |
|||
|
|||
ah0 ^= ch; |
|||
al0 ^= cl; |
|||
idx0 = al0; |
|||
} |
|||
|
|||
cn_implode_scratchpad((__m128i*) ctx[0]->memory, (__m128i*) ctx[0]->state); |
|||
|
|||
keccakf(h0, 24); |
|||
extra_hashes[ctx[0]->state[0] & 3](ctx[0]->state, 200, output); |
|||
} |
|||
|
|||
|
|||
void cryptonight_av3_v1(const uint8_t *restrict input, size_t size, uint8_t *restrict output, struct cryptonight_ctx **restrict ctx) |
|||
{ |
|||
if (size < 43) { |
|||
memset(output, 0, 32); |
|||
return; |
|||
} |
|||
|
|||
keccak(input, size, ctx[0]->state, 200); |
|||
|
|||
VARIANT1_INIT(0); |
|||
|
|||
cn_explode_scratchpad((__m128i*) ctx[0]->state, (__m128i*) ctx[0]->memory); |
|||
|
|||
const uint8_t* l0 = ctx[0]->memory; |
|||
uint64_t* h0 = (uint64_t*) ctx[0]->state; |
|||
|
|||
uint64_t al0 = h0[0] ^ h0[4]; |
|||
uint64_t ah0 = h0[1] ^ h0[5]; |
|||
__m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]); |
|||
|
|||
uint64_t idx0 = h0[0] ^ h0[4]; |
|||
|
|||
for (size_t i = 0; __builtin_expect(i < 0x80000, 1); i++) { |
|||
__m128i cx; |
|||
cx = _mm_load_si128((__m128i *) &l0[idx0 & 0x1FFFF0]); |
|||
cx = soft_aesenc(cx, _mm_set_epi64x(ah0, al0)); |
|||
|
|||
cryptonight_monero_tweak((uint64_t*)&l0[idx0 & 0x1FFFF0], _mm_xor_si128(bx0, cx)); |
|||
|
|||
idx0 = EXTRACT64(cx); |
|||
bx0 = cx; |
|||
|
|||
uint64_t hi, lo, cl, ch; |
|||
cl = ((uint64_t*) &l0[idx0 & 0x1FFFF0])[0]; |
|||
ch = ((uint64_t*) &l0[idx0 & 0x1FFFF0])[1]; |
|||
lo = _umul128(idx0, cl, &hi); |
|||
|
|||
al0 += hi; |
|||
ah0 += lo; |
|||
|
|||
((uint64_t*)&l0[idx0 & 0x1FFFF0])[0] = al0; |
|||
((uint64_t*)&l0[idx0 & 0x1FFFF0])[1] = ah0 ^ tweak1_2_0; |
|||
|
|||
ah0 ^= ch; |
|||
al0 ^= cl; |
|||
idx0 = al0; |
|||
} |
|||
|
|||
cn_implode_scratchpad((__m128i*) ctx[0]->memory, (__m128i*) ctx[0]->state); |
|||
|
|||
keccakf(h0, 24); |
|||
extra_hashes[ctx[0]->state[0] & 3](ctx[0]->state, 200, output); |
|||
} |
|||
|
|||
|
|||
void cryptonight_av3_v2(const uint8_t *restrict input, size_t size, uint8_t *restrict output, struct cryptonight_ctx **restrict ctx) |
|||
{ |
|||
keccak(input, size, ctx[0]->state, 200); |
|||
|
|||
cn_explode_scratchpad((__m128i*) ctx[0]->state, (__m128i*) ctx[0]->memory); |
|||
|
|||
const uint8_t* l0 = ctx[0]->memory; |
|||
uint64_t* h0 = (uint64_t*) ctx[0]->state; |
|||
|
|||
VARIANT2_INIT(0); |
|||
VARIANT2_SET_ROUNDING_MODE(); |
|||
|
|||
uint64_t al0 = h0[0] ^ h0[4]; |
|||
uint64_t ah0 = h0[1] ^ h0[5]; |
|||
__m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]); |
|||
__m128i bx1 = _mm_set_epi64x(h0[9] ^ h0[11], h0[8] ^ h0[10]); |
|||
|
|||
uint64_t idx0 = al0; |
|||
|
|||
for (size_t i = 0; __builtin_expect(i < 0x80000, 1); i++) { |
|||
__m128i cx = _mm_load_si128((__m128i *) &l0[idx0 & 0x1FFFF0]); |
|||
const __m128i ax0 = _mm_set_epi64x(ah0, al0); |
|||
|
|||
cx = soft_aesenc(cx, ax0); |
|||
|
|||
VARIANT2_SHUFFLE(l0, idx0 & 0x1FFFF0, ax0, bx0, bx1); |
|||
_mm_store_si128((__m128i *) &l0[idx0 & 0x1FFFF0], _mm_xor_si128(bx0, cx)); |
|||
|
|||
idx0 = _mm_cvtsi128_si64(cx); |
|||
|
|||
uint64_t hi, lo, cl, ch; |
|||
cl = ((uint64_t*) &l0[idx0 & 0x1FFFF0])[0]; |
|||
ch = ((uint64_t*) &l0[idx0 & 0x1FFFF0])[1]; |
|||
|
|||
VARIANT2_INTEGER_MATH(0, cl, cx); |
|||
lo = _umul128(idx0, cl, &hi); |
|||
VARIANT2_SHUFFLE2(l0, idx0 & 0x1FFFF0, ax0, bx0, bx1, hi, lo); |
|||
|
|||
al0 += hi; |
|||
ah0 += lo; |
|||
|
|||
((uint64_t*)&l0[idx0 & 0x1FFFF0])[0] = al0; |
|||
((uint64_t*)&l0[idx0 & 0x1FFFF0])[1] = ah0; |
|||
|
|||
al0 ^= cl; |
|||
ah0 ^= ch; |
|||
idx0 = al0; |
|||
|
|||
bx1 = bx0; |
|||
bx0 = cx; |
|||
} |
|||
|
|||
cn_implode_scratchpad((__m128i*) ctx[0]->memory, (__m128i*) ctx[0]->state); |
|||
|
|||
keccakf(h0, 24); |
|||
extra_hashes[ctx[0]->state[0] & 3](ctx[0]->state, 200, output); |
|||
} |
@ -1,77 +0,0 @@ |
|||
/* XMRig
|
|||
* Copyright 2010 Jeff Garzik <jgarzik@pobox.com> |
|||
* Copyright 2012-2014 pooler <pooler@litecoinpool.org> |
|||
* Copyright 2014 Lucas Jones <https://github.com/lucasjones>
|
|||
* Copyright 2014-2016 Wolf9466 <https://github.com/OhGodAPet>
|
|||
* Copyright 2016 Jay D Dee <jayddee246@gmail.com> |
|||
* Copyright 2017 fireice-uk <https://github.com/fireice-uk>
|
|||
* Copyright 2016-2017 XMRig <support@xmrig.com> |
|||
* |
|||
* |
|||
* This program is free software: you can redistribute it and/or modify |
|||
* it under the terms of the GNU General Public License as published by |
|||
* the Free Software Foundation, either version 3 of the License, or |
|||
* (at your option) any later version. |
|||
* |
|||
* This program is distributed in the hope that it will be useful, |
|||
* but WITHOUT ANY WARRANTY; without even the implied warranty of |
|||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
|||
* GNU General Public License for more details. |
|||
* |
|||
* You should have received a copy of the GNU General Public License |
|||
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|||
*/ |
|||
|
|||
#include <x86intrin.h> |
|||
#include <string.h> |
|||
|
|||
#include "cryptonight.h" |
|||
#include "cryptonight_softaes.h" |
|||
#include "crypto/c_keccak.h" |
|||
|
|||
|
|||
void cryptonight_av3_softaes(const void *restrict input, size_t size, void *restrict output, struct cryptonight_ctx *restrict ctx) |
|||
{ |
|||
keccak((const uint8_t *) input, size, ctx->state0, 200); |
|||
|
|||
cn_explode_scratchpad((__m128i*) ctx->state0, (__m128i*) ctx->memory); |
|||
|
|||
const uint8_t* l0 = ctx->memory; |
|||
uint64_t* h0 = (uint64_t*) ctx->state0; |
|||
|
|||
uint64_t al0 = h0[0] ^ h0[4]; |
|||
uint64_t ah0 = h0[1] ^ h0[5]; |
|||
__m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]); |
|||
|
|||
uint64_t idx0 = h0[0] ^ h0[4]; |
|||
|
|||
for (size_t i = 0; __builtin_expect(i < 0x80000, 1); i++) { |
|||
__m128i cx; |
|||
cx = _mm_load_si128((__m128i *)&l0[idx0 & 0x1FFFF0]); |
|||
cx = soft_aesenc(cx, _mm_set_epi64x(ah0, al0)); |
|||
|
|||
_mm_store_si128((__m128i *)&l0[idx0 & 0x1FFFF0], _mm_xor_si128(bx0, cx)); |
|||
idx0 = EXTRACT64(cx); |
|||
bx0 = cx; |
|||
|
|||
uint64_t hi, lo, cl, ch; |
|||
cl = ((uint64_t*)&l0[idx0 & 0x1FFFF0])[0]; |
|||
ch = ((uint64_t*)&l0[idx0 & 0x1FFFF0])[1]; |
|||
lo = _umul128(idx0, cl, &hi); |
|||
|
|||
al0 += hi; |
|||
ah0 += lo; |
|||
|
|||
((uint64_t*)&l0[idx0 & 0x1FFFF0])[0] = al0; |
|||
((uint64_t*)&l0[idx0 & 0x1FFFF0])[1] = ah0; |
|||
|
|||
ah0 ^= ch; |
|||
al0 ^= cl; |
|||
idx0 = al0; |
|||
} |
|||
|
|||
cn_implode_scratchpad((__m128i*) ctx->memory, (__m128i*) ctx->state0); |
|||
|
|||
keccakf(h0, 24); |
|||
extra_hashes[ctx->state0[0] & 3](ctx->state0, 200, output); |
|||
} |
@ -0,0 +1,304 @@ |
|||
/* XMRig
|
|||
* Copyright 2010 Jeff Garzik <jgarzik@pobox.com> |
|||
* Copyright 2012-2014 pooler <pooler@litecoinpool.org> |
|||
* Copyright 2014 Lucas Jones <https://github.com/lucasjones>
|
|||
* Copyright 2014-2016 Wolf9466 <https://github.com/OhGodAPet>
|
|||
* Copyright 2016 Jay D Dee <jayddee246@gmail.com> |
|||
* Copyright 2017 fireice-uk <https://github.com/fireice-uk>
|
|||
* Copyright 2017-2018 XMR-Stak <https://github.com/fireice-uk>, <https://github.com/psychocrypt>
|
|||
* Copyright 2018 Lee Clagett <https://github.com/vtnerd>
|
|||
* Copyright 2016-2018 XMRig <https://github.com/xmrig>, <support@xmrig.com>
|
|||
* |
|||
* This program is free software: you can redistribute it and/or modify |
|||
* it under the terms of the GNU General Public License as published by |
|||
* the Free Software Foundation, either version 3 of the License, or |
|||
* (at your option) any later version. |
|||
* |
|||
* This program is distributed in the hope that it will be useful, |
|||
* but WITHOUT ANY WARRANTY; without even the implied warranty of |
|||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
|||
* GNU General Public License for more details. |
|||
* |
|||
* You should have received a copy of the GNU General Public License |
|||
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|||
*/ |
|||
|
|||
#include <x86intrin.h> |
|||
#include <string.h> |
|||
|
|||
#include "crypto/c_keccak.h" |
|||
#include "cryptonight.h" |
|||
#include "cryptonight_monero.h" |
|||
#include "cryptonight_softaes.h" |
|||
|
|||
|
|||
void cryptonight_av4_v0(const uint8_t *restrict input, size_t size, uint8_t *restrict output, struct cryptonight_ctx **restrict ctx) |
|||
{ |
|||
keccak(input, size, ctx[0]->state, 200); |
|||
keccak(input + size, size, ctx[1]->state, 200); |
|||
|
|||
const uint8_t* l0 = ctx[0]->memory; |
|||
const uint8_t* l1 = ctx[1]->memory; |
|||
uint64_t* h0 = (uint64_t*) ctx[0]->state; |
|||
uint64_t* h1 = (uint64_t*) ctx[1]->state; |
|||
|
|||
cn_explode_scratchpad((__m128i*) h0, (__m128i*) l0); |
|||
cn_explode_scratchpad((__m128i*) h1, (__m128i*) l1); |
|||
|
|||
uint64_t al0 = h0[0] ^ h0[4]; |
|||
uint64_t al1 = h1[0] ^ h1[4]; |
|||
uint64_t ah0 = h0[1] ^ h0[5]; |
|||
uint64_t ah1 = h1[1] ^ h1[5]; |
|||
|
|||
__m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]); |
|||
__m128i bx1 = _mm_set_epi64x(h1[3] ^ h1[7], h1[2] ^ h1[6]); |
|||
|
|||
uint64_t idx0 = h0[0] ^ h0[4]; |
|||
uint64_t idx1 = h1[0] ^ h1[4]; |
|||
|
|||
for (size_t i = 0; __builtin_expect(i < 0x80000, 1); i++) { |
|||
__m128i cx0 = _mm_load_si128((__m128i *) &l0[idx0 & 0x1FFFF0]); |
|||
__m128i cx1 = _mm_load_si128((__m128i *) &l1[idx1 & 0x1FFFF0]); |
|||
|
|||
cx0 = soft_aesenc(cx0, _mm_set_epi64x(ah0, al0)); |
|||
cx1 = soft_aesenc(cx1, _mm_set_epi64x(ah1, al1)); |
|||
|
|||
_mm_store_si128((__m128i *) &l0[idx0 & 0x1FFFF0], _mm_xor_si128(bx0, cx0)); |
|||
_mm_store_si128((__m128i *) &l1[idx1 & 0x1FFFF0], _mm_xor_si128(bx1, cx1)); |
|||
|
|||
idx0 = EXTRACT64(cx0); |
|||
idx1 = EXTRACT64(cx1); |
|||
|
|||
bx0 = cx0; |
|||
bx1 = cx1; |
|||
|
|||
uint64_t hi, lo, cl, ch; |
|||
cl = ((uint64_t*) &l0[idx0 & 0x1FFFF0])[0]; |
|||
ch = ((uint64_t*) &l0[idx0 & 0x1FFFF0])[1]; |
|||
lo = _umul128(idx0, cl, &hi); |
|||
|
|||
al0 += hi; |
|||
ah0 += lo; |
|||
|
|||
((uint64_t*) &l0[idx0 & 0x1FFFF0])[0] = al0; |
|||
((uint64_t*) &l0[idx0 & 0x1FFFF0])[1] = ah0; |
|||
|
|||
ah0 ^= ch; |
|||
al0 ^= cl; |
|||
idx0 = al0; |
|||
|
|||
cl = ((uint64_t*) &l1[idx1 & 0x1FFFF0])[0]; |
|||
ch = ((uint64_t*) &l1[idx1 & 0x1FFFF0])[1]; |
|||
lo = _umul128(idx1, cl, &hi); |
|||
|
|||
al1 += hi; |
|||
ah1 += lo; |
|||
|
|||
((uint64_t*) &l1[idx1 & 0x1FFFF0])[0] = al1; |
|||
((uint64_t*) &l1[idx1 & 0x1FFFF0])[1] = ah1; |
|||
|
|||
ah1 ^= ch; |
|||
al1 ^= cl; |
|||
idx1 = al1; |
|||
} |
|||
|
|||
cn_implode_scratchpad((__m128i*) l0, (__m128i*) h0); |
|||
cn_implode_scratchpad((__m128i*) l1, (__m128i*) h1); |
|||
|
|||
keccakf(h0, 24); |
|||
keccakf(h1, 24); |
|||
|
|||
extra_hashes[ctx[0]->state[0] & 3](ctx[0]->state, 200, output); |
|||
extra_hashes[ctx[1]->state[0] & 3](ctx[1]->state, 200, output + 32); |
|||
} |
|||
|
|||
|
|||
void cryptonight_av4_v1(const uint8_t *restrict input, size_t size, uint8_t *restrict output, struct cryptonight_ctx **restrict ctx) |
|||
{ |
|||
if (size < 43) { |
|||
memset(output, 0, 64); |
|||
return; |
|||
} |
|||
|
|||
keccak(input, size, ctx[0]->state, 200); |
|||
keccak(input + size, size, ctx[1]->state, 200); |
|||
|
|||
VARIANT1_INIT(0); |
|||
VARIANT1_INIT(1); |
|||
|
|||
const uint8_t* l0 = ctx[0]->memory; |
|||
const uint8_t* l1 = ctx[1]->memory; |
|||
uint64_t* h0 = (uint64_t*) ctx[0]->state; |
|||
uint64_t* h1 = (uint64_t*) ctx[1]->state; |
|||
|
|||
cn_explode_scratchpad((__m128i*) h0, (__m128i*) l0); |
|||
cn_explode_scratchpad((__m128i*) h1, (__m128i*) l1); |
|||
|
|||
uint64_t al0 = h0[0] ^ h0[4]; |
|||
uint64_t al1 = h1[0] ^ h1[4]; |
|||
uint64_t ah0 = h0[1] ^ h0[5]; |
|||
uint64_t ah1 = h1[1] ^ h1[5]; |
|||
|
|||
__m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]); |
|||
__m128i bx1 = _mm_set_epi64x(h1[3] ^ h1[7], h1[2] ^ h1[6]); |
|||
|
|||
uint64_t idx0 = h0[0] ^ h0[4]; |
|||
uint64_t idx1 = h1[0] ^ h1[4]; |
|||
|
|||
for (size_t i = 0; __builtin_expect(i < 0x80000, 1); i++) { |
|||
__m128i cx0 = _mm_load_si128((__m128i *) &l0[idx0 & 0x1FFFF0]); |
|||
__m128i cx1 = _mm_load_si128((__m128i *) &l1[idx1 & 0x1FFFF0]); |
|||
|
|||
cx0 = soft_aesenc(cx0, _mm_set_epi64x(ah0, al0)); |
|||
cx1 = soft_aesenc(cx1, _mm_set_epi64x(ah1, al1)); |
|||
|
|||
cryptonight_monero_tweak((uint64_t*)&l0[idx0 & 0x1FFFF0], _mm_xor_si128(bx0, cx0)); |
|||
cryptonight_monero_tweak((uint64_t*)&l1[idx1 & 0x1FFFF0], _mm_xor_si128(bx1, cx1)); |
|||
|
|||
idx0 = EXTRACT64(cx0); |
|||
idx1 = EXTRACT64(cx1); |
|||
|
|||
bx0 = cx0; |
|||
bx1 = cx1; |
|||
|
|||
uint64_t hi, lo, cl, ch; |
|||
cl = ((uint64_t*) &l0[idx0 & 0x1FFFF0])[0]; |
|||
ch = ((uint64_t*) &l0[idx0 & 0x1FFFF0])[1]; |
|||
lo = _umul128(idx0, cl, &hi); |
|||
|
|||
al0 += hi; |
|||
ah0 += lo; |
|||
|
|||
((uint64_t*) &l0[idx0 & 0x1FFFF0])[0] = al0; |
|||
((uint64_t*) &l0[idx0 & 0x1FFFF0])[1] = ah0 ^ tweak1_2_0; |
|||
|
|||
ah0 ^= ch; |
|||
al0 ^= cl; |
|||
idx0 = al0; |
|||
|
|||
cl = ((uint64_t*) &l1[idx1 & 0x1FFFF0])[0]; |
|||
ch = ((uint64_t*) &l1[idx1 & 0x1FFFF0])[1]; |
|||
lo = _umul128(idx1, cl, &hi); |
|||
|
|||
al1 += hi; |
|||
ah1 += lo; |
|||
|
|||
((uint64_t*) &l1[idx1 & 0x1FFFF0])[0] = al1; |
|||
((uint64_t*) &l1[idx1 & 0x1FFFF0])[1] = ah1 ^ tweak1_2_1; |
|||
|
|||
ah1 ^= ch; |
|||
al1 ^= cl; |
|||
idx1 = al1; |
|||
} |
|||
|
|||
cn_implode_scratchpad((__m128i*) l0, (__m128i*) h0); |
|||
cn_implode_scratchpad((__m128i*) l1, (__m128i*) h1); |
|||
|
|||
keccakf(h0, 24); |
|||
keccakf(h1, 24); |
|||
|
|||
extra_hashes[ctx[0]->state[0] & 3](ctx[0]->state, 200, output); |
|||
extra_hashes[ctx[1]->state[0] & 3](ctx[1]->state, 200, output + 32); |
|||
} |
|||
|
|||
|
|||
void cryptonight_av4_v2(const uint8_t *restrict input, size_t size, uint8_t *restrict output, struct cryptonight_ctx **restrict ctx) |
|||
{ |
|||
keccak(input, size, ctx[0]->state, 200); |
|||
keccak(input + size, size, ctx[1]->state, 200); |
|||
|
|||
const uint8_t* l0 = ctx[0]->memory; |
|||
const uint8_t* l1 = ctx[1]->memory; |
|||
uint64_t* h0 = (uint64_t*) ctx[0]->state; |
|||
uint64_t* h1 = (uint64_t*) ctx[1]->state; |
|||
|
|||
VARIANT2_INIT(0); |
|||
VARIANT2_INIT(1); |
|||
VARIANT2_SET_ROUNDING_MODE(); |
|||
|
|||
cn_explode_scratchpad((__m128i*) h0, (__m128i*) l0); |
|||
cn_explode_scratchpad((__m128i*) h1, (__m128i*) l1); |
|||
|
|||
uint64_t al0 = h0[0] ^ h0[4]; |
|||
uint64_t al1 = h1[0] ^ h1[4]; |
|||
uint64_t ah0 = h0[1] ^ h0[5]; |
|||
uint64_t ah1 = h1[1] ^ h1[5]; |
|||
|
|||
__m128i bx00 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]); |
|||
__m128i bx01 = _mm_set_epi64x(h0[9] ^ h0[11], h0[8] ^ h0[10]); |
|||
__m128i bx10 = _mm_set_epi64x(h1[3] ^ h1[7], h1[2] ^ h1[6]); |
|||
__m128i bx11 = _mm_set_epi64x(h1[9] ^ h1[11], h1[8] ^ h1[10]); |
|||
|
|||
uint64_t idx0 = al0; |
|||
uint64_t idx1 = al1; |
|||
|
|||
for (size_t i = 0; __builtin_expect(i < 0x80000, 1); i++) { |
|||
__m128i cx0 = _mm_load_si128((__m128i *) &l0[idx0 & 0x1FFFF0]); |
|||
__m128i cx1 = _mm_load_si128((__m128i *) &l1[idx1 & 0x1FFFF0]); |
|||
|
|||
const __m128i ax0 = _mm_set_epi64x(ah0, al0); |
|||
const __m128i ax1 = _mm_set_epi64x(ah1, al1); |
|||
|
|||
cx0 = soft_aesenc(cx0, ax0); |
|||
cx1 = soft_aesenc(cx1, ax1); |
|||
|
|||
VARIANT2_SHUFFLE(l0, idx0 & 0x1FFFF0, ax0, bx00, bx01); |
|||
_mm_store_si128((__m128i *) &l0[idx0 & 0x1FFFF0], _mm_xor_si128(bx00, cx0)); |
|||
|
|||
VARIANT2_SHUFFLE(l1, idx1 & 0x1FFFF0, ax1, bx10, bx11); |
|||
_mm_store_si128((__m128i *) &l1[idx1 & 0x1FFFF0], _mm_xor_si128(bx10, cx1)); |
|||
|
|||
idx0 = _mm_cvtsi128_si64(cx0); |
|||
idx1 = _mm_cvtsi128_si64(cx1); |
|||
|
|||
uint64_t hi, lo, cl, ch; |
|||
cl = ((uint64_t*) &l0[idx0 & 0x1FFFF0])[0]; |
|||
ch = ((uint64_t*) &l0[idx0 & 0x1FFFF0])[1]; |
|||
|
|||
VARIANT2_INTEGER_MATH(0, cl, cx0); |
|||
lo = _umul128(idx0, cl, &hi); |
|||
VARIANT2_SHUFFLE2(l0, idx0 & 0x1FFFF0, ax0, bx00, bx01, hi, lo); |
|||
|
|||
al0 += hi; |
|||
ah0 += lo; |
|||
|
|||
((uint64_t*)&l0[idx0 & 0x1FFFF0])[0] = al0; |
|||
((uint64_t*)&l0[idx0 & 0x1FFFF0])[1] = ah0; |
|||
|
|||
al0 ^= cl; |
|||
ah0 ^= ch; |
|||
idx0 = al0; |
|||
|
|||
cl = ((uint64_t*) &l1[idx1 & 0x1FFFF0])[0]; |
|||
ch = ((uint64_t*) &l1[idx1 & 0x1FFFF0])[1]; |
|||
|
|||
VARIANT2_INTEGER_MATH(1, cl, cx1); |
|||
lo = _umul128(idx1, cl, &hi); |
|||
VARIANT2_SHUFFLE2(l1, idx1 & 0x1FFFF0, ax1, bx10, bx11, hi, lo); |
|||
|
|||
al1 += hi; |
|||
ah1 += lo; |
|||
|
|||
((uint64_t*)&l1[idx1 & 0x1FFFF0])[0] = al1; |
|||
((uint64_t*)&l1[idx1 & 0x1FFFF0])[1] = ah1; |
|||
|
|||
al1 ^= cl; |
|||
ah1 ^= ch; |
|||
idx1 = al1; |
|||
|
|||
bx01 = bx00; |
|||
bx11 = bx10; |
|||
|
|||
bx00 = cx0; |
|||
bx10 = cx1; |
|||
} |
|||
|
|||
cn_implode_scratchpad((__m128i*) l0, (__m128i*) h0); |
|||
cn_implode_scratchpad((__m128i*) l1, (__m128i*) h1); |
|||
|
|||
keccakf(h0, 24); |
|||
keccakf(h1, 24); |
|||
|
|||
extra_hashes[ctx[0]->state[0] & 3](ctx[0]->state, 200, output); |
|||
extra_hashes[ctx[1]->state[0] & 3](ctx[1]->state, 200, output + 32); |
|||
} |
@ -1,111 +0,0 @@ |
|||
/* XMRig
|
|||
* Copyright 2010 Jeff Garzik <jgarzik@pobox.com> |
|||
* Copyright 2012-2014 pooler <pooler@litecoinpool.org> |
|||
* Copyright 2014 Lucas Jones <https://github.com/lucasjones>
|
|||
* Copyright 2014-2016 Wolf9466 <https://github.com/OhGodAPet>
|
|||
* Copyright 2016 Jay D Dee <jayddee246@gmail.com> |
|||
* Copyright 2017 fireice-uk <https://github.com/fireice-uk>
|
|||
* Copyright 2016-2017 XMRig <support@xmrig.com> |
|||
* |
|||
* |
|||
* This program is free software: you can redistribute it and/or modify |
|||
* it under the terms of the GNU General Public License as published by |
|||
* the Free Software Foundation, either version 3 of the License, or |
|||
* (at your option) any later version. |
|||
* |
|||
* This program is distributed in the hope that it will be useful, |
|||
* but WITHOUT ANY WARRANTY; without even the implied warranty of |
|||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
|||
* GNU General Public License for more details. |
|||
* |
|||
* You should have received a copy of the GNU General Public License |
|||
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|||
*/ |
|||
|
|||
#include <x86intrin.h> |
|||
#include <string.h> |
|||
|
|||
#include "cryptonight.h" |
|||
#include "cryptonight_softaes.h" |
|||
#include "crypto/c_keccak.h" |
|||
|
|||
|
|||
void cryptonight_av4_softaes_double(const void *restrict input, size_t size, void *restrict output, struct cryptonight_ctx *restrict ctx) |
|||
{ |
|||
keccak((const uint8_t *) input, size, ctx->state0, 200); |
|||
keccak((const uint8_t *) input + size, size, ctx->state1, 200); |
|||
|
|||
const uint8_t* l0 = ctx->memory; |
|||
const uint8_t* l1 = ctx->memory + MEMORY; |
|||
uint64_t* h0 = (uint64_t*) ctx->state0; |
|||
uint64_t* h1 = (uint64_t*) ctx->state1; |
|||
|
|||
cn_explode_scratchpad((__m128i*) h0, (__m128i*) l0); |
|||
cn_explode_scratchpad((__m128i*) h1, (__m128i*) l1); |
|||
|
|||
uint64_t al0 = h0[0] ^ h0[4]; |
|||
uint64_t al1 = h1[0] ^ h1[4]; |
|||
uint64_t ah0 = h0[1] ^ h0[5]; |
|||
uint64_t ah1 = h1[1] ^ h1[5]; |
|||
|
|||
__m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]); |
|||
__m128i bx1 = _mm_set_epi64x(h1[3] ^ h1[7], h1[2] ^ h1[6]); |
|||
|
|||
uint64_t idx0 = h0[0] ^ h0[4]; |
|||
uint64_t idx1 = h1[0] ^ h1[4]; |
|||
|
|||
for (size_t i = 0; __builtin_expect(i < 0x80000, 1); i++) { |
|||
__m128i cx0 = _mm_load_si128((__m128i *) &l0[idx0 & 0x1FFFF0]); |
|||
__m128i cx1 = _mm_load_si128((__m128i *) &l1[idx1 & 0x1FFFF0]); |
|||
|
|||
cx0 = soft_aesenc(cx0, _mm_set_epi64x(ah0, al0)); |
|||
cx1 = soft_aesenc(cx1, _mm_set_epi64x(ah1, al1)); |
|||
|
|||
_mm_store_si128((__m128i *) &l0[idx0 & 0x1FFFF0], _mm_xor_si128(bx0, cx0)); |
|||
_mm_store_si128((__m128i *) &l1[idx1 & 0x1FFFF0], _mm_xor_si128(bx1, cx1)); |
|||
|
|||
idx0 = EXTRACT64(cx0); |
|||
idx1 = EXTRACT64(cx1); |
|||
|
|||
bx0 = cx0; |
|||
bx1 = cx1; |
|||
|
|||
uint64_t hi, lo, cl, ch; |
|||
cl = ((uint64_t*) &l0[idx0 & 0x1FFFF0])[0]; |
|||
ch = ((uint64_t*) &l0[idx0 & 0x1FFFF0])[1]; |
|||
lo = _umul128(idx0, cl, &hi); |
|||
|
|||
al0 += hi; |
|||
ah0 += lo; |
|||
|
|||
((uint64_t*) &l0[idx0 & 0x1FFFF0])[0] = al0; |
|||
((uint64_t*) &l0[idx0 & 0x1FFFF0])[1] = ah0; |
|||
|
|||
ah0 ^= ch; |
|||
al0 ^= cl; |
|||
idx0 = al0; |
|||
|
|||
cl = ((uint64_t*) &l1[idx1 & 0x1FFFF0])[0]; |
|||
ch = ((uint64_t*) &l1[idx1 & 0x1FFFF0])[1]; |
|||
lo = _umul128(idx1, cl, &hi); |
|||
|
|||
al1 += hi; |
|||
ah1 += lo; |
|||
|
|||
((uint64_t*) &l1[idx1 & 0x1FFFF0])[0] = al1; |
|||
((uint64_t*) &l1[idx1 & 0x1FFFF0])[1] = ah1; |
|||
|
|||
ah1 ^= ch; |
|||
al1 ^= cl; |
|||
idx1 = al1; |
|||
} |
|||
|
|||
cn_implode_scratchpad((__m128i*) l0, (__m128i*) h0); |
|||
cn_implode_scratchpad((__m128i*) l1, (__m128i*) h1); |
|||
|
|||
keccakf(h0, 24); |
|||
keccakf(h1, 24); |
|||
|
|||
extra_hashes[ctx->state0[0] & 3](ctx->state0, 200, output); |
|||
extra_hashes[ctx->state1[0] & 3](ctx->state1, 200, (char*) output + 32); |
|||
} |
@ -0,0 +1,150 @@ |
|||
/* XMRig
|
|||
* Copyright 2010 Jeff Garzik <jgarzik@pobox.com> |
|||
* Copyright 2012-2014 pooler <pooler@litecoinpool.org> |
|||
* Copyright 2014 Lucas Jones <https://github.com/lucasjones>
|
|||
* Copyright 2014-2016 Wolf9466 <https://github.com/OhGodAPet>
|
|||
* Copyright 2016 Jay D Dee <jayddee246@gmail.com> |
|||
* Copyright 2017-2018 XMR-Stak <https://github.com/fireice-uk>, <https://github.com/psychocrypt>
|
|||
* Copyright 2018 Lee Clagett <https://github.com/vtnerd>
|
|||
* Copyright 2018-2019 SChernykh <https://github.com/SChernykh>
|
|||
* Copyright 2016-2019 XMRig <https://github.com/xmrig>, <support@xmrig.com>
|
|||
* |
|||
* This program is free software: you can redistribute it and/or modify |
|||
* it under the terms of the GNU General Public License as published by |
|||
* the Free Software Foundation, either version 3 of the License, or |
|||
* (at your option) any later version. |
|||
* |
|||
* This program is distributed in the hope that it will be useful, |
|||
* but WITHOUT ANY WARRANTY; without even the implied warranty of |
|||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
|||
* GNU General Public License for more details. |
|||
* |
|||
* You should have received a copy of the GNU General Public License |
|||
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|||
*/ |
|||
|
|||
#ifndef XMRIG_CRYPTONIGHT_MONERO_H |
|||
#define XMRIG_CRYPTONIGHT_MONERO_H |
|||
|
|||
|
|||
#include <fenv.h> |
|||
#include <math.h> |
|||
#include <stdint.h> |
|||
#include <x86intrin.h> |
|||
|
|||
|
|||
/*
 * Integer square root helper used by VARIANT2_INTEGER_MATH.
 *
 * The bit pattern (1023 << 52) + (n0 >> 12) is interpreted as a double, i.e.
 * the value 1 + (n0 >> 12) * 2^-52 in [1, 2). Its hardware square root is
 * taken, the raw bits of the result are rescaled with shifts, and a final
 * compare against n0 applies a +1 correction when needed.
 *
 * Returns the 64-bit result in the low lane of an __m128i. Callers in this
 * file only consume its low 32 bits; the upper bits still carry leftovers of
 * the floating-point exponent.
 *
 * NOTE(review): the hashing code switches to round-down before using this
 * (VARIANT2_SET_ROUNDING_MODE); results for some inputs presumably depend on
 * that mode - confirm before calling it under the default mode.
 */
static inline __m128i int_sqrt_v2(const uint64_t n0)
{
    const __m128i exponent_one = _mm_set_epi64x(0, 1023ULL << 52);
    const __m128i mantissa     = _mm_cvtsi64_si128(n0 >> 12);
    const __m128d operand      = _mm_castsi128_pd(_mm_add_epi64(mantissa, exponent_one));
    const __m128d root         = _mm_sqrt_sd(_mm_setzero_pd(), operand);

    uint64_t r = (uint64_t)(_mm_cvtsi128_si64(_mm_castpd_si128(root)));

    const uint64_t s = r >> 20;
    r >>= 19;

    uint64_t x2 = (s - (1022ULL << 32)) * (r - s - (1022ULL << 32) + 1);

    /* Both branches add 1 to r exactly when x2 < n0; the first does it without a branch. */
#   if (defined(_MSC_VER) || __GNUC__ > 7 || (__GNUC__ == 7 && __GNUC_MINOR__ > 1)) && (defined(__x86_64__) || defined(_M_AMD64))
    _addcarry_u64(_subborrow_u64(0, x2, n0, (unsigned long long int*)&x2), r, 0, (unsigned long long int*)&r);
#   else
    if (x2 < n0) {
        ++r;
    }
#   endif

    return _mm_cvtsi64_si128(r);
}
|||
|
|||
|
|||
/*
 * Variant 1 setup. Declares tweak1_2_<part>: the eight input bytes at offset
 * 35 of message <part> XORed with the 64-bit word at index 24 of
 * ctx[part]->state. Expects `input`, `size` and `ctx` in the expanding scope.
 */
#define VARIANT1_INIT(part) \
    uint64_t tweak1_2_##part = (*(const uint64_t*)(input + 35 + part * size) ^ \
                                ((const uint64_t*)(ctx[part]->state))[24]);

/*
 * Variant 2 setup. Declares division_result_xmm_<part> and
 * sqrt_result_xmm_<part>, seeded from h<part>[12] and h<part>[13].
 */
#define VARIANT2_INIT(part) \
    __m128i division_result_xmm_##part = _mm_cvtsi64_si128(h##part[12]); \
    __m128i sqrt_result_xmm_##part     = _mm_cvtsi64_si128(h##part[13]);

/* Switches the floating-point rounding mode to round-down; the mode is not restored. */
#ifdef _MSC_VER
#   define VARIANT2_SET_ROUNDING_MODE() { _control87(RC_DOWN, MCW_RC); }
#else
#   define VARIANT2_SET_ROUNDING_MODE() { fesetround(FE_DOWNWARD); }
#endif

/*
 * Variant 2 integer math. XORs the previous division/sqrt results into `cl`
 * (which must be a modifiable lvalue), then computes the next pair from `cx`:
 * the high 64 bits of cx are divided by a 32-bit divisor derived from the low
 * 64 bits, and int_sqrt_v2() is applied to low64(cx) + division_result.
 */
#define VARIANT2_INTEGER_MATH(part, cl, cx) \
    { \
        const uint64_t sqrt_result = (uint64_t)(_mm_cvtsi128_si64(sqrt_result_xmm_##part)); \
        const uint64_t cx_0 = _mm_cvtsi128_si64(cx); \
        const uint64_t cx_1 = _mm_cvtsi128_si64(_mm_srli_si128(cx, 8)); \
        cl ^= (uint64_t)(_mm_cvtsi128_si64(division_result_xmm_##part)) ^ (sqrt_result << 32); \
        /* The OR forces the divisor to be odd and to have its top bit set, so it is never zero. */ \
        const uint32_t d = (uint32_t)(cx_0 + (sqrt_result << 1)) | 0x80000001UL; \
        const uint64_t division_result = (uint32_t)(cx_1 / d) + ((cx_1 % d) << 32); \
        division_result_xmm_##part = _mm_cvtsi64_si128((int64_t)(division_result)); \
        sqrt_result_xmm_##part = int_sqrt_v2(cx_0 + division_result); \
    }

/*
 * Variant 2 shuffle. Rotates the three 16-byte chunks at offset ^ 0x10,
 * ^ 0x20 and ^ 0x30 while adding _b1, _b and _a to them (64-bit lane adds):
 *   [^0x10] <- old[^0x30] + _b1
 *   [^0x20] <- old[^0x10] + _b
 *   [^0x30] <- old[^0x20] + _a
 * All three chunks are loaded before any of them is stored.
 */
#define VARIANT2_SHUFFLE(base_ptr, offset, _a, _b, _b1) \
    { \
        const __m128i chunk1 = _mm_load_si128((__m128i *)((base_ptr) + ((offset) ^ 0x10))); \
        const __m128i chunk2 = _mm_load_si128((__m128i *)((base_ptr) + ((offset) ^ 0x20))); \
        const __m128i chunk3 = _mm_load_si128((__m128i *)((base_ptr) + ((offset) ^ 0x30))); \
        _mm_store_si128((__m128i *)((base_ptr) + ((offset) ^ 0x10)), _mm_add_epi64(chunk3, _b1)); \
        _mm_store_si128((__m128i *)((base_ptr) + ((offset) ^ 0x20)), _mm_add_epi64(chunk1, _b)); \
        _mm_store_si128((__m128i *)((base_ptr) + ((offset) ^ 0x30)), _mm_add_epi64(chunk2, _a)); \
    }

/*
 * Variant 4 shuffle. Same chunk rotation as VARIANT2_SHUFFLE; additionally
 * XORs the three original chunks into `_c` (which must be a modifiable lvalue).
 */
#define VARIANT4_SHUFFLE(base_ptr, offset, _a, _b, _b1, _c) \
    { \
        const __m128i chunk1 = _mm_load_si128((__m128i *)((base_ptr) + ((offset) ^ 0x10))); \
        const __m128i chunk2 = _mm_load_si128((__m128i *)((base_ptr) + ((offset) ^ 0x20))); \
        const __m128i chunk3 = _mm_load_si128((__m128i *)((base_ptr) + ((offset) ^ 0x30))); \
        _mm_store_si128((__m128i *)((base_ptr) + ((offset) ^ 0x10)), _mm_add_epi64(chunk3, _b1)); \
        _mm_store_si128((__m128i *)((base_ptr) + ((offset) ^ 0x20)), _mm_add_epi64(chunk1, _b)); \
        _mm_store_si128((__m128i *)((base_ptr) + ((offset) ^ 0x30)), _mm_add_epi64(chunk2, _a)); \
        _c = _mm_xor_si128(_mm_xor_si128(_c, chunk3), _mm_xor_si128(chunk1, chunk2)); \
    }

/*
 * Variant 2 shuffle for the multiply step. Before the rotation the chunk at
 * offset ^ 0x10 is XORed with the product (hi in the low lane, lo in the high
 * lane), and hi / lo (modifiable lvalues) are XORed with the two 64-bit words
 * of the original chunk at offset ^ 0x20.
 */
#define VARIANT2_SHUFFLE2(base_ptr, offset, _a, _b, _b1, hi, lo) \
    { \
        const __m128i chunk1 = _mm_xor_si128(_mm_load_si128((__m128i *)((base_ptr) + ((offset) ^ 0x10))), _mm_set_epi64x(lo, hi)); \
        const __m128i chunk2 = _mm_load_si128((__m128i *)((base_ptr) + ((offset) ^ 0x20))); \
        hi ^= (uint64_t)(_mm_cvtsi128_si64(chunk2)); \
        lo ^= (uint64_t)(_mm_cvtsi128_si64(_mm_srli_si128(chunk2, 8))); \
        const __m128i chunk3 = _mm_load_si128((__m128i *)((base_ptr) + ((offset) ^ 0x30))); \
        _mm_store_si128((__m128i *)((base_ptr) + ((offset) ^ 0x10)), _mm_add_epi64(chunk3, _b1)); \
        _mm_store_si128((__m128i *)((base_ptr) + ((offset) ^ 0x20)), _mm_add_epi64(chunk1, _b)); \
        _mm_store_si128((__m128i *)((base_ptr) + ((offset) ^ 0x30)), _mm_add_epi64(chunk2, _a)); \
    }
|||
|
|||
|
|||
/*
 * NOINLINE: function attribute that forbids inlining. Expands to the GNU or
 * MSVC spelling when one is available and to nothing otherwise. A definition
 * supplied earlier by the includer is left untouched.
 */
#ifndef NOINLINE
#   if defined(__GNUC__)
#       define NOINLINE __attribute__ ((noinline))
#   elif defined(_MSC_VER)   /* test with defined(): _MSC_VER may not exist at all */
#       define NOINLINE __declspec(noinline)
#   else
#       define NOINLINE
#   endif
#endif
|||
|
|||
#include "variant4_random_math.h" |
|||
|
|||
/*
 * VARIANT4_RANDOM_MATH_INIT(part)
 *
 * Declares, in the caller's scope, the per-hash random-math state:
 *   r<part>[9]      - 32-bit register file; entries 0..3 are seeded here
 *                     from the low/high halves of h<part>[12] and
 *                     h<part>[13], entries 4..8 are left for
 *                     VARIANT4_RANDOM_MATH to fill on each iteration.
 *   code<part>[256] - instruction buffer, populated by
 *                     v4_random_math_init() from ctx[part]->height.
 *
 * Expects `h<part>` (64-bit state words) and `ctx` to be visible at the
 * point of use.  Because it introduces declarations it cannot be wrapped
 * in a block; use it once per `part` at function scope.
 */
#define VARIANT4_RANDOM_MATH_INIT(part) \
    uint32_t r##part[9]; \
    struct V4_Instruction code##part[256]; \
    r##part[0] = (uint32_t)(h##part[12]); \
    r##part[1] = (uint32_t)(h##part[12] >> 32); \
    r##part[2] = (uint32_t)(h##part[13]); \
    r##part[3] = (uint32_t)(h##part[13] >> 32); \
    v4_random_math_init(code##part, ctx[part]->height);
|||
|
|||
/*
 * VARIANT4_RANDOM_MATH(part, al, ah, cl, bx0, bx1)
 *
 * One random-math step for hash `part`:
 *   1. cl ^= (r[0] + r[1]) | ((uint64_t)(r[2] + r[3]) << 32), where both
 *      sums are 32-bit (wrapping) additions.
 *   2. r[4..8] are reloaded from the low 32 bits of al, ah, bx0, bx1 and
 *      from the low 32 bits of the upper 64-bit lane of bx1.
 *   3. v4_random_math(code<part>, r<part>) is invoked on the register file.
 *
 * Requires r<part> and code<part> as declared by VARIANT4_RANDOM_MATH_INIT.
 * `cl` must be a modifiable 64-bit lvalue.
 *
 * Wrapped in do { } while (0) so it acts as one statement; callers
 * terminate it with a semicolon.
 */
#define VARIANT4_RANDOM_MATH(part, al, ah, cl, bx0, bx1) \
    do { \
        cl ^= (r##part[0] + r##part[1]) | ((uint64_t)(r##part[2] + r##part[3]) << 32); \
        r##part[4] = (uint32_t)(al); \
        r##part[5] = (uint32_t)(ah); \
        r##part[6] = (uint32_t)(_mm_cvtsi128_si32(bx0)); \
        r##part[7] = (uint32_t)(_mm_cvtsi128_si32(bx1)); \
        r##part[8] = (uint32_t)(_mm_cvtsi128_si32(_mm_srli_si128(bx1, 8))); \
        v4_random_math(code##part, r##part); \
    } while (0)
|||
|
|||
#endif /* XMRIG_CRYPTONIGHT_MONERO_H */ |
@ -0,0 +1,143 @@ |
|||
/* XMRig
|
|||
* Copyright 2010 Jeff Garzik <jgarzik@pobox.com> |
|||
* Copyright 2012-2014 pooler <pooler@litecoinpool.org> |
|||
* Copyright 2014 Lucas Jones <https://github.com/lucasjones>
|
|||
* Copyright 2014-2016 Wolf9466 <https://github.com/OhGodAPet>
|
|||
* Copyright 2016 Jay D Dee <jayddee246@gmail.com> |
|||
* Copyright 2017 fireice-uk <https://github.com/fireice-uk>
|
|||
* Copyright 2017-2018 XMR-Stak <https://github.com/fireice-uk>, <https://github.com/psychocrypt>
|
|||
* Copyright 2018 Lee Clagett <https://github.com/vtnerd>
|
|||
* Copyright 2018-2019 SChernykh <https://github.com/SChernykh>
|
|||
* Copyright 2016-2019 XMRig <https://github.com/xmrig>, <support@xmrig.com>
|
|||
* |
|||
* This program is free software: you can redistribute it and/or modify |
|||
* it under the terms of the GNU General Public License as published by |
|||
* the Free Software Foundation, either version 3 of the License, or |
|||
* (at your option) any later version. |
|||
* |
|||
* This program is distributed in the hope that it will be useful, |
|||
* but WITHOUT ANY WARRANTY; without even the implied warranty of |
|||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
|||
* GNU General Public License for more details. |
|||
* |
|||
* You should have received a copy of the GNU General Public License |
|||
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|||
*/ |
|||
|
|||
#include <x86intrin.h> |
|||
#include <string.h> |
|||
|
|||
#include "crypto/c_keccak.h" |
|||
#include "cryptonight.h" |
|||
#include "cryptonight_aesni.h" |
|||
#include "cryptonight_monero.h" |
|||
|
|||
|
|||
void cryptonight_r_av1(const uint8_t *restrict input, size_t size, uint8_t *restrict output, struct cryptonight_ctx **restrict ctx) |
|||
{ |
|||
keccak(input, size, ctx[0]->state, 200); |
|||
|
|||
cn_explode_scratchpad((__m128i*) ctx[0]->state, (__m128i*) ctx[0]->memory); |
|||
|
|||
const uint8_t* l0 = ctx[0]->memory; |
|||
uint64_t* h0 = (uint64_t*) ctx[0]->state; |
|||
|
|||
VARIANT2_INIT(0); |
|||
VARIANT2_SET_ROUNDING_MODE(); |
|||
VARIANT4_RANDOM_MATH_INIT(0); |
|||
|
|||
uint64_t al0 = h0[0] ^ h0[4]; |
|||
uint64_t ah0 = h0[1] ^ h0[5]; |
|||
__m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]); |
|||
__m128i bx1 = _mm_set_epi64x(h0[9] ^ h0[11], h0[8] ^ h0[10]); |
|||
|
|||
uint64_t idx0 = al0; |
|||
|
|||
for (size_t i = 0; __builtin_expect(i < 0x80000, 1); i++) { |
|||
__m128i cx = _mm_load_si128((__m128i *) &l0[idx0 & 0x1FFFF0]); |
|||
const __m128i ax0 = _mm_set_epi64x(ah0, al0); |
|||
|
|||
cx = _mm_aesenc_si128(cx, ax0); |
|||
|
|||
VARIANT4_SHUFFLE(l0, idx0 & 0x1FFFF0, ax0, bx0, bx1, cx); |
|||
_mm_store_si128((__m128i *) &l0[idx0 & 0x1FFFF0], _mm_xor_si128(bx0, cx)); |
|||
|
|||
idx0 = _mm_cvtsi128_si64(cx); |
|||
|
|||
uint64_t hi, lo, cl, ch; |
|||
cl = ((uint64_t*) &l0[idx0 & 0x1FFFF0])[0]; |
|||
ch = ((uint64_t*) &l0[idx0 & 0x1FFFF0])[1]; |
|||
|
|||
VARIANT4_RANDOM_MATH(0, al0, ah0, cl, bx0, bx1); |
|||
al0 ^= r0[2] | ((uint64_t)(r0[3]) << 32); |
|||
ah0 ^= r0[0] | ((uint64_t)(r0[1]) << 32); |
|||
|
|||
lo = _umul128(idx0, cl, &hi); |
|||
VARIANT4_SHUFFLE(l0, idx0 & 0x1FFFF0, ax0, bx0, bx1, cx); |
|||
|
|||
al0 += hi; |
|||
ah0 += lo; |
|||
|
|||
((uint64_t*)&l0[idx0 & 0x1FFFF0])[0] = al0; |
|||
((uint64_t*)&l0[idx0 & 0x1FFFF0])[1] = ah0; |
|||
|
|||
al0 ^= cl; |
|||
ah0 ^= ch; |
|||
idx0 = al0; |
|||
|
|||
bx1 = bx0; |
|||
bx0 = cx; |
|||
} |
|||
|
|||
cn_implode_scratchpad((__m128i*) ctx[0]->memory, (__m128i*) ctx[0]->state); |
|||
|
|||
keccakf(h0, 24); |
|||
extra_hashes[ctx[0]->state[0] & 3](ctx[0]->state, 200, output); |
|||
} |
|||
|
|||
|
|||
#ifndef XMRIG_NO_ASM |
|||
void v4_compile_code(const struct V4_Instruction* code, int code_size, void* machine_code, enum Assembly ASM); |
|||
|
|||
|
|||
void cryptonight_r_av1_asm_intel(const uint8_t *restrict input, size_t size, uint8_t *restrict output, struct cryptonight_ctx **restrict ctx) |
|||
{ |
|||
if (ctx[0]->generated_code_height != ctx[0]->height) { |
|||
struct V4_Instruction code[256]; |
|||
const int code_size = v4_random_math_init(code, ctx[0]->height); |
|||
|
|||
v4_compile_code(code, code_size, (void*)(ctx[0]->generated_code), ASM_INTEL); |
|||
ctx[0]->generated_code_height = ctx[0]->height; |
|||
} |
|||
|
|||
keccak(input, size, ctx[0]->state, 200); |
|||
cn_explode_scratchpad((__m128i*) ctx[0]->state, (__m128i*) ctx[0]->memory); |
|||
|
|||
ctx[0]->generated_code(ctx[0]); |
|||
|
|||
cn_implode_scratchpad((__m128i*) ctx[0]->memory, (__m128i*) ctx[0]->state); |
|||
keccakf((uint64_t*) ctx[0]->state, 24); |
|||
extra_hashes[ctx[0]->state[0] & 3](ctx[0]->state, 200, output); |
|||
} |
|||
|
|||
|
|||
void cryptonight_r_av1_asm_bulldozer(const uint8_t *restrict input, size_t size, uint8_t *restrict output, struct cryptonight_ctx **restrict ctx) |
|||
{ |
|||
if (ctx[0]->generated_code_height != ctx[0]->height) { |
|||
struct V4_Instruction code[256]; |
|||
const int code_size = v4_random_math_init(code, ctx[0]->height); |
|||
|
|||
v4_compile_code(code, code_size, (void*)(ctx[0]->generated_code), ASM_BULLDOZER); |
|||
ctx[0]->generated_code_height = ctx[0]->height; |
|||
} |
|||
|
|||
keccak(input, size, ctx[0]->state, 200); |
|||
cn_explode_scratchpad((__m128i*) ctx[0]->state, (__m128i*) ctx[0]->memory); |
|||
|
|||
ctx[0]->generated_code(ctx[0]); |
|||
|
|||
cn_implode_scratchpad((__m128i*) ctx[0]->memory, (__m128i*) ctx[0]->state); |
|||
keccakf((uint64_t*) ctx[0]->state, 24); |
|||
extra_hashes[ctx[0]->state[0] & 3](ctx[0]->state, 200, output); |
|||
} |
|||
#endif |
@ -0,0 +1,202 @@ |
|||
/* XMRig
|
|||
* Copyright 2010 Jeff Garzik <jgarzik@pobox.com> |
|||
* Copyright 2012-2014 pooler <pooler@litecoinpool.org> |
|||
* Copyright 2014 Lucas Jones <https://github.com/lucasjones>
|
|||
* Copyright 2014-2016 Wolf9466 <https://github.com/OhGodAPet>
|
|||
* Copyright 2016 Jay D Dee <jayddee246@gmail.com> |
|||
* Copyright 2017 fireice-uk <https://github.com/fireice-uk>
|
|||
* Copyright 2017-2018 XMR-Stak <https://github.com/fireice-uk>, <https://github.com/psychocrypt>
|
|||
* Copyright 2018 Lee Clagett <https://github.com/vtnerd>
|
|||
* Copyright 2018-2019 SChernykh <https://github.com/SChernykh>
|
|||
* Copyright 2016-2019 XMRig <https://github.com/xmrig>, <support@xmrig.com>
|
|||
* |
|||
* This program is free software: you can redistribute it and/or modify |
|||
* it under the terms of the GNU General Public License as published by |
|||
* the Free Software Foundation, either version 3 of the License, or |
|||
* (at your option) any later version. |
|||
* |
|||
* This program is distributed in the hope that it will be useful, |
|||
* but WITHOUT ANY WARRANTY; without even the implied warranty of |
|||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
|||
* GNU General Public License for more details. |
|||
* |
|||
* You should have received a copy of the GNU General Public License |
|||
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|||
*/ |
|||
|
|||
#include <x86intrin.h> |
|||
#include <string.h> |
|||
|
|||
#include "crypto/c_keccak.h" |
|||
#include "cryptonight.h" |
|||
#include "cryptonight_aesni.h" |
|||
#include "cryptonight_monero.h" |
|||
|
|||
|
|||
void cryptonight_r_av2(const uint8_t *restrict input, size_t size, uint8_t *restrict output, struct cryptonight_ctx **restrict ctx) |
|||
{ |
|||
keccak(input, size, ctx[0]->state, 200); |
|||
keccak(input + size, size, ctx[1]->state, 200); |
|||
|
|||
const uint8_t* l0 = ctx[0]->memory; |
|||
const uint8_t* l1 = ctx[1]->memory; |
|||
uint64_t* h0 = (uint64_t*) ctx[0]->state; |
|||
uint64_t* h1 = (uint64_t*) ctx[1]->state; |
|||
|
|||
VARIANT2_INIT(0); |
|||
VARIANT2_INIT(1); |
|||
VARIANT2_SET_ROUNDING_MODE(); |
|||
VARIANT4_RANDOM_MATH_INIT(0); |
|||
VARIANT4_RANDOM_MATH_INIT(1); |
|||
|
|||
cn_explode_scratchpad((__m128i*) h0, (__m128i*) l0); |
|||
cn_explode_scratchpad((__m128i*) h1, (__m128i*) l1); |
|||
|
|||
uint64_t al0 = h0[0] ^ h0[4]; |
|||
uint64_t al1 = h1[0] ^ h1[4]; |
|||
uint64_t ah0 = h0[1] ^ h0[5]; |
|||
uint64_t ah1 = h1[1] ^ h1[5]; |
|||
|
|||
__m128i bx00 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]); |
|||
__m128i bx01 = _mm_set_epi64x(h0[9] ^ h0[11], h0[8] ^ h0[10]); |
|||
__m128i bx10 = _mm_set_epi64x(h1[3] ^ h1[7], h1[2] ^ h1[6]); |
|||
__m128i bx11 = _mm_set_epi64x(h1[9] ^ h1[11], h1[8] ^ h1[10]); |
|||
|
|||
uint64_t idx0 = al0; |
|||
uint64_t idx1 = al1; |
|||
|
|||
for (size_t i = 0; __builtin_expect(i < 0x80000, 1); i++) { |
|||
__m128i cx0 = _mm_load_si128((__m128i *) &l0[idx0 & 0x1FFFF0]); |
|||
__m128i cx1 = _mm_load_si128((__m128i *) &l1[idx1 & 0x1FFFF0]); |
|||
|
|||
const __m128i ax0 = _mm_set_epi64x(ah0, al0); |
|||
const __m128i ax1 = _mm_set_epi64x(ah1, al1); |
|||
|
|||
cx0 = _mm_aesenc_si128(cx0, ax0); |
|||
cx1 = _mm_aesenc_si128(cx1, ax1); |
|||
|
|||
VARIANT4_SHUFFLE(l0, idx0 & 0x1FFFF0, ax0, bx00, bx01, cx0); |
|||
_mm_store_si128((__m128i *) &l0[idx0 & 0x1FFFF0], _mm_xor_si128(bx00, cx0)); |
|||
|
|||
VARIANT4_SHUFFLE(l1, idx1 & 0x1FFFF0, ax1, bx10, bx11, cx1); |
|||
_mm_store_si128((__m128i *) &l1[idx1 & 0x1FFFF0], _mm_xor_si128(bx10, cx1)); |
|||
|
|||
idx0 = _mm_cvtsi128_si64(cx0); |
|||
idx1 = _mm_cvtsi128_si64(cx1); |
|||
|
|||
uint64_t hi, lo, cl, ch; |
|||
cl = ((uint64_t*) &l0[idx0 & 0x1FFFF0])[0]; |
|||
ch = ((uint64_t*) &l0[idx0 & 0x1FFFF0])[1]; |
|||
|
|||
VARIANT4_RANDOM_MATH(0, al0, ah0, cl, bx00, bx01); |
|||
al0 ^= r0[2] | ((uint64_t)(r0[3]) << 32); |
|||
ah0 ^= r0[0] | ((uint64_t)(r0[1]) << 32); |
|||
|
|||
lo = _umul128(idx0, cl, &hi); |
|||
VARIANT4_SHUFFLE(l0, idx0 & 0x1FFFF0, ax0, bx00, bx01, cx0); |
|||
|
|||
al0 += hi; |
|||
ah0 += lo; |
|||
|
|||
((uint64_t*)&l0[idx0 & 0x1FFFF0])[0] = al0; |
|||
((uint64_t*)&l0[idx0 & 0x1FFFF0])[1] = ah0; |
|||
|
|||
al0 ^= cl; |
|||
ah0 ^= ch; |
|||
idx0 = al0; |
|||
|
|||
cl = ((uint64_t*) &l1[idx1 & 0x1FFFF0])[0]; |
|||
ch = ((uint64_t*) &l1[idx1 & 0x1FFFF0])[1]; |
|||
|
|||
VARIANT4_RANDOM_MATH(1, al1, ah1, cl, bx10, bx11); |
|||
al1 ^= r1[2] | ((uint64_t)(r1[3]) << 32); |
|||
ah1 ^= r1[0] | ((uint64_t)(r1[1]) << 32); |
|||
|
|||
lo = _umul128(idx1, cl, &hi); |
|||
VARIANT4_SHUFFLE(l1, idx1 & 0x1FFFF0, ax1, bx10, bx11, cx1); |
|||
|
|||
al1 += hi; |
|||
ah1 += lo; |
|||
|
|||
((uint64_t*)&l1[idx1 & 0x1FFFF0])[0] = al1; |
|||
((uint64_t*)&l1[idx1 & 0x1FFFF0])[1] = ah1; |
|||
|
|||
al1 ^= cl; |
|||
ah1 ^= ch; |
|||
idx1 = al1; |
|||
|
|||
bx01 = bx00; |
|||
bx11 = bx10; |
|||
|
|||
bx00 = cx0; |
|||
bx10 = cx1; |
|||
} |
|||
|
|||
cn_implode_scratchpad((__m128i*) l0, (__m128i*) h0); |
|||
cn_implode_scratchpad((__m128i*) l1, (__m128i*) h1); |
|||
|
|||
keccakf(h0, 24); |
|||
keccakf(h1, 24); |
|||
|
|||
extra_hashes[ctx[0]->state[0] & 3](ctx[0]->state, 200, output); |
|||
extra_hashes[ctx[1]->state[0] & 3](ctx[1]->state, 200, output + 32); |
|||
} |
|||
|
|||
|
|||
#ifndef XMRIG_NO_ASM |
|||
void v4_compile_code_double(const struct V4_Instruction* code, int code_size, void* machine_code, enum Assembly ASM); |
|||
|
|||
|
|||
void cryptonight_r_av2_asm_intel(const uint8_t *restrict input, size_t size, uint8_t *restrict output, struct cryptonight_ctx **restrict ctx) |
|||
{ |
|||
if (ctx[0]->generated_code_height != ctx[0]->height) { |
|||
struct V4_Instruction code[256]; |
|||
const int code_size = v4_random_math_init(code, ctx[0]->height); |
|||
v4_compile_code_double(code, code_size, (void*)(ctx[0]->generated_code_double), ASM_INTEL); |
|||
ctx[0]->generated_code_height = ctx[0]->height; |
|||
} |
|||
|
|||
keccak(input, size, ctx[0]->state, 200); |
|||
keccak(input + size, size, ctx[1]->state, 200); |
|||
cn_explode_scratchpad((__m128i*) ctx[0]->state, (__m128i*) ctx[0]->memory); |
|||
cn_explode_scratchpad((__m128i*) ctx[1]->state, (__m128i*) ctx[1]->memory); |
|||
|
|||
ctx[0]->generated_code_double(ctx[0], ctx[1]); |
|||
|
|||
cn_implode_scratchpad((__m128i*) ctx[0]->memory, (__m128i*) ctx[0]->state); |
|||
cn_implode_scratchpad((__m128i*) ctx[1]->memory, (__m128i*) ctx[1]->state); |
|||
|
|||
keccakf((uint64_t *) ctx[0]->state, 24); |
|||
keccakf((uint64_t *) ctx[1]->state, 24); |
|||
|
|||
extra_hashes[ctx[0]->state[0] & 3](ctx[0]->state, 200, output); |
|||
extra_hashes[ctx[1]->state[0] & 3](ctx[1]->state, 200, output + 32); |
|||
} |
|||
|
|||
|
|||
void cryptonight_r_av2_asm_bulldozer(const uint8_t *restrict input, size_t size, uint8_t *restrict output, struct cryptonight_ctx **restrict ctx) |
|||
{ |
|||
if (ctx[0]->generated_code_height != ctx[0]->height) { |
|||
struct V4_Instruction code[256]; |
|||
const int code_size = v4_random_math_init(code, ctx[0]->height); |
|||
v4_compile_code_double(code, code_size, (void*)(ctx[0]->generated_code_double), ASM_BULLDOZER); |
|||
ctx[0]->generated_code_height = ctx[0]->height; |
|||
} |
|||
|
|||
keccak(input, size, ctx[0]->state, 200); |
|||
keccak(input + size, size, ctx[1]->state, 200); |
|||
cn_explode_scratchpad((__m128i*) ctx[0]->state, (__m128i*) ctx[0]->memory); |
|||
cn_explode_scratchpad((__m128i*) ctx[1]->state, (__m128i*) ctx[1]->memory); |
|||
|
|||
ctx[0]->generated_code_double(ctx[0], ctx[1]); |
|||
|
|||
cn_implode_scratchpad((__m128i*) ctx[0]->memory, (__m128i*) ctx[0]->state); |
|||
cn_implode_scratchpad((__m128i*) ctx[1]->memory, (__m128i*) ctx[1]->state); |
|||
|
|||
keccakf((uint64_t *) ctx[0]->state, 24); |
|||
keccakf((uint64_t *) ctx[1]->state, 24); |
|||
|
|||
extra_hashes[ctx[0]->state[0] & 3](ctx[0]->state, 200, output); |
|||
extra_hashes[ctx[1]->state[0] & 3](ctx[1]->state, 200, output + 32); |
|||
} |
|||
#endif |
@ -0,0 +1,112 @@ |
|||
/* XMRig
|
|||
* Copyright 2010 Jeff Garzik <jgarzik@pobox.com> |
|||
* Copyright 2012-2014 pooler <pooler@litecoinpool.org> |
|||
* Copyright 2014 Lucas Jones <https://github.com/lucasjones>
|
|||
* Copyright 2014-2016 Wolf9466 <https://github.com/OhGodAPet>
|
|||
* Copyright 2016 Jay D Dee <jayddee246@gmail.com> |
|||
* Copyright 2017 fireice-uk <https://github.com/fireice-uk>
|
|||
* Copyright 2017-2018 XMR-Stak <https://github.com/fireice-uk>, <https://github.com/psychocrypt>
|
|||
* Copyright 2018 Lee Clagett <https://github.com/vtnerd>
|
|||
* Copyright 2018-2019 SChernykh <https://github.com/SChernykh>
|
|||
* Copyright 2016-2019 XMRig <https://github.com/xmrig>, <support@xmrig.com>
|
|||
* |
|||
* This program is free software: you can redistribute it and/or modify |
|||
* it under the terms of the GNU General Public License as published by |
|||
* the Free Software Foundation, either version 3 of the License, or |
|||
* (at your option) any later version. |
|||
* |
|||
* This program is distributed in the hope that it will be useful, |
|||
* but WITHOUT ANY WARRANTY; without even the implied warranty of |
|||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
|||
* GNU General Public License for more details. |
|||
* |
|||
* You should have received a copy of the GNU General Public License |
|||
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|||
*/ |
|||
|
|||
#include <x86intrin.h> |
|||
#include <string.h> |
|||
|
|||
#include "crypto/c_keccak.h" |
|||
#include "cryptonight.h" |
|||
#include "cryptonight_monero.h" |
|||
#include "cryptonight_softaes.h" |
|||
|
|||
|
|||
#ifndef XMRIG_NO_ASM |
|||
void v4_soft_aes_compile_code(const struct V4_Instruction* code, int code_size, void* machine_code, enum Assembly ASM); |
|||
#endif |
|||
|
|||
|
|||
void cryptonight_r_av3(const uint8_t *restrict input, size_t size, uint8_t *restrict output, struct cryptonight_ctx **restrict ctx) |
|||
{ |
|||
keccak(input, size, ctx[0]->state, 200); |
|||
cn_explode_scratchpad((__m128i*) ctx[0]->state, (__m128i*) ctx[0]->memory); |
|||
|
|||
# ifndef XMRIG_NO_ASM |
|||
if (ctx[0]->generated_code_height != ctx[0]->height) { |
|||
struct V4_Instruction code[256]; |
|||
const int code_size = v4_random_math_init(code, ctx[0]->height); |
|||
|
|||
v4_soft_aes_compile_code(code, code_size, (void*)(ctx[0]->generated_code), ASM_NONE); |
|||
ctx[0]->generated_code_height = ctx[0]->height; |
|||
} |
|||
|
|||
ctx[0]->saes_table = (const uint32_t*)saes_table; |
|||
ctx[0]->generated_code(ctx[0]); |
|||
# else |
|||
const uint8_t* l0 = ctx[0]->memory; |
|||
uint64_t* h0 = (uint64_t*) ctx[0]->state; |
|||
|
|||
VARIANT2_INIT(0); |
|||
VARIANT2_SET_ROUNDING_MODE(); |
|||
VARIANT4_RANDOM_MATH_INIT(0); |
|||
|
|||
uint64_t al0 = h0[0] ^ h0[4]; |
|||
uint64_t ah0 = h0[1] ^ h0[5]; |
|||
__m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]); |
|||
__m128i bx1 = _mm_set_epi64x(h0[9] ^ h0[11], h0[8] ^ h0[10]); |
|||
|
|||
uint64_t idx0 = al0; |
|||
|
|||
for (size_t i = 0; __builtin_expect(i < 0x80000, 1); i++) { |
|||
__m128i cx = _mm_load_si128((__m128i *) &l0[idx0 & 0x1FFFF0]); |
|||
const __m128i ax0 = _mm_set_epi64x(ah0, al0); |
|||
|
|||
cx = soft_aesenc(cx, ax0); |
|||
|
|||
VARIANT4_SHUFFLE(l0, idx0 & 0x1FFFF0, ax0, bx0, bx1, cx); |
|||
_mm_store_si128((__m128i *) &l0[idx0 & 0x1FFFF0], _mm_xor_si128(bx0, cx)); |
|||
|
|||
idx0 = _mm_cvtsi128_si64(cx); |
|||
|
|||
uint64_t hi, lo, cl, ch; |
|||
cl = ((uint64_t*) &l0[idx0 & 0x1FFFF0])[0]; |
|||
ch = ((uint64_t*) &l0[idx0 & 0x1FFFF0])[1]; |
|||
|
|||
VARIANT4_RANDOM_MATH(0, al0, ah0, cl, bx0, bx1); |
|||
al0 ^= r0[2] | ((uint64_t)(r0[3]) << 32); |
|||
ah0 ^= r0[0] | ((uint64_t)(r0[1]) << 32); |
|||
|
|||
lo = _umul128(idx0, cl, &hi); |
|||
VARIANT4_SHUFFLE(l0, idx0 & 0x1FFFF0, ax0, bx0, bx1, cx); |
|||
|
|||
al0 += hi; |
|||
ah0 += lo; |
|||
|
|||
((uint64_t*)&l0[idx0 & 0x1FFFF0])[0] = al0; |
|||
((uint64_t*)&l0[idx0 & 0x1FFFF0])[1] = ah0; |
|||
|
|||
al0 ^= cl; |
|||
ah0 ^= ch; |
|||
idx0 = al0; |
|||
|
|||
bx1 = bx0; |
|||
bx0 = cx; |
|||
} |
|||
# endif |
|||
|
|||
cn_implode_scratchpad((__m128i*) ctx[0]->memory, (__m128i*) ctx[0]->state); |
|||
keccakf((uint64_t *) ctx[0]->state, 24); |
|||
extra_hashes[ctx[0]->state[0] & 3](ctx[0]->state, 200, output); |
|||
} |
@ -0,0 +1,143 @@ |
|||
/* XMRig
|
|||
* Copyright 2010 Jeff Garzik <jgarzik@pobox.com> |
|||
* Copyright 2012-2014 pooler <pooler@litecoinpool.org> |
|||
* Copyright 2014 Lucas Jones <https://github.com/lucasjones>
|
|||
* Copyright 2014-2016 Wolf9466 <https://github.com/OhGodAPet>
|
|||
* Copyright 2016 Jay D Dee <jayddee246@gmail.com> |
|||
* Copyright 2017 fireice-uk <https://github.com/fireice-uk>
|
|||
* Copyright 2017-2018 XMR-Stak <https://github.com/fireice-uk>, <https://github.com/psychocrypt>
|
|||
* Copyright 2018 Lee Clagett <https://github.com/vtnerd>
|
|||
* Copyright 2018-2019 SChernykh <https://github.com/SChernykh>
|
|||
* Copyright 2016-2019 XMRig <https://github.com/xmrig>, <support@xmrig.com>
|
|||
* |
|||
* This program is free software: you can redistribute it and/or modify |
|||
* it under the terms of the GNU General Public License as published by |
|||
* the Free Software Foundation, either version 3 of the License, or |
|||
* (at your option) any later version. |
|||
* |
|||
* This program is distributed in the hope that it will be useful, |
|||
* but WITHOUT ANY WARRANTY; without even the implied warranty of |
|||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
|||
* GNU General Public License for more details. |
|||
* |
|||
* You should have received a copy of the GNU General Public License |
|||
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|||
*/ |
|||
|
|||
#include <x86intrin.h> |
|||
#include <string.h> |
|||
|
|||
#include "crypto/c_keccak.h" |
|||
#include "cryptonight.h" |
|||
#include "cryptonight_monero.h" |
|||
#include "cryptonight_softaes.h" |
|||
|
|||
|
|||
void cryptonight_r_av4(const uint8_t *restrict input, size_t size, uint8_t *restrict output, struct cryptonight_ctx **restrict ctx) |
|||
{ |
|||
keccak(input, size, ctx[0]->state, 200); |
|||
keccak(input + size, size, ctx[1]->state, 200); |
|||
|
|||
const uint8_t* l0 = ctx[0]->memory; |
|||
const uint8_t* l1 = ctx[1]->memory; |
|||
uint64_t* h0 = (uint64_t*) ctx[0]->state; |
|||
uint64_t* h1 = (uint64_t*) ctx[1]->state; |
|||
|
|||
VARIANT2_INIT(0); |
|||
VARIANT2_INIT(1); |
|||
VARIANT2_SET_ROUNDING_MODE(); |
|||
VARIANT4_RANDOM_MATH_INIT(0); |
|||
VARIANT4_RANDOM_MATH_INIT(1); |
|||
|
|||
cn_explode_scratchpad((__m128i*) h0, (__m128i*) l0); |
|||
cn_explode_scratchpad((__m128i*) h1, (__m128i*) l1); |
|||
|
|||
uint64_t al0 = h0[0] ^ h0[4]; |
|||
uint64_t al1 = h1[0] ^ h1[4]; |
|||
uint64_t ah0 = h0[1] ^ h0[5]; |
|||
uint64_t ah1 = h1[1] ^ h1[5]; |
|||
|
|||
__m128i bx00 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]); |
|||
__m128i bx01 = _mm_set_epi64x(h0[9] ^ h0[11], h0[8] ^ h0[10]); |
|||
__m128i bx10 = _mm_set_epi64x(h1[3] ^ h1[7], h1[2] ^ h1[6]); |
|||
__m128i bx11 = _mm_set_epi64x(h1[9] ^ h1[11], h1[8] ^ h1[10]); |
|||
|
|||
uint64_t idx0 = al0; |
|||
uint64_t idx1 = al1; |
|||
|
|||
for (size_t i = 0; __builtin_expect(i < 0x80000, 1); i++) { |
|||
__m128i cx0 = _mm_load_si128((__m128i *) &l0[idx0 & 0x1FFFF0]); |
|||
__m128i cx1 = _mm_load_si128((__m128i *) &l1[idx1 & 0x1FFFF0]); |
|||
|
|||
const __m128i ax0 = _mm_set_epi64x(ah0, al0); |
|||
const __m128i ax1 = _mm_set_epi64x(ah1, al1); |
|||
|
|||
cx0 = soft_aesenc(cx0, ax0); |
|||
cx1 = soft_aesenc(cx1, ax1); |
|||
|
|||
VARIANT4_SHUFFLE(l0, idx0 & 0x1FFFF0, ax0, bx00, bx01, cx0); |
|||
_mm_store_si128((__m128i *) &l0[idx0 & 0x1FFFF0], _mm_xor_si128(bx00, cx0)); |
|||
|
|||
VARIANT4_SHUFFLE(l1, idx1 & 0x1FFFF0, ax1, bx10, bx11, cx1); |
|||
_mm_store_si128((__m128i *) &l1[idx1 & 0x1FFFF0], _mm_xor_si128(bx10, cx1)); |
|||
|
|||
idx0 = _mm_cvtsi128_si64(cx0); |
|||
idx1 = _mm_cvtsi128_si64(cx1); |
|||
|
|||
uint64_t hi, lo, cl, ch; |
|||
cl = ((uint64_t*) &l0[idx0 & 0x1FFFF0])[0]; |
|||
ch = ((uint64_t*) &l0[idx0 & 0x1FFFF0])[1]; |
|||
|
|||
VARIANT4_RANDOM_MATH(0, al0, ah0, cl, bx00, bx01); |
|||
al0 ^= r0[2] | ((uint64_t)(r0[3]) << 32); |
|||
ah0 ^= r0[0] | ((uint64_t)(r0[1]) << 32); |
|||
|
|||
lo = _umul128(idx0, cl, &hi); |
|||
VARIANT4_SHUFFLE(l0, idx0 & 0x1FFFF0, ax0, bx00, bx01, cx0); |
|||
|
|||
al0 += hi; |
|||
ah0 += lo; |
|||
|
|||
((uint64_t*)&l0[idx0 & 0x1FFFF0])[0] = al0; |
|||
((uint64_t*)&l0[idx0 & 0x1FFFF0])[1] = ah0; |
|||
|
|||
al0 ^= cl; |
|||
ah0 ^= ch; |
|||
idx0 = al0; |
|||
|
|||
cl = ((uint64_t*) &l1[idx1 & 0x1FFFF0])[0]; |
|||
ch = ((uint64_t*) &l1[idx1 & 0x1FFFF0])[1]; |
|||
|
|||
VARIANT4_RANDOM_MATH(1, al1, ah1, cl, bx10, bx11); |
|||
al1 ^= r1[2] | ((uint64_t)(r1[3]) << 32); |
|||
ah1 ^= r1[0] | ((uint64_t)(r1[1]) << 32); |
|||
|
|||
lo = _umul128(idx1, cl, &hi); |
|||
VARIANT4_SHUFFLE(l1, idx1 & 0x1FFFF0, ax1, bx10, bx11, cx1); |
|||
|
|||
al1 += hi; |
|||
ah1 += lo; |
|||
|
|||
((uint64_t*)&l1[idx1 & 0x1FFFF0])[0] = al1; |
|||
((uint64_t*)&l1[idx1 & 0x1FFFF0])[1] = ah1; |
|||
|
|||
al1 ^= cl; |
|||
ah1 ^= ch; |
|||
idx1 = al1; |
|||
|
|||
bx01 = bx00; |
|||
bx11 = bx10; |
|||
|
|||
bx00 = cx0; |
|||
bx10 = cx1; |
|||
} |
|||
|
|||
cn_implode_scratchpad((__m128i*) l0, (__m128i*) h0); |
|||
cn_implode_scratchpad((__m128i*) l1, (__m128i*) h1); |
|||
|
|||
keccakf(h0, 24); |
|||
keccakf(h1, 24); |
|||
|
|||
extra_hashes[ctx[0]->state[0] & 3](ctx[0]->state, 200, output); |
|||
extra_hashes[ctx[1]->state[0] & 3](ctx[1]->state, 200, output + 32); |
|||
} |
@ -0,0 +1,129 @@ |
|||
/* XMRig
|
|||
* Copyright 2010 Jeff Garzik <jgarzik@pobox.com> |
|||
* Copyright 2012-2014 pooler <pooler@litecoinpool.org> |
|||
* Copyright 2014 Lucas Jones <https://github.com/lucasjones>
|
|||
* Copyright 2014-2016 Wolf9466 <https://github.com/OhGodAPet>
|
|||
* Copyright 2016 Jay D Dee <jayddee246@gmail.com> |
|||
* Copyright 2017-2018 XMR-Stak <https://github.com/fireice-uk>, <https://github.com/psychocrypt>
|
|||
* Copyright 2018 Lee Clagett <https://github.com/vtnerd>
|
|||
* Copyright 2018-2019 SChernykh <https://github.com/SChernykh>
|
|||
* Copyright 2016-2019 XMRig <https://github.com/xmrig>, <support@xmrig.com>
|
|||
* |
|||
* This program is free software: you can redistribute it and/or modify |
|||
* it under the terms of the GNU General Public License as published by |
|||
* the Free Software Foundation, either version 3 of the License, or |
|||
* (at your option) any later version. |
|||
* |
|||
* This program is distributed in the hope that it will be useful, |
|||
* but WITHOUT ANY WARRANTY; without even the implied warranty of |
|||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
|||
* GNU General Public License for more details. |
|||
* |
|||
* You should have received a copy of the GNU General Public License |
|||
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|||
*/ |
|||
|
|||
#ifndef XMRIG_CRYPTONIGHT_TEST_H |
|||
#define XMRIG_CRYPTONIGHT_TEST_H |
|||
|
|||
|
|||
#include <stdint.h> |
|||
|
|||
|
|||
/*
 * Self-test input: 152 bytes laid out as two consecutive 76-byte groups.
 * The storage-class specifier is written first ("static const"), as
 * placing it after a qualifier is an obsolescent form in C.
 */
static const uint8_t test_input[152] = {
    /* bytes 0..75 */
    0x03, 0x05, 0xA0, 0xDB, 0xD6, 0xBF, 0x05, 0xCF, 0x16, 0xE5, 0x03, 0xF3, 0xA6, 0x6F, 0x78, 0x00,
    0x7C, 0xBF, 0x34, 0x14, 0x43, 0x32, 0xEC, 0xBF, 0xC2, 0x2E, 0xD9, 0x5C, 0x87, 0x00, 0x38, 0x3B,
    0x30, 0x9A, 0xCE, 0x19, 0x23, 0xA0, 0x96, 0x4B, 0x00, 0x00, 0x00, 0x08, 0xBA, 0x93, 0x9A, 0x62,
    0x72, 0x4C, 0x0D, 0x75, 0x81, 0xFC, 0xE5, 0x76, 0x1E, 0x9D, 0x8A, 0x0E, 0x6A, 0x1C, 0x3F, 0x92,
    0x4F, 0xDD, 0x84, 0x93, 0xD1, 0x11, 0x56, 0x49, 0xC0, 0x5E, 0xB6, 0x01,
    /* bytes 76..151 */
    0x01, 0x00, 0xFB, 0x8E, 0x8A, 0xC8, 0x05, 0x89, 0x93, 0x23, 0x37, 0x1B, 0xB7, 0x90, 0xDB, 0x19,
    0x21, 0x8A, 0xFD, 0x8D, 0xB8, 0xE3, 0x75, 0x5D, 0x8B, 0x90, 0xF3, 0x9B, 0x3D, 0x55, 0x06, 0xA9,
    0xAB, 0xCE, 0x4F, 0xA9, 0x12, 0x24, 0x45, 0x00, 0x00, 0x00, 0x00, 0xEE, 0x81, 0x46, 0xD4, 0x9F,
    0xA9, 0x3E, 0xE7, 0x24, 0xDE, 0xB5, 0x7D, 0x12, 0xCB, 0xC6, 0xC6, 0xF3, 0xB9, 0x24, 0xD9, 0x46,
    0x12, 0x7C, 0x7A, 0x97, 0x41, 0x8F, 0x93, 0x48, 0x82, 0x8F, 0x0F, 0x02
};
|||
|
|||
|
|||
/*
 * Reference output "v0": 64 bytes, listed as two rows of 32 bytes each.
 * Declared "static const" (storage-class specifier first).
 */
static const uint8_t test_output_v0[64] = {
    0x1A, 0x3F, 0xFB, 0xEE, 0x90, 0x9B, 0x42, 0x0D, 0x91, 0xF7, 0xBE, 0x6E, 0x5F, 0xB5, 0x6D, 0xB7,
    0x1B, 0x31, 0x10, 0xD8, 0x86, 0x01, 0x1E, 0x87, 0x7E, 0xE5, 0x78, 0x6A, 0xFD, 0x08, 0x01, 0x00,
    0x1B, 0x60, 0x6A, 0x3F, 0x4A, 0x07, 0xD6, 0x48, 0x9A, 0x1B, 0xCD, 0x07, 0x69, 0x7B, 0xD1, 0x66,
    0x96, 0xB6, 0x1C, 0x8A, 0xE9, 0x82, 0xF6, 0x1A, 0x90, 0x16, 0x0F, 0x4E, 0x52, 0x82, 0x8A, 0x7F
};
|||
|
|||
|
|||
// Cryptonight variant 1 (Monero v7)
// Reference output: 64 bytes, listed as two rows of 32 bytes each.
// Declared "static const" (storage-class specifier first).
static const uint8_t test_output_v1[64] = {
    0xF2, 0x2D, 0x3D, 0x62, 0x03, 0xD2, 0xA0, 0x8B, 0x41, 0xD9, 0x02, 0x72, 0x78, 0xD8, 0xBC, 0xC9,
    0x83, 0xAC, 0xAD, 0xA9, 0xB6, 0x8E, 0x52, 0xE3, 0xC6, 0x89, 0x69, 0x2A, 0x50, 0xE9, 0x21, 0xD9,
    0xC9, 0xFA, 0xE8, 0x42, 0x5D, 0x86, 0x88, 0xDC, 0x23, 0x6B, 0xCD, 0xBC, 0x42, 0xFD, 0xB4, 0x2D,
    0x37, 0x6C, 0x6E, 0xC1, 0x90, 0x50, 0x1A, 0xA8, 0x4B, 0x04, 0xA4, 0xB4, 0xCF, 0x1E, 0xE1, 0x22
};
|||
|
|||
|
|||
// Cryptonight variant 2 (Monero v8)
// Reference output: 64 bytes, listed as two rows of 32 bytes each.
// Declared "static const" (storage-class specifier first).
static const uint8_t test_output_v2[64] = {
    0x97, 0x37, 0x82, 0x82, 0xCF, 0x10, 0xE7, 0xAD, 0x03, 0x3F, 0x7B, 0x80, 0x74, 0xC4, 0x0E, 0x14,
    0xD0, 0x6E, 0x7F, 0x60, 0x9D, 0xDD, 0xDA, 0x78, 0x76, 0x80, 0xB5, 0x8C, 0x05, 0xF4, 0x3D, 0x21,
    0x87, 0x1F, 0xCD, 0x68, 0x23, 0xF6, 0xA8, 0x79, 0xBB, 0x3F, 0x33, 0x95, 0x1C, 0x8E, 0x8E, 0x89,
    0x1D, 0x40, 0x43, 0x88, 0x0B, 0x02, 0xDF, 0xA1, 0xBB, 0x3B, 0xE4, 0x98, 0xB5, 0x0E, 0x75, 0x78
};
|||
|
|||
|
|||
/*
 * One cn/r self-test vector.
 *
 *   height - block height associated with the vector.
 *   size   - number of leading bytes of `data` that are initialised below.
 *   data   - input bytes; the remainder of the array is zero-filled.
 */
struct cn_r_test_input_data
{
    uint64_t height;
    size_t size;
    uint8_t data[64];
};


/*
 * cn/r self-test vectors for heights 1806260..1806269.  The data bytes are
 * ASCII text (a repeated test phrase and Lorem ipsum fragments).
 * Declared "static const" (storage-class specifier first).
 */
static const struct cn_r_test_input_data cn_r_test_input[] = {
    { 1806260, 44, { 0x54, 0x68, 0x69, 0x73, 0x20, 0x69, 0x73, 0x20, 0x61, 0x20, 0x74, 0x65, 0x73, 0x74, 0x20, 0x54, 0x68, 0x69, 0x73, 0x20, 0x69, 0x73, 0x20, 0x61, 0x20, 0x74, 0x65, 0x73, 0x74, 0x20, 0x54, 0x68, 0x69, 0x73, 0x20, 0x69, 0x73, 0x20, 0x61, 0x20, 0x74, 0x65, 0x73, 0x74 } },
    { 1806261, 50, { 0x4c, 0x6f, 0x72, 0x65, 0x6d, 0x20, 0x69, 0x70, 0x73, 0x75, 0x6d, 0x20, 0x64, 0x6f, 0x6c, 0x6f, 0x72, 0x20, 0x73, 0x69, 0x74, 0x20, 0x61, 0x6d, 0x65, 0x74, 0x2c, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x65, 0x63, 0x74, 0x65, 0x74, 0x75, 0x72, 0x20, 0x61, 0x64, 0x69, 0x70, 0x69, 0x73, 0x63, 0x69, 0x6e, 0x67 } },
    { 1806262, 48, { 0x65, 0x6c, 0x69, 0x74, 0x2c, 0x20, 0x73, 0x65, 0x64, 0x20, 0x64, 0x6f, 0x20, 0x65, 0x69, 0x75, 0x73, 0x6d, 0x6f, 0x64, 0x20, 0x74, 0x65, 0x6d, 0x70, 0x6f, 0x72, 0x20, 0x69, 0x6e, 0x63, 0x69, 0x64, 0x69, 0x64, 0x75, 0x6e, 0x74, 0x20, 0x75, 0x74, 0x20, 0x6c, 0x61, 0x62, 0x6f, 0x72, 0x65 } },
    { 1806263, 48, { 0x65, 0x74, 0x20, 0x64, 0x6f, 0x6c, 0x6f, 0x72, 0x65, 0x20, 0x6d, 0x61, 0x67, 0x6e, 0x61, 0x20, 0x61, 0x6c, 0x69, 0x71, 0x75, 0x61, 0x2e, 0x20, 0x55, 0x74, 0x20, 0x65, 0x6e, 0x69, 0x6d, 0x20, 0x61, 0x64, 0x20, 0x6d, 0x69, 0x6e, 0x69, 0x6d, 0x20, 0x76, 0x65, 0x6e, 0x69, 0x61, 0x6d, 0x2c } },
    { 1806264, 46, { 0x71, 0x75, 0x69, 0x73, 0x20, 0x6e, 0x6f, 0x73, 0x74, 0x72, 0x75, 0x64, 0x20, 0x65, 0x78, 0x65, 0x72, 0x63, 0x69, 0x74, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x75, 0x6c, 0x6c, 0x61, 0x6d, 0x63, 0x6f, 0x20, 0x6c, 0x61, 0x62, 0x6f, 0x72, 0x69, 0x73, 0x20, 0x6e, 0x69, 0x73, 0x69 } },
    { 1806265, 45, { 0x75, 0x74, 0x20, 0x61, 0x6c, 0x69, 0x71, 0x75, 0x69, 0x70, 0x20, 0x65, 0x78, 0x20, 0x65, 0x61, 0x20, 0x63, 0x6f, 0x6d, 0x6d, 0x6f, 0x64, 0x6f, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x65, 0x71, 0x75, 0x61, 0x74, 0x2e, 0x20, 0x44, 0x75, 0x69, 0x73, 0x20, 0x61, 0x75, 0x74, 0x65 } },
    { 1806266, 47, { 0x69, 0x72, 0x75, 0x72, 0x65, 0x20, 0x64, 0x6f, 0x6c, 0x6f, 0x72, 0x20, 0x69, 0x6e, 0x20, 0x72, 0x65, 0x70, 0x72, 0x65, 0x68, 0x65, 0x6e, 0x64, 0x65, 0x72, 0x69, 0x74, 0x20, 0x69, 0x6e, 0x20, 0x76, 0x6f, 0x6c, 0x75, 0x70, 0x74, 0x61, 0x74, 0x65, 0x20, 0x76, 0x65, 0x6c, 0x69, 0x74 } },
    { 1806267, 44, { 0x65, 0x73, 0x73, 0x65, 0x20, 0x63, 0x69, 0x6c, 0x6c, 0x75, 0x6d, 0x20, 0x64, 0x6f, 0x6c, 0x6f, 0x72, 0x65, 0x20, 0x65, 0x75, 0x20, 0x66, 0x75, 0x67, 0x69, 0x61, 0x74, 0x20, 0x6e, 0x75, 0x6c, 0x6c, 0x61, 0x20, 0x70, 0x61, 0x72, 0x69, 0x61, 0x74, 0x75, 0x72, 0x2e } },
    { 1806268, 47, { 0x45, 0x78, 0x63, 0x65, 0x70, 0x74, 0x65, 0x75, 0x72, 0x20, 0x73, 0x69, 0x6e, 0x74, 0x20, 0x6f, 0x63, 0x63, 0x61, 0x65, 0x63, 0x61, 0x74, 0x20, 0x63, 0x75, 0x70, 0x69, 0x64, 0x61, 0x74, 0x61, 0x74, 0x20, 0x6e, 0x6f, 0x6e, 0x20, 0x70, 0x72, 0x6f, 0x69, 0x64, 0x65, 0x6e, 0x74, 0x2c } },
    { 1806269, 62, { 0x73, 0x75, 0x6e, 0x74, 0x20, 0x69, 0x6e, 0x20, 0x63, 0x75, 0x6c, 0x70, 0x61, 0x20, 0x71, 0x75, 0x69, 0x20, 0x6f, 0x66, 0x66, 0x69, 0x63, 0x69, 0x61, 0x20, 0x64, 0x65, 0x73, 0x65, 0x72, 0x75, 0x6e, 0x74, 0x20, 0x6d, 0x6f, 0x6c, 0x6c, 0x69, 0x74, 0x20, 0x61, 0x6e, 0x69, 0x6d, 0x20, 0x69, 0x64, 0x20, 0x65, 0x73, 0x74, 0x20, 0x6c, 0x61, 0x62, 0x6f, 0x72, 0x75, 0x6d, 0x2e } },
};
|||
|
|||
|
|||
// "cn/r"
|
|||
// Expected output bytes for the "cn/r" self-test: 10 rows of 32 bytes each (320 bytes in total).
static const uint8_t test_output_r[] = {
    0xf7, 0x59, 0x58, 0x8a, 0xd5, 0x7e, 0x75, 0x84, 0x67, 0x29, 0x54, 0x43, 0xa9, 0xbd, 0x71, 0x49, 0x0a, 0xbf, 0xf8, 0xe9, 0xda, 0xd1, 0xb9, 0x5b, 0x6b, 0xf2, 0xf5, 0xd0, 0xd7, 0x83, 0x87, 0xbc,
    0x5b, 0xb8, 0x33, 0xde, 0xca, 0x2b, 0xdd, 0x72, 0x52, 0xa9, 0xcc, 0xd7, 0xb4, 0xce, 0x0b, 0x6a, 0x48, 0x54, 0x51, 0x57, 0x94, 0xb5, 0x6c, 0x20, 0x72, 0x62, 0xf7, 0xa5, 0xb9, 0xbd, 0xb5, 0x66,
    0x1e, 0xe6, 0x72, 0x8d, 0xa6, 0x0f, 0xbd, 0x8d, 0x7d, 0x55, 0xb2, 0xb1, 0xad, 0xe4, 0x87, 0xa3, 0xcf, 0x52, 0xa2, 0xc3, 0xac, 0x6f, 0x52, 0x0d, 0xb1, 0x2c, 0x27, 0xd8, 0x92, 0x1f, 0x6c, 0xab,
    0x69, 0x69, 0xfe, 0x2d, 0xdf, 0xb7, 0x58, 0x43, 0x8d, 0x48, 0x04, 0x9f, 0x30, 0x2f, 0xc2, 0x10, 0x8a, 0x4f, 0xcc, 0x93, 0xe3, 0x76, 0x69, 0x17, 0x0e, 0x6d, 0xb4, 0xb0, 0xb9, 0xb4, 0xc4, 0xcb,
    0x7f, 0x30, 0x48, 0xb4, 0xe9, 0x0d, 0x0c, 0xbe, 0x7a, 0x57, 0xc0, 0x39, 0x4f, 0x37, 0x33, 0x8a, 0x01, 0xfa, 0xe3, 0xad, 0xfd, 0xc0, 0xe5, 0x12, 0x6d, 0x86, 0x3a, 0x89, 0x5e, 0xb0, 0x4e, 0x02,
    0x1d, 0x29, 0x04, 0x43, 0xa4, 0xb5, 0x42, 0xaf, 0x04, 0xa8, 0x2f, 0x6b, 0x24, 0x94, 0xa6, 0xee, 0x7f, 0x20, 0xf2, 0x75, 0x4c, 0x58, 0xe0, 0x84, 0x90, 0x32, 0x48, 0x3a, 0x56, 0xe8, 0xe2, 0xef,
    0xc4, 0x3c, 0xc6, 0x56, 0x74, 0x36, 0xa8, 0x6a, 0xfb, 0xd6, 0xaa, 0x9e, 0xaa, 0x7c, 0x27, 0x6e, 0x98, 0x06, 0x83, 0x03, 0x34, 0xb6, 0x14, 0xb2, 0xbe, 0xe2, 0x3c, 0xc7, 0x66, 0x34, 0xf6, 0xfd,
    0x87, 0xbe, 0x24, 0x79, 0xc0, 0xc4, 0xe8, 0xed, 0xfd, 0xfa, 0xa5, 0x60, 0x3e, 0x93, 0xf4, 0x26, 0x5b, 0x3f, 0x82, 0x24, 0xc1, 0xc5, 0x94, 0x6f, 0xeb, 0x42, 0x48, 0x19, 0xd1, 0x89, 0x90, 0xa4,
    0xdd, 0x9d, 0x6a, 0x6d, 0x8e, 0x47, 0x46, 0x5c, 0xce, 0xac, 0x08, 0x77, 0xef, 0x88, 0x9b, 0x93, 0xe7, 0xeb, 0xa9, 0x79, 0x55, 0x7e, 0x39, 0x35, 0xd7, 0xf8, 0x6d, 0xce, 0x11, 0xb0, 0x70, 0xf3,
    0x75, 0xc6, 0xf2, 0xae, 0x49, 0xa2, 0x05, 0x21, 0xde, 0x97, 0x28, 0x5b, 0x43, 0x1e, 0x71, 0x71, 0x25, 0x84, 0x7f, 0xb8, 0x93, 0x5e, 0xd8, 0x4a, 0x61, 0xe7, 0xf8, 0xd3, 0x6a, 0x2c, 0x3d, 0x8e,
};
|||
|
|||
|
|||
#ifndef XMRIG_NO_AEON
// Expected output bytes for the "lite" variant 0 self-test (64 bytes)
static const uint8_t test_output_v0_lite[64] = {
    0x36, 0x95, 0xB4, 0xB5, 0x3B, 0xB0, 0x03, 0x58, 0xB0, 0xAD, 0x38, 0xDC, 0x16, 0x0F, 0xEB, 0x9E,
    0x00, 0x4E, 0xEC, 0xE0, 0x9B, 0x83, 0xA7, 0x2E, 0xF6, 0xBA, 0x98, 0x64, 0xD3, 0x51, 0x0C, 0x88,
    0x28, 0xA2, 0x2B, 0xAD, 0x3F, 0x93, 0xD1, 0x40, 0x8F, 0xCA, 0x47, 0x2E, 0xB5, 0xAD, 0x1C, 0xBE,
    0x75, 0xF2, 0x1D, 0x05, 0x3C, 0x8C, 0xE5, 0xB3, 0xAF, 0x10, 0x5A, 0x57, 0x71, 0x3E, 0x21, 0xDD
};


// AEON v7: expected output bytes for the "lite" variant 1 self-test (64 bytes)
static const uint8_t test_output_v1_lite[64] = {
    0x6D, 0x8C, 0xDC, 0x44, 0x4E, 0x9B, 0xBB, 0xFD, 0x68, 0xFC, 0x43, 0xFC, 0xD4, 0x85, 0x5B, 0x22,
    0x8C, 0x8A, 0x1B, 0xD9, 0x1D, 0x9D, 0x00, 0x28, 0x5B, 0xEC, 0x02, 0xB7, 0xCA, 0x2D, 0x67, 0x41,
    0x87, 0xC4, 0xE5, 0x70, 0x65, 0x3E, 0xB4, 0xC2, 0xB4, 0x2B, 0x7A, 0x0D, 0x54, 0x65, 0x59, 0x45,
    0x2D, 0xFA, 0xB5, 0x73, 0xB8, 0x2E, 0xC5, 0x2F, 0x15, 0x2B, 0x7F, 0xF9, 0x8E, 0x79, 0x44, 0x6F
};
#endif
|||
|
|||
|
|||
#endif /* XMRIG_CRYPTONIGHT_TEST_H */ |
@ -0,0 +1,449 @@ |
|||
#ifndef VARIANT4_RANDOM_MATH_H |
|||
#define VARIANT4_RANDOM_MATH_H |
|||
|
|||
|
|||
#include <stdint.h> |
|||
#include <string.h> |
|||
#include <stdbool.h> |
|||
|
|||
|
|||
#include "crypto/c_blake256.h" |
|||
|
|||
|
|||
// Tuning constants of the random math program generator
enum V4_Settings
{
    // Target latency of a generated program on the modelled CPU:
    // 45 cycles, the cost of 15 dependent multiplications at 3 cycles each
    TOTAL_LATENCY = 45,

    // Lower bound on the number of generated instructions
    NUM_INSTRUCTIONS_MIN = 60,

    // Upper bound on the number of generated instructions (the terminating RET is not counted)
    NUM_INSTRUCTIONS_MAX = 70,

    // Number of ALUs able to execute MUL
    // (modern CPUs typically have a single multiplier)
    ALU_COUNT_MUL = 1,

    // Number of ALUs available overall
    // (modern CPUs have 4, but only 3 are assumed free because random math
    // runs interleaved with the rest of the main loop)
    ALU_COUNT = 3,
};
|||
|
|||
// Opcodes understood by the random math interpreter ("a" is the destination, "b" the source)
enum V4_InstructionList
{
    MUL = 0, // a = a * b
    ADD = 1, // a = a + b + C, where C is an unsigned 32-bit constant
    SUB = 2, // a = a - b
    ROR = 3, // a = a rotated right by (b & 31) bits
    ROL = 4, // a = a rotated left by (b & 31) bits
    XOR = 5, // a = a ^ b
    RET = 6, // stop execution

    // Number of real (non-RET) instructions; used to size per-opcode tables
    V4_INSTRUCTION_COUNT = RET,
};
|||
|
|||
// Bit layout used to turn random bytes into instructions.
// Any sequence of random bytes decodes to valid code.
//
// The register file has 9 entries:
// - 4 variable registers (the only ones that can be a destination, hence 2 bits for dst_index)
// - 5 constant registers initialized from loop variables
enum V4_InstructionDefinition
{
    V4_OPCODE_BITS = 3,    // width of the opcode field
    V4_DST_INDEX_BITS = 2, // width of the destination register field
    V4_SRC_INDEX_BITS = 3, // width of the source register field
};
|||
|
|||
// One decoded random math instruction
struct V4_Instruction
{
    uint8_t opcode;    // one of enum V4_InstructionList
    uint8_t dst_index; // index of the destination register
    uint8_t src_index; // index of the source register
    uint32_t C;        // 32-bit constant added by ADD; written as 0 for all other opcodes by the generator
};
|||
|
|||
// Force-inline qualifier; only defined here when the build has not supplied one already
#ifndef FORCEINLINE
#if defined(__GNUC__)
#define FORCEINLINE __attribute__((always_inline)) inline
#elif defined(_MSC_VER)
#define FORCEINLINE __forceinline
#else
#define FORCEINLINE inline
#endif
#endif

// Marks a code path the compiler may assume is never taken; expands to nothing on unknown compilers
#ifndef UNREACHABLE_CODE
#if defined(__GNUC__)
#define UNREACHABLE_CODE __builtin_unreachable()
#elif defined(_MSC_VER)
#define UNREACHABLE_CODE __assume(false)
#else
#define UNREACHABLE_CODE
#endif
#endif

// Little-endian conversion helpers. Both are the identity here.
// NOTE(review): that is only correct on a little-endian host - confirm no big-endian target uses this header.
// The argument is parenthesized so that an expression argument keeps its grouping inside a larger expression.
#define SWAP32LE(x) (x)
#define SWAP64LE(x) (x)

// Hashes "length" bytes at "data" with blake256 and stores the digest at "hash"
#define hash_extra_blake(data, length, hash) blake256_hash((uint8_t*)(hash), (uint8_t*)(data), (length))
|||
|
|||
// Random math interpreter's loop is fully unrolled and inlined to achieve 100% branch prediction on CPU:
|
|||
// every switch-case will point to the same destination on every iteration of Cryptonight main loop
|
|||
//
|
|||
// This is about as fast as it can get without using low-level machine code generation
|
|||
//template<typename v4_reg>
|
|||
static void v4_random_math(const struct V4_Instruction* code, uint32_t r[9]) |
|||
{ |
|||
#define REG_BITS 32 |
|||
#define V4_EXEC(i) \ |
|||
{ \ |
|||
const struct V4_Instruction* op = code + i; \ |
|||
const uint32_t src = r[op->src_index]; \ |
|||
uint32_t *dst = r + op->dst_index; \ |
|||
switch (op->opcode) \ |
|||
{ \ |
|||
case MUL: \ |
|||
*dst *= src; \ |
|||
break; \ |
|||
case ADD: \ |
|||
*dst += src + op->C; \ |
|||
break; \ |
|||
case SUB: \ |
|||
*dst -= src; \ |
|||
break; \ |
|||
case ROR: \ |
|||
{ \ |
|||
const uint32_t shift = src % REG_BITS; \ |
|||
*dst = (*dst >> shift) | (*dst << ((REG_BITS - shift) % REG_BITS)); \ |
|||
} \ |
|||
break; \ |
|||
case ROL: \ |
|||
{ \ |
|||
const uint32_t shift = src % REG_BITS; \ |
|||
*dst = (*dst << shift) | (*dst >> ((REG_BITS - shift) % REG_BITS)); \ |
|||
} \ |
|||
break; \ |
|||
case XOR: \ |
|||
*dst ^= src; \ |
|||
break; \ |
|||
case RET: \ |
|||
return; \ |
|||
default: \ |
|||
UNREACHABLE_CODE; \ |
|||
break; \ |
|||
} \ |
|||
} |
|||
|
|||
#define V4_EXEC_10(j) \ |
|||
V4_EXEC(j + 0) \ |
|||
V4_EXEC(j + 1) \ |
|||
V4_EXEC(j + 2) \ |
|||
V4_EXEC(j + 3) \ |
|||
V4_EXEC(j + 4) \ |
|||
V4_EXEC(j + 5) \ |
|||
V4_EXEC(j + 6) \ |
|||
V4_EXEC(j + 7) \ |
|||
V4_EXEC(j + 8) \ |
|||
V4_EXEC(j + 9) |
|||
|
|||
// Generated program can have 60 + a few more (usually 2-3) instructions to achieve required latency
|
|||
// I've checked all block heights < 10,000,000 and here is the distribution of program sizes:
|
|||
//
|
|||
// 60 27960
|
|||
// 61 105054
|
|||
// 62 2452759
|
|||
// 63 5115997
|
|||
// 64 1022269
|
|||
// 65 1109635
|
|||
// 66 153145
|
|||
// 67 8550
|
|||
// 68 4529
|
|||
// 69 102
|
|||
|
|||
// Unroll 70 instructions here
|
|||
V4_EXEC_10(0); // instructions 0-9
|
|||
V4_EXEC_10(10); // instructions 10-19
|
|||
V4_EXEC_10(20); // instructions 20-29
|
|||
V4_EXEC_10(30); // instructions 30-39
|
|||
V4_EXEC_10(40); // instructions 40-49
|
|||
V4_EXEC_10(50); // instructions 50-59
|
|||
V4_EXEC_10(60); // instructions 60-69
|
|||
|
|||
#undef V4_EXEC_10 |
|||
#undef V4_EXEC |
|||
#undef REG_BITS |
|||
} |
|||
|
|||
// If we don't have enough data available, generate more
|
|||
static FORCEINLINE void check_data(size_t* data_index, const size_t bytes_needed, int8_t* data, const size_t data_size) |
|||
{ |
|||
if (*data_index + bytes_needed > data_size) |
|||
{ |
|||
hash_extra_blake(data, data_size, (char*) data); |
|||
*data_index = 0; |
|||
} |
|||
} |
|||
|
|||
// Generates as many random math operations as possible with given latency and ALU restrictions
|
|||
// "code" array must have space for NUM_INSTRUCTIONS_MAX+1 instructions
|
|||
static int v4_random_math_init(struct V4_Instruction* code, const uint64_t height) |
|||
{ |
|||
// MUL is 3 cycles, 3-way addition and rotations are 2 cycles, SUB/XOR are 1 cycle
|
|||
// These latencies match real-life instruction latencies for Intel CPUs starting from Sandy Bridge and up to Skylake/Coffee lake
|
|||
//
|
|||
// AMD Ryzen has the same latencies except 1-cycle ROR/ROL, so it'll be a bit faster than Intel Sandy Bridge and newer processors
|
|||
// Surprisingly, Intel Nehalem also has 1-cycle ROR/ROL, so it'll also be faster than Intel Sandy Bridge and newer processors
|
|||
// AMD Bulldozer has 4 cycles latency for MUL (slower than Intel) and 1 cycle for ROR/ROL (faster than Intel), so average performance will be the same
|
|||
// Source: https://www.agner.org/optimize/instruction_tables.pdf
|
|||
const int op_latency[V4_INSTRUCTION_COUNT] = { 3, 2, 1, 2, 2, 1 }; |
|||
|
|||
// Instruction latencies for theoretical ASIC implementation
|
|||
const int asic_op_latency[V4_INSTRUCTION_COUNT] = { 3, 1, 1, 1, 1, 1 }; |
|||
|
|||
// Available ALUs for each instruction
|
|||
const int op_ALUs[V4_INSTRUCTION_COUNT] = { ALU_COUNT_MUL, ALU_COUNT, ALU_COUNT, ALU_COUNT, ALU_COUNT, ALU_COUNT }; |
|||
|
|||
int8_t data[32]; |
|||
memset(data, 0, sizeof(data)); |
|||
uint64_t tmp = SWAP64LE(height); |
|||
memcpy(data, &tmp, sizeof(uint64_t)); |
|||
data[20] = -38; |
|||
|
|||
// Set data_index past the last byte in data
|
|||
// to trigger full data update with blake hash
|
|||
// before we start using it
|
|||
size_t data_index = sizeof(data); |
|||
|
|||
int code_size; |
|||
|
|||
// There is a small chance (1.8%) that register R8 won't be used in the generated program
|
|||
// So we keep track of it and try again if it's not used
|
|||
bool r8_used; |
|||
do { |
|||
int latency[9]; |
|||
int asic_latency[9]; |
|||
|
|||
// Tracks previous instruction and value of the source operand for registers R0-R3 throughout code execution
|
|||
// byte 0: current value of the destination register
|
|||
// byte 1: instruction opcode
|
|||
// byte 2: current value of the source register
|
|||
//
|
|||
// Registers R4-R8 are constant and are treated as having the same value because when we do
|
|||
// the same operation twice with two constant source registers, it can be optimized into a single operation
|
|||
uint32_t inst_data[9] = { 0, 1, 2, 3, 0xFFFFFF, 0xFFFFFF, 0xFFFFFF, 0xFFFFFF, 0xFFFFFF }; |
|||
|
|||
bool alu_busy[TOTAL_LATENCY + 1][ALU_COUNT]; |
|||
bool is_rotation[V4_INSTRUCTION_COUNT]; |
|||
bool rotated[4]; |
|||
int rotate_count = 0; |
|||
|
|||
memset(latency, 0, sizeof(latency)); |
|||
memset(asic_latency, 0, sizeof(asic_latency)); |
|||
memset(alu_busy, 0, sizeof(alu_busy)); |
|||
memset(is_rotation, 0, sizeof(is_rotation)); |
|||
memset(rotated, 0, sizeof(rotated)); |
|||
is_rotation[ROR] = true; |
|||
is_rotation[ROL] = true; |
|||
|
|||
int num_retries = 0; |
|||
code_size = 0; |
|||
|
|||
int total_iterations = 0; |
|||
r8_used = false; |
|||
|
|||
// Generate random code to achieve minimal required latency for our abstract CPU
|
|||
// Try to get this latency for all 4 registers
|
|||
while (((latency[0] < TOTAL_LATENCY) || (latency[1] < TOTAL_LATENCY) || (latency[2] < TOTAL_LATENCY) || (latency[3] < TOTAL_LATENCY)) && (num_retries < 64)) |
|||
{ |
|||
// Fail-safe to guarantee loop termination
|
|||
++total_iterations; |
|||
if (total_iterations > 256) |
|||
break; |
|||
|
|||
check_data(&data_index, 1, data, sizeof(data)); |
|||
|
|||
const uint8_t c = ((uint8_t*)data)[data_index++]; |
|||
|
|||
// MUL = opcodes 0-2
|
|||
// ADD = opcode 3
|
|||
// SUB = opcode 4
|
|||
// ROR/ROL = opcode 5, shift direction is selected randomly
|
|||
// XOR = opcodes 6-7
|
|||
uint8_t opcode = c & ((1 << V4_OPCODE_BITS) - 1); |
|||
if (opcode == 5) |
|||
{ |
|||
check_data(&data_index, 1, data, sizeof(data)); |
|||
opcode = (data[data_index++] >= 0) ? ROR : ROL; |
|||
} |
|||
else if (opcode >= 6) |
|||
{ |
|||
opcode = XOR; |
|||
} |
|||
else |
|||
{ |
|||
opcode = (opcode <= 2) ? MUL : (opcode - 2); |
|||
} |
|||
|
|||
uint8_t dst_index = (c >> V4_OPCODE_BITS) & ((1 << V4_DST_INDEX_BITS) - 1); |
|||
uint8_t src_index = (c >> (V4_OPCODE_BITS + V4_DST_INDEX_BITS)) & ((1 << V4_SRC_INDEX_BITS) - 1); |
|||
|
|||
const int a = dst_index; |
|||
int b = src_index; |
|||
|
|||
// Don't do ADD/SUB/XOR with the same register
|
|||
if (((opcode == ADD) || (opcode == SUB) || (opcode == XOR)) && (a == b)) |
|||
{ |
|||
// a is always < 4, so we don't need to check bounds here
|
|||
b = 8; |
|||
src_index = b; |
|||
} |
|||
|
|||
// Don't do rotation with the same destination twice because it's equal to a single rotation
|
|||
if (is_rotation[opcode] && rotated[a]) |
|||
{ |
|||
continue; |
|||
} |
|||
|
|||
// Don't do the same instruction (except MUL) with the same source value twice because all other cases can be optimized:
|
|||
// 2xADD(a, b, C) = ADD(a, b*2, C1+C2), same for SUB and rotations
|
|||
// 2xXOR(a, b) = NOP
|
|||
if ((opcode != MUL) && ((inst_data[a] & 0xFFFF00) == (opcode << 8) + ((inst_data[b] & 255) << 16))) |
|||
{ |
|||
continue; |
|||
} |
|||
|
|||
// Find which ALU is available (and when) for this instruction
|
|||
int next_latency = (latency[a] > latency[b]) ? latency[a] : latency[b]; |
|||
int alu_index = -1; |
|||
while (next_latency < TOTAL_LATENCY) |
|||
{ |
|||
for (int i = op_ALUs[opcode] - 1; i >= 0; --i) |
|||
{ |
|||
if (!alu_busy[next_latency][i]) |
|||
{ |
|||
// ADD is implemented as two 1-cycle instructions on a real CPU, so do an additional availability check
|
|||
if ((opcode == ADD) && alu_busy[next_latency + 1][i]) |
|||
{ |
|||
continue; |
|||
} |
|||
|
|||
// Rotation can only start when previous rotation is finished, so do an additional availability check
|
|||
if (is_rotation[opcode] && (next_latency < rotate_count * op_latency[opcode])) |
|||
{ |
|||
continue; |
|||
} |
|||
|
|||
alu_index = i; |
|||
break; |
|||
} |
|||
} |
|||
if (alu_index >= 0) |
|||
{ |
|||
break; |
|||
} |
|||
++next_latency; |
|||
} |
|||
|
|||
// Don't generate instructions that leave some register unchanged for more than 7 cycles
|
|||
if (next_latency > latency[a] + 7) |
|||
{ |
|||
continue; |
|||
} |
|||
|
|||
next_latency += op_latency[opcode]; |
|||
|
|||
if (next_latency <= TOTAL_LATENCY) |
|||
{ |
|||
if (is_rotation[opcode]) |
|||
{ |
|||
++rotate_count; |
|||
} |
|||
|
|||
// Mark ALU as busy only for the first cycle when it starts executing the instruction because ALUs are fully pipelined
|
|||
alu_busy[next_latency - op_latency[opcode]][alu_index] = true; |
|||
latency[a] = next_latency; |
|||
|
|||
// ASIC is supposed to have enough ALUs to run as many independent instructions per cycle as possible, so latency calculation for ASIC is simple
|
|||
asic_latency[a] = ((asic_latency[a] > asic_latency[b]) ? asic_latency[a] : asic_latency[b]) + asic_op_latency[opcode]; |
|||
|
|||
rotated[a] = is_rotation[opcode]; |
|||
|
|||
inst_data[a] = code_size + (opcode << 8) + ((inst_data[b] & 255) << 16); |
|||
|
|||
code[code_size].opcode = opcode; |
|||
code[code_size].dst_index = dst_index; |
|||
code[code_size].src_index = src_index; |
|||
code[code_size].C = 0; |
|||
|
|||
if (src_index == 8) |
|||
{ |
|||
r8_used = true; |
|||
} |
|||
|
|||
if (opcode == ADD) |
|||
{ |
|||
// ADD instruction is implemented as two 1-cycle instructions on a real CPU, so mark ALU as busy for the next cycle too
|
|||
alu_busy[next_latency - op_latency[opcode] + 1][alu_index] = true; |
|||
|
|||
// ADD instruction requires 4 more random bytes for 32-bit constant "C" in "a = a + b + C"
|
|||
check_data(&data_index, sizeof(uint32_t), data, sizeof(data)); |
|||
uint32_t t; |
|||
memcpy(&t, data + data_index, sizeof(uint32_t)); |
|||
code[code_size].C = SWAP32LE(t); |
|||
data_index += sizeof(uint32_t); |
|||
} |
|||
|
|||
++code_size; |
|||
if (code_size >= NUM_INSTRUCTIONS_MIN) |
|||
{ |
|||
break; |
|||
} |
|||
} |
|||
else |
|||
{ |
|||
++num_retries; |
|||
} |
|||
} |
|||
|
|||
// ASIC has more execution resources and can extract as much parallelism from the code as possible
|
|||
// We need to add a few more MUL and ROR instructions to achieve minimal required latency for ASIC
|
|||
// Get this latency for at least 1 of the 4 registers
|
|||
const int prev_code_size = code_size; |
|||
while ((code_size < NUM_INSTRUCTIONS_MAX) && (asic_latency[0] < TOTAL_LATENCY) && (asic_latency[1] < TOTAL_LATENCY) && (asic_latency[2] < TOTAL_LATENCY) && (asic_latency[3] < TOTAL_LATENCY)) |
|||
{ |
|||
int min_idx = 0; |
|||
int max_idx = 0; |
|||
for (int i = 1; i < 4; ++i) |
|||
{ |
|||
if (asic_latency[i] < asic_latency[min_idx]) min_idx = i; |
|||
if (asic_latency[i] > asic_latency[max_idx]) max_idx = i; |
|||
} |
|||
|
|||
const uint8_t pattern[3] = { ROR, MUL, MUL }; |
|||
const uint8_t opcode = pattern[(code_size - prev_code_size) % 3]; |
|||
latency[min_idx] = latency[max_idx] + op_latency[opcode]; |
|||
asic_latency[min_idx] = asic_latency[max_idx] + asic_op_latency[opcode]; |
|||
|
|||
code[code_size].opcode = opcode; |
|||
code[code_size].dst_index = min_idx; |
|||
code[code_size].src_index = max_idx; |
|||
code[code_size].C = 0; |
|||
++code_size; |
|||
} |
|||
|
|||
// There is ~98.15% chance that loop condition is false, so this loop will execute only 1 iteration most of the time
|
|||
// It never does more than 4 iterations for all block heights < 10,000,000
|
|||
} while (!r8_used || (code_size < NUM_INSTRUCTIONS_MIN) || (code_size > NUM_INSTRUCTIONS_MAX)); |
|||
|
|||
// It's guaranteed that NUM_INSTRUCTIONS_MIN <= code_size <= NUM_INSTRUCTIONS_MAX here
|
|||
// Add final instruction to stop the interpreter
|
|||
code[code_size].opcode = RET; |
|||
code[code_size].dst_index = 0; |
|||
code[code_size].src_index = 0; |
|||
code[code_size].C = 0; |
|||
|
|||
return code_size; |
|||
} |
|||
|
|||
#endif |
@ -0,0 +1,27 @@ |
|||
# Assembly support: when WITH_ASM is set, the target is not ARM and pointers are 8 bytes wide,
# build the assembly sources into the static library "xmrig-asm"; otherwise define XMRIG_NO_ASM.
if (WITH_ASM AND NOT XMRIG_ARM AND CMAKE_SIZEOF_VOID_P EQUAL 8) |
|||
set(XMRIG_ASM_LIBRARY "xmrig-asm") |
|||
|
|||
enable_language(ASM) |
|||
|
|||
# A GNU compiler on Windows uses the win64 variant of the main loop source
if (WIN32 AND CMAKE_C_COMPILER_ID MATCHES GNU) |
|||
set(XMRIG_ASM_FILES |
|||
"crypto/asm/win64/cn_main_loop.S" |
|||
"crypto/asm/CryptonightR_template.S" |
|||
) |
|||
else() |
|||
set(XMRIG_ASM_FILES |
|||
"crypto/asm/cn_main_loop.S" |
|||
"crypto/asm/CryptonightR_template.S" |
|||
) |
|||
endif() |
|||
|
|||
# NOTE(review): this sets a source property literally named "C" with no value.
# If the intent was to compile the .S files with the C compiler it would read "PROPERTY LANGUAGE C" - confirm.
set_property(SOURCE ${XMRIG_ASM_FILES} PROPERTY C) |
|||
|
|||
add_library(${XMRIG_ASM_LIBRARY} STATIC ${XMRIG_ASM_FILES}) |
|||
# C source of the run-time code generator that goes with the assembly templates
set(XMRIG_ASM_SOURCES "crypto/CryptonightR_gen.c") |
|||
set_property(TARGET ${XMRIG_ASM_LIBRARY} PROPERTY LINKER_LANGUAGE C) |
|||
else() |
|||
# No assembly: empty variables, and code can test XMRIG_NO_ASM
set(XMRIG_ASM_SOURCES "") |
|||
set(XMRIG_ASM_LIBRARY "") |
|||
add_definitions(/DXMRIG_NO_ASM) |
|||
endif() |
@ -0,0 +1,146 @@ |
|||
/* XMRig
|
|||
* Copyright 2010 Jeff Garzik <jgarzik@pobox.com> |
|||
* Copyright 2012-2014 pooler <pooler@litecoinpool.org> |
|||
* Copyright 2014 Lucas Jones <https://github.com/lucasjones>
|
|||
* Copyright 2014-2016 Wolf9466 <https://github.com/OhGodAPet>
|
|||
* Copyright 2016 Jay D Dee <jayddee246@gmail.com> |
|||
* Copyright 2017-2018 XMR-Stak <https://github.com/fireice-uk>, <https://github.com/psychocrypt>
|
|||
* Copyright 2018 Lee Clagett <https://github.com/vtnerd>
|
|||
* Copyright 2018-2019 SChernykh <https://github.com/SChernykh>
|
|||
* Copyright 2016-2019 XMRig <https://github.com/xmrig>, <support@xmrig.com>
|
|||
* |
|||
* This program is free software: you can redistribute it and/or modify |
|||
* it under the terms of the GNU General Public License as published by |
|||
* the Free Software Foundation, either version 3 of the License, or |
|||
* (at your option) any later version. |
|||
* |
|||
* This program is distributed in the hope that it will be useful, |
|||
* but WITHOUT ANY WARRANTY; without even the implied warranty of |
|||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
|||
* GNU General Public License for more details. |
|||
* |
|||
* You should have received a copy of the GNU General Public License |
|||
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|||
*/ |
|||
|
|||
#include <string.h> |
|||
|
|||
#include "algo/cryptonight/cryptonight_monero.h" |
|||
#include "crypto/asm/CryptonightR_template.h" |
|||
#include "persistent_memory.h" |
|||
|
|||
|
|||
// Appends the bytes located between the code labels p1 (inclusive) and p2 (exclusive)
// to the output position *p and advances *p past them.
// Nothing is copied when p2 does not lie after p1.
static inline void add_code(uint8_t **p, void (*p1)(), void (*p2)())
{
    const uint8_t *first = (const uint8_t*)(p1);
    const uint8_t *last = (const uint8_t*)(p2);
    const ptrdiff_t size = last - first;

    if (size <= 0) {
        return;
    }

    memcpy(*p, (const void *) first, size);
    *p += size;
}
|||
|
|||
|
|||
static inline void add_random_math(uint8_t **p, const struct V4_Instruction* code, int code_size, const void_func* instructions, const void_func* instructions_mov, bool is_64_bit, enum Assembly ASM) |
|||
{ |
|||
uint32_t prev_rot_src = (uint32_t)(-1); |
|||
|
|||
for (int i = 0;; ++i) { |
|||
const struct V4_Instruction inst = code[i]; |
|||
if (inst.opcode == RET) { |
|||
break; |
|||
} |
|||
|
|||
uint8_t opcode = (inst.opcode == MUL) ? inst.opcode : (inst.opcode + 2); |
|||
uint8_t dst_index = inst.dst_index; |
|||
uint8_t src_index = inst.src_index; |
|||
|
|||
const uint32_t a = inst.dst_index; |
|||
const uint32_t b = inst.src_index; |
|||
const uint8_t c = opcode | (dst_index << V4_OPCODE_BITS) | (((src_index == 8) ? dst_index : src_index) << (V4_OPCODE_BITS + V4_DST_INDEX_BITS)); |
|||
|
|||
switch (inst.opcode) { |
|||
case ROR: |
|||
case ROL: |
|||
if (b != prev_rot_src) { |
|||
prev_rot_src = b; |
|||
add_code(p, instructions_mov[c], instructions_mov[c + 1]); |
|||
} |
|||
break; |
|||
} |
|||
|
|||
if (a == prev_rot_src) { |
|||
prev_rot_src = (uint32_t)(-1); |
|||
} |
|||
|
|||
void_func begin = instructions[c]; |
|||
|
|||
if ((ASM = ASM_BULLDOZER) && (inst.opcode == MUL) && !is_64_bit) { |
|||
// AMD Bulldozer has latency 4 for 32-bit IMUL and 6 for 64-bit IMUL
|
|||
// Always use 32-bit IMUL for AMD Bulldozer in 32-bit mode - skip prefix 0x48 and change 0x49 to 0x41
|
|||
uint8_t* prefix = (uint8_t*) begin; |
|||
|
|||
if (*prefix == 0x49) { |
|||
**p = 0x41; |
|||
*p += 1; |
|||
} |
|||
|
|||
begin = (void_func)(prefix + 1); |
|||
} |
|||
|
|||
add_code(p, begin, instructions[c + 1]); |
|||
|
|||
if (inst.opcode == ADD) { |
|||
*(uint32_t*)(*p - sizeof(uint32_t) - (is_64_bit ? 3 : 0)) = inst.C; |
|||
if (is_64_bit) { |
|||
prev_rot_src = (uint32_t)(-1); |
|||
} |
|||
} |
|||
} |
|||
} |
|||
|
|||
|
|||
void v4_compile_code(const struct V4_Instruction* code, int code_size, void* machine_code, enum Assembly ASM) |
|||
{ |
|||
uint8_t* p0 = machine_code; |
|||
uint8_t* p = p0; |
|||
|
|||
add_code(&p, CryptonightR_template_part1, CryptonightR_template_part2); |
|||
add_random_math(&p, code, code_size, instructions, instructions_mov, false, ASM); |
|||
add_code(&p, CryptonightR_template_part2, CryptonightR_template_part3); |
|||
*(int*)(p - 4) = (int)((((const uint8_t*)CryptonightR_template_mainloop) - ((const uint8_t*)CryptonightR_template_part1)) - (p - p0)); |
|||
add_code(&p, CryptonightR_template_part3, CryptonightR_template_end); |
|||
|
|||
flush_instruction_cache(machine_code, p - p0); |
|||
} |
|||
|
|||
|
|||
void v4_compile_code_double(const struct V4_Instruction* code, int code_size, void* machine_code, enum Assembly ASM) |
|||
{ |
|||
uint8_t* p0 = (uint8_t*) machine_code; |
|||
uint8_t* p = p0; |
|||
|
|||
add_code(&p, CryptonightR_template_double_part1, CryptonightR_template_double_part2); |
|||
add_random_math(&p, code, code_size, instructions, instructions_mov, false, ASM); |
|||
add_code(&p, CryptonightR_template_double_part2, CryptonightR_template_double_part3); |
|||
add_random_math(&p, code, code_size, instructions, instructions_mov, false, ASM); |
|||
add_code(&p, CryptonightR_template_double_part3, CryptonightR_template_double_part4); |
|||
*(int*)(p - 4) = (int)((((const uint8_t*)CryptonightR_template_double_mainloop) - ((const uint8_t*)CryptonightR_template_double_part1)) - (p - p0)); |
|||
add_code(&p, CryptonightR_template_double_part4, CryptonightR_template_double_end); |
|||
|
|||
flush_instruction_cache(machine_code, p - p0); |
|||
} |
|||
|
|||
|
|||
void v4_soft_aes_compile_code(const struct V4_Instruction* code, int code_size, void* machine_code, enum Assembly ASM) |
|||
{ |
|||
uint8_t* p0 = machine_code; |
|||
uint8_t* p = p0; |
|||
|
|||
add_code(&p, CryptonightR_soft_aes_template_part1, CryptonightR_soft_aes_template_part2); |
|||
add_random_math(&p, code, code_size, instructions, instructions_mov, false, ASM); |
|||
add_code(&p, CryptonightR_soft_aes_template_part2, CryptonightR_soft_aes_template_part3); |
|||
*(int*)(p - 4) = (int)((((const uint8_t*)CryptonightR_soft_aes_template_mainloop) - ((const uint8_t*)CryptonightR_soft_aes_template_part1)) - (p - p0)); |
|||
add_code(&p, CryptonightR_soft_aes_template_part3, CryptonightR_soft_aes_template_end); |
|||
|
|||
flush_instruction_cache(machine_code, p - p0); |
|||
} |
@ -0,0 +1,279 @@ |
|||
PUBLIC FN_PREFIX(CryptonightR_soft_aes_template_part1) |
|||
PUBLIC FN_PREFIX(CryptonightR_soft_aes_template_mainloop) |
|||
PUBLIC FN_PREFIX(CryptonightR_soft_aes_template_part2) |
|||
PUBLIC FN_PREFIX(CryptonightR_soft_aes_template_part3) |
|||
PUBLIC FN_PREFIX(CryptonightR_soft_aes_template_end) |
|||
|
|||
ALIGN(64) |
|||
FN_PREFIX(CryptonightR_soft_aes_template_part1): |
|||
mov QWORD PTR [rsp+8], rcx |
|||
push rbx |
|||
push rbp |
|||
push rsi |
|||
push rdi |
|||
push r12 |
|||
push r13 |
|||
push r14 |
|||
push r15 |
|||
sub rsp, 232 |
|||
|
|||
mov eax, [rcx+96] |
|||
mov ebx, [rcx+100] |
|||
mov esi, [rcx+104] |
|||
mov edx, [rcx+108] |
|||
mov [rsp+144], eax |
|||
mov [rsp+148], ebx |
|||
mov [rsp+152], esi |
|||
mov [rsp+156], edx |
|||
|
|||
mov rax, QWORD PTR [rcx+48] |
|||
mov r10, rcx |
|||
xor rax, QWORD PTR [rcx+16] |
|||
mov r8, QWORD PTR [rcx+32] |
|||
xor r8, QWORD PTR [rcx] |
|||
mov r9, QWORD PTR [rcx+40] |
|||
xor r9, QWORD PTR [rcx+8] |
|||
movq xmm4, rax |
|||
mov rdx, QWORD PTR [rcx+56] |
|||
xor rdx, QWORD PTR [rcx+24] |
|||
mov r11, QWORD PTR [rcx+224] |
|||
mov rcx, QWORD PTR [rcx+88] |
|||
xor rcx, QWORD PTR [r10+72] |
|||
mov rax, QWORD PTR [r10+80] |
|||
movq xmm0, rdx |
|||
xor rax, QWORD PTR [r10+64] |
|||
|
|||
movaps XMMWORD PTR [rsp+16], xmm6 |
|||
movaps XMMWORD PTR [rsp+32], xmm7 |
|||
movaps XMMWORD PTR [rsp+48], xmm8 |
|||
movaps XMMWORD PTR [rsp+64], xmm9 |
|||
movaps XMMWORD PTR [rsp+80], xmm10 |
|||
movaps XMMWORD PTR [rsp+96], xmm11 |
|||
movaps XMMWORD PTR [rsp+112], xmm12 |
|||
movaps XMMWORD PTR [rsp+128], xmm13 |
|||
|
|||
movq xmm5, rax |
|||
|
|||
mov rax, r8 |
|||
punpcklqdq xmm4, xmm0 |
|||
and eax, 2097136 |
|||
movq xmm10, QWORD PTR [r10+96] |
|||
movq xmm0, rcx |
|||
mov rcx, QWORD PTR [r10+104] |
|||
xorps xmm9, xmm9 |
|||
mov QWORD PTR [rsp+328], rax |
|||
movq xmm12, r11 |
|||
mov QWORD PTR [rsp+320], r9 |
|||
punpcklqdq xmm5, xmm0 |
|||
movq xmm13, rcx |
|||
mov r12d, 524288 |
|||
|
|||
ALIGN(64) |
|||
FN_PREFIX(CryptonightR_soft_aes_template_mainloop): |
|||
movd xmm11, r12d |
|||
mov r12, QWORD PTR [r10+272] |
|||
lea r13, QWORD PTR [rax+r11] |
|||
mov esi, DWORD PTR [r13] |
|||
movq xmm0, r9 |
|||
mov r10d, DWORD PTR [r13+4] |
|||
movq xmm7, r8 |
|||
mov ebp, DWORD PTR [r13+12] |
|||
mov r14d, DWORD PTR [r13+8] |
|||
mov rdx, QWORD PTR [rsp+328] |
|||
movzx ecx, sil |
|||
shr esi, 8 |
|||
punpcklqdq xmm7, xmm0 |
|||
mov r15d, DWORD PTR [r12+rcx*4] |
|||
movzx ecx, r10b |
|||
shr r10d, 8 |
|||
mov edi, DWORD PTR [r12+rcx*4] |
|||
movzx ecx, r14b |
|||
shr r14d, 8 |
|||
mov ebx, DWORD PTR [r12+rcx*4] |
|||
movzx ecx, bpl |
|||
shr ebp, 8 |
|||
mov r9d, DWORD PTR [r12+rcx*4] |
|||
movzx ecx, r10b |
|||
shr r10d, 8 |
|||
xor r15d, DWORD PTR [r12+rcx*4+1024] |
|||
movzx ecx, r14b |
|||
shr r14d, 8 |
|||
mov eax, r14d |
|||
shr eax, 8 |
|||
xor edi, DWORD PTR [r12+rcx*4+1024] |
|||
add eax, 256 |
|||
movzx ecx, bpl |
|||
shr ebp, 8 |
|||
xor ebx, DWORD PTR [r12+rcx*4+1024] |
|||
movzx ecx, sil |
|||
shr esi, 8 |
|||
xor r9d, DWORD PTR [r12+rcx*4+1024] |
|||
add r12, 2048 |
|||
movzx ecx, r10b |
|||
shr r10d, 8 |
|||
add r10d, 256 |
|||
mov r11d, DWORD PTR [r12+rax*4] |
|||
xor r11d, DWORD PTR [r12+rcx*4] |
|||
xor r11d, r9d |
|||
movzx ecx, sil |
|||
mov r10d, DWORD PTR [r12+r10*4] |
|||
shr esi, 8 |
|||
add esi, 256 |
|||
xor r10d, DWORD PTR [r12+rcx*4] |
|||
movzx ecx, bpl |
|||
xor r10d, ebx |
|||
shr ebp, 8 |
|||
movd xmm1, r11d |
|||
add ebp, 256 |
|||
movq r11, xmm12 |
|||
mov r9d, DWORD PTR [r12+rcx*4] |
|||
xor r9d, DWORD PTR [r12+rsi*4] |
|||
mov eax, DWORD PTR [r12+rbp*4] |
|||
xor r9d, edi |
|||
movzx ecx, r14b |
|||
movd xmm0, r10d |
|||
movd xmm2, r9d |
|||
xor eax, DWORD PTR [r12+rcx*4] |
|||
mov rcx, rdx |
|||
xor eax, r15d |
|||
punpckldq xmm2, xmm1 |
|||
xor rcx, 16 |
|||
movd xmm6, eax |
|||
mov rax, rdx |
|||
punpckldq xmm6, xmm0 |
|||
xor rax, 32 |
|||
punpckldq xmm6, xmm2 |
|||
xor rdx, 48 |
|||
movdqu xmm2, XMMWORD PTR [rcx+r11] |
|||
pxor xmm6, xmm2 |
|||
pxor xmm6, xmm7 |
|||
paddq xmm2, xmm4 |
|||
movdqu xmm1, XMMWORD PTR [rax+r11] |
|||
movdqu xmm0, XMMWORD PTR [rdx+r11] |
|||
pxor xmm6, xmm1 |
|||
pxor xmm6, xmm0 |
|||
paddq xmm0, xmm5 |
|||
movdqu XMMWORD PTR [rcx+r11], xmm0 |
|||
movdqu XMMWORD PTR [rax+r11], xmm2 |
|||
movq rcx, xmm13 |
|||
paddq xmm1, xmm7 |
|||
movdqu XMMWORD PTR [rdx+r11], xmm1 |
|||
movq rdi, xmm6 |
|||
mov r10, rdi |
|||
and r10d, 2097136 |
|||
movdqa xmm0, xmm6 |
|||
pxor xmm0, xmm4 |
|||
movdqu XMMWORD PTR [r13], xmm0 |
|||
|
|||
mov ebx, [rsp+144] |
|||
mov ebp, [rsp+152] |
|||
add ebx, [rsp+148] |
|||
add ebp, [rsp+156] |
|||
shl rbp, 32 |
|||
or rbx, rbp |
|||
|
|||
xor rbx, QWORD PTR [r10+r11] |
|||
lea r14, QWORD PTR [r10+r11] |
|||
mov rbp, QWORD PTR [r14+8] |
|||
|
|||
mov [rsp+160], rbx |
|||
mov [rsp+168], rdi |
|||
mov [rsp+176], rbp |
|||
mov [rsp+184], r10 |
|||
mov r10, rsp |
|||
|
|||
mov ebx, [rsp+144] |
|||
mov esi, [rsp+148] |
|||
mov edi, [rsp+152] |
|||
mov ebp, [rsp+156] |
|||
|
|||
movd esp, xmm7 |
|||
movaps xmm0, xmm7 |
|||
psrldq xmm0, 8 |
|||
movd r15d, xmm0 |
|||
movd eax, xmm4 |
|||
movd edx, xmm5 |
|||
movaps xmm0, xmm5 |
|||
psrldq xmm0, 8 |
|||
movd r9d, xmm0 |
|||
|
|||
FN_PREFIX(CryptonightR_soft_aes_template_part2): |
|||
mov rsp, r10 |
|||
mov [rsp+144], ebx |
|||
mov [rsp+148], esi |
|||
mov [rsp+152], edi |
|||
mov [rsp+156], ebp |
|||
|
|||
mov edi, edi |
|||
shl rbp, 32 |
|||
or rbp, rdi |
|||
xor r8, rbp |
|||
|
|||
mov ebx, ebx |
|||
shl rsi, 32 |
|||
or rsi, rbx |
|||
xor QWORD PTR [rsp+320], rsi |
|||
|
|||
mov rbx, [rsp+160] |
|||
mov rdi, [rsp+168] |
|||
mov rbp, [rsp+176] |
|||
mov r10, [rsp+184] |
|||
|
|||
mov r9, r10 |
|||
xor r9, 16 |
|||
mov rcx, r10 |
|||
xor rcx, 32 |
|||
xor r10, 48 |
|||
mov rax, rbx |
|||
mul rdi |
|||
movdqu xmm2, XMMWORD PTR [r9+r11] |
|||
movdqu xmm1, XMMWORD PTR [rcx+r11] |
|||
pxor xmm6, xmm2 |
|||
pxor xmm6, xmm1 |
|||
paddq xmm1, xmm7 |
|||
add r8, rdx |
|||
movdqu xmm0, XMMWORD PTR [r10+r11] |
|||
pxor xmm6, xmm0 |
|||
paddq xmm0, xmm5 |
|||
paddq xmm2, xmm4 |
|||
movdqu XMMWORD PTR [r9+r11], xmm0 |
|||
movdqa xmm5, xmm4 |
|||
mov r9, QWORD PTR [rsp+320] |
|||
movdqa xmm4, xmm6 |
|||
add r9, rax |
|||
movdqu XMMWORD PTR [rcx+r11], xmm2 |
|||
movdqu XMMWORD PTR [r10+r11], xmm1 |
|||
mov r10, QWORD PTR [rsp+304] |
|||
movd r12d, xmm11 |
|||
mov QWORD PTR [r14], r8 |
|||
xor r8, rbx |
|||
mov rax, r8 |
|||
mov QWORD PTR [r14+8], r9 |
|||
and eax, 2097136 |
|||
xor r9, rbp |
|||
mov QWORD PTR [rsp+320], r9 |
|||
mov QWORD PTR [rsp+328], rax |
|||
sub r12d, 1 |
|||
jne FN_PREFIX(CryptonightR_soft_aes_template_mainloop) |
|||
|
|||
FN_PREFIX(CryptonightR_soft_aes_template_part3): |
|||
movaps xmm6, XMMWORD PTR [rsp+16] |
|||
movaps xmm7, XMMWORD PTR [rsp+32] |
|||
movaps xmm8, XMMWORD PTR [rsp+48] |
|||
movaps xmm9, XMMWORD PTR [rsp+64] |
|||
movaps xmm10, XMMWORD PTR [rsp+80] |
|||
movaps xmm11, XMMWORD PTR [rsp+96] |
|||
movaps xmm12, XMMWORD PTR [rsp+112] |
|||
movaps xmm13, XMMWORD PTR [rsp+128] |
|||
|
|||
add rsp, 232 |
|||
pop r15 |
|||
pop r14 |
|||
pop r13 |
|||
pop r12 |
|||
pop rdi |
|||
pop rsi |
|||
pop rbp |
|||
pop rbx |
|||
ret |
|||
FN_PREFIX(CryptonightR_soft_aes_template_end): |
File diff suppressed because it is too large
File diff suppressed because it is too large
@ -0,0 +1,531 @@ |
|||
/* Export every label that delimits a section of the single-hash and */
/* double-hash CryptonightR templates defined below, so that code outside */
/* this file can take their addresses. PUBLIC and FN_PREFIX are macros */
/* defined outside this file. */
PUBLIC FN_PREFIX(CryptonightR_template_part1) |
|||
PUBLIC FN_PREFIX(CryptonightR_template_mainloop) |
|||
PUBLIC FN_PREFIX(CryptonightR_template_part2) |
|||
PUBLIC FN_PREFIX(CryptonightR_template_part3) |
|||
PUBLIC FN_PREFIX(CryptonightR_template_end) |
|||
PUBLIC FN_PREFIX(CryptonightR_template_double_part1) |
|||
PUBLIC FN_PREFIX(CryptonightR_template_double_mainloop) |
|||
PUBLIC FN_PREFIX(CryptonightR_template_double_part2) |
|||
PUBLIC FN_PREFIX(CryptonightR_template_double_part3) |
|||
PUBLIC FN_PREFIX(CryptonightR_template_double_part4) |
|||
PUBLIC FN_PREFIX(CryptonightR_template_double_end) |
|||
|
|||
/* Single-hash template, section 1: prologue and register setup. */
/* Input: rcx points to a state structure; this section reads offsets */
/* 0..108 and the pointer stored at offset 224. */
/* NOTE(review): rcx as first argument and saving xmm6-xmm9 look like the */
/* Microsoft x64 calling convention - confirm against the caller. */
ALIGN(64) |
|||
FN_PREFIX(CryptonightR_template_part1): |
|||
/* rbx, rbp and rsi go into the slots just above the return address; */
/* the remaining saved registers are pushed. */
mov QWORD PTR [rsp+16], rbx |
|||
mov QWORD PTR [rsp+24], rbp |
|||
mov QWORD PTR [rsp+32], rsi |
|||
push r10 |
|||
push r11 |
|||
push r12 |
|||
push r13 |
|||
push r14 |
|||
push r15 |
|||
push rdi |
|||
/* 64 bytes of locals: four 16-byte slots used to save xmm6..xmm9. */
sub rsp, 64 |
|||
mov r12, rcx |
|||
/* r8 = [state+32] ^ [state+0], r15 = [state+40] ^ [state+8]. */
mov r8, QWORD PTR [r12+32] |
|||
mov rdx, r12 |
|||
xor r8, QWORD PTR [r12] |
|||
mov r15, QWORD PTR [r12+40] |
|||
mov r9, r8 |
|||
xor r15, QWORD PTR [r12+8] |
|||
/* r11 = pointer loaded from state+224; it is the base of every memory */
/* access made in the main loop. */
mov r11, QWORD PTR [r12+224] |
|||
mov r12, QWORD PTR [r12+56] |
|||
xor r12, QWORD PTR [rdx+24] |
|||
mov rax, QWORD PTR [rdx+48] |
|||
xor rax, QWORD PTR [rdx+16] |
|||
movaps XMMWORD PTR [rsp+48], xmm6 |
|||
movq xmm0, r12 |
|||
movaps XMMWORD PTR [rsp+32], xmm7 |
|||
movaps XMMWORD PTR [rsp+16], xmm8 |
|||
movaps XMMWORD PTR [rsp], xmm9 |
|||
mov r12, QWORD PTR [rdx+88] |
|||
xor r12, QWORD PTR [rdx+72] |
|||
/* xmm6 = { [48]^[16] (low qword), [56]^[24] (high qword) }. */
movq xmm6, rax |
|||
mov rax, QWORD PTR [rdx+80] |
|||
xor rax, QWORD PTR [rdx+64] |
|||
punpcklqdq xmm6, xmm0 |
|||
/* 2097136 = 0x1FFFF0: keeps bits 4..20, giving a 16-byte aligned offset */
/* below 2 MiB. */
and r9d, 2097136 |
|||
movq xmm0, r12 |
|||
/* xmm7 = { [80]^[64] (low qword), [88]^[72] (high qword) }. */
movq xmm7, rax |
|||
punpcklqdq xmm7, xmm0 |
|||
mov r10d, r9d |
|||
/* From here until part3 rsp is used as an ordinary data register; the */
/* real stack pointer is parked in xmm9, so nothing in between may touch */
/* the stack or clobber xmm9. */
movq xmm9, rsp |
|||
mov rsp, r8 |
|||
/* r8d = loop counter: 524288 (0x80000) iterations. */
mov r8d, 524288 |
|||
|
|||
/* Load the four consecutive dwords at state+96..+108 into ebx, esi, */
/* edi and ebp. */
mov ebx, [rdx+96] |
|||
mov esi, [rdx+100] |
|||
mov edi, [rdx+104] |
|||
mov ebp, [rdx+108] |
|||
|
|||
/* Single-hash template, first half of one loop iteration. */
/* On entry: r9 = current offset, r11 = memory base, rsp/r15 = the two */
/* 64-bit halves of the round key (rsp is a data register here), */
/* xmm6/xmm7 = 128-bit values carried over from earlier iterations, */
/* ebx/esi/edi/ebp = four 32-bit working registers. */
ALIGN(64) |
|||
FN_PREFIX(CryptonightR_template_mainloop): |
|||
/* movdqa: the 16-byte chunk at r9+r11 must be 16-byte aligned. */
movdqa xmm5, XMMWORD PTR [r9+r11] |
|||
movq xmm0, r15 |
|||
/* xmm4 = { rsp (low qword), r15 (high qword) }, used as the AES round key. */
movq xmm4, rsp |
|||
punpcklqdq xmm4, xmm0 |
|||
/* rdx remembers the address of the chunk so it can be written back. */
lea rdx, QWORD PTR [r9+r11] |
|||
|
|||
aesenc xmm5, xmm4 |
|||
|
|||
/* Offsets of the three other 16-byte chunks of the same 64-byte line: */
/* r13 = offset^16, rax = offset^32, r9 = offset^48. */
mov r13d, r9d |
|||
mov eax, r9d |
|||
xor r9d, 48 |
|||
xor r13d, 16 |
|||
xor eax, 32 |
|||
movdqu xmm0, XMMWORD PTR [r9+r11] |
|||
movaps xmm3, xmm0 |
|||
movdqu xmm2, XMMWORD PTR [r13+r11] |
|||
movdqu xmm1, XMMWORD PTR [rax+r11] |
|||
/* Fold all three neighbouring chunks into the AES result. */
pxor xmm0, xmm2 |
|||
pxor xmm5, xmm1 |
|||
pxor xmm5, xmm0 |
|||
|
|||
/* r12 = low qword of the result (multiplier for part2), */
/* r10 = next offset (low dword masked with 0x1FFFF0). */
movq r12, xmm5 |
|||
movd r10d, xmm5 |
|||
and r10d, 2097136 |
|||
|
|||
/* Write the neighbours back rotated, each with a 64-bit lane-wise add: */
/* [^16] = old[^48] + xmm7, [^32] = old[^16] + xmm6, [^48] = old[^32] + xmm4. */
paddq xmm3, xmm7 |
|||
paddq xmm2, xmm6 |
|||
paddq xmm1, xmm4 |
|||
movdqu XMMWORD PTR [r13+r11], xmm3 |
|||
movdqu XMMWORD PTR [rax+r11], xmm2 |
|||
movdqu XMMWORD PTR [r9+r11], xmm1 |
|||
|
|||
/* Store result ^ xmm6 back to the chunk that was read first. */
movdqa xmm0, xmm5 |
|||
pxor xmm0, xmm6 |
|||
movdqu XMMWORD PTR [rdx], xmm0 |
|||
|
|||
/* r13 = (ebx + esi) | ((edi + ebp) << 32); both sums wrap at 32 bits. */
lea r13d, [ebx+esi] |
|||
lea edx, [edi+ebp] |
|||
shl rdx, 32 |
|||
or r13, rdx |
|||
|
|||
/* eax, edx and r9d receive dwords of xmm6/xmm7. Nothing in this file */
/* reads them before part2 overwrites them. */
/* NOTE(review): presumably inputs for code spliced in ahead of the part2 */
/* label at run time - confirm against the code generator. */
movd eax, xmm6 |
|||
movd edx, xmm7 |
|||
pextrd r9d, xmm7, 2 |
|||
|
|||
/* Mix the combined dword sums with the qword at the new offset and keep */
/* the following qword in r14 for the end of the iteration. */
xor r13, QWORD PTR [r10+r11] |
|||
mov r14, QWORD PTR [r10+r11+8] |
|||
|
|||
/* Single-hash template, second half of one loop iteration: 64x64 multiply, */
/* second chunk shuffle, write-back and loop control. */
/* rsp still holds data here, not the stack pointer. */
FN_PREFIX(CryptonightR_template_part2): |
|||
/* rcx = address of the 16-byte chunk at the new offset r10. */
lea rcx, [r10+r11] |
|||
|
|||
/* rsp ^= edi | (ebp << 32) */
mov eax, edi |
|||
mov edx, ebp |
|||
shl rdx, 32 |
|||
or rax, rdx |
|||
xor rsp, rax |
|||
|
|||
/* r15 ^= ebx | (esi << 32) */
mov eax, ebx |
|||
mov edx, esi |
|||
shl rdx, 32 |
|||
or rax, rdx |
|||
xor r15, rax |
|||
|
|||
/* rdx:rax = r13 * r12 (unsigned); low half goes to r15, high half to rsp. */
mov rax, r13 |
|||
mul r12 |
|||
add r15, rax |
|||
add rsp, rdx |
|||
|
|||
/* Neighbour offsets of the new chunk: r9 = ^16, r12 = ^32, r10 = ^48. */
mov r9d, r10d |
|||
mov r12d, r10d |
|||
xor r9d, 16 |
|||
xor r12d, 32 |
|||
xor r10d, 48 |
|||
/* movdqa loads: these addresses must be 16-byte aligned. */
movdqa xmm1, XMMWORD PTR [r12+r11] |
|||
movaps xmm3, xmm1 |
|||
movdqa xmm2, XMMWORD PTR [r9+r11] |
|||
movdqa xmm0, XMMWORD PTR [r10+r11] |
|||
/* xmm5 ^= all three neighbouring chunks. */
pxor xmm1, xmm2 |
|||
pxor xmm5, xmm0 |
|||
pxor xmm5, xmm1 |
|||
/* Rotated write-back: */
/* [^16] = old[^48] + xmm7, [^32] = old[^16] + xmm6, [^48] = old[^32] + xmm4. */
paddq xmm3, xmm4 |
|||
paddq xmm2, xmm6 |
|||
paddq xmm0, xmm7 |
|||
movdqu XMMWORD PTR [r9+r11], xmm0 |
|||
movdqu XMMWORD PTR [r12+r11], xmm2 |
|||
movdqu XMMWORD PTR [r10+r11], xmm3 |
|||
|
|||
/* Age the carried values (xmm7 <- xmm6, xmm6 <- xmm5 below), store the */
/* two accumulators to the chunk and derive the next offset in r9. */
movdqa xmm7, xmm6 |
|||
mov QWORD PTR [rcx], rsp |
|||
xor rsp, r13 |
|||
mov r9d, esp |
|||
mov QWORD PTR [rcx+8], r15 |
|||
and r9d, 2097136 |
|||
xor r15, r14 |
|||
movdqa xmm6, xmm5 |
|||
dec r8d |
|||
jnz FN_PREFIX(CryptonightR_template_mainloop) |
|||
|
|||
/* Single-hash template, epilogue: undo everything part1 set up. */
FN_PREFIX(CryptonightR_template_part3): |
|||
/* Recover the real stack pointer that part1 parked in xmm9. */
movq rsp, xmm9 |
|||
|
|||
/* 136 = 64 bytes of locals + 7 pushes * 8 + 16, i.e. the [rsp+16] slot */
/* part1 wrote at entry; rbp and rsi follow at +144 and +152. */
mov rbx, QWORD PTR [rsp+136] |
|||
mov rbp, QWORD PTR [rsp+144] |
|||
mov rsi, QWORD PTR [rsp+152] |
|||
movaps xmm6, XMMWORD PTR [rsp+48] |
|||
movaps xmm7, XMMWORD PTR [rsp+32] |
|||
movaps xmm8, XMMWORD PTR [rsp+16] |
|||
movaps xmm9, XMMWORD PTR [rsp] |
|||
add rsp, 64 |
|||
/* Pops mirror the push order of part1. */
pop rdi |
|||
pop r15 |
|||
pop r14 |
|||
pop r13 |
|||
pop r12 |
|||
pop r11 |
|||
pop r10 |
|||
ret 0 |
|||
/* End marker of the single-hash template; no code follows it. */
FN_PREFIX(CryptonightR_template_end): |
|||
|
|||
/* Double-hash template, section 1: prologue and setup for two interleaved */
/* lanes. Input: rcx and rdx each point to a state structure (lane 1 and */
/* lane 2); offsets 0..111 and the pointer at offset 224 are read from both. */
/* NOTE(review): register usage looks like the Microsoft x64 calling */
/* convention - confirm against the caller. */
ALIGN(64) |
|||
FN_PREFIX(CryptonightR_template_double_part1): |
|||
mov QWORD PTR [rsp+24], rbx |
|||
push rbp |
|||
push rsi |
|||
push rdi |
|||
push r12 |
|||
push r13 |
|||
push r14 |
|||
push r15 |
|||
/* 320 bytes of locals: lane dwords at +0/+16, scratch slots at +104..+136 */
/* and the xmm6..xmm15 save area at +160..+319. */
sub rsp, 320 |
|||
/* Lane 1: r14 = [32]^[0], r12 = [40]^[8], rsi = memory base, */
/* ebx = first offset (r14 masked with 0x1FFFF0). */
mov r14, QWORD PTR [rcx+32] |
|||
mov r8, rcx |
|||
xor r14, QWORD PTR [rcx] |
|||
mov r12, QWORD PTR [rcx+40] |
|||
mov ebx, r14d |
|||
mov rsi, QWORD PTR [rcx+224] |
|||
and ebx, 2097136 |
|||
xor r12, QWORD PTR [rcx+8] |
|||
mov rcx, QWORD PTR [rcx+56] |
|||
xor rcx, QWORD PTR [r8+24] |
|||
mov rax, QWORD PTR [r8+48] |
|||
xor rax, QWORD PTR [r8+16] |
|||
/* Lane 2: r15 = [32]^[0], r13 = [40]^[8], rdi = memory base. */
mov r15, QWORD PTR [rdx+32] |
|||
xor r15, QWORD PTR [rdx] |
|||
movq xmm0, rcx |
|||
mov rcx, QWORD PTR [r8+88] |
|||
xor rcx, QWORD PTR [r8+72] |
|||
mov r13, QWORD PTR [rdx+40] |
|||
mov rdi, QWORD PTR [rdx+224] |
|||
xor r13, QWORD PTR [rdx+8] |
|||
movaps XMMWORD PTR [rsp+160], xmm6 |
|||
movaps XMMWORD PTR [rsp+176], xmm7 |
|||
movaps XMMWORD PTR [rsp+192], xmm8 |
|||
movaps XMMWORD PTR [rsp+208], xmm9 |
|||
movaps XMMWORD PTR [rsp+224], xmm10 |
|||
movaps XMMWORD PTR [rsp+240], xmm11 |
|||
movaps XMMWORD PTR [rsp+256], xmm12 |
|||
movaps XMMWORD PTR [rsp+272], xmm13 |
|||
movaps XMMWORD PTR [rsp+288], xmm14 |
|||
movaps XMMWORD PTR [rsp+304], xmm15 |
|||
movq xmm7, rax |
|||
mov rax, QWORD PTR [r8+80] |
|||
xor rax, QWORD PTR [r8+64] |
|||
|
|||
/* Copy the 16 bytes at state+96 of each lane onto the stack: */
/* lane 2 -> [rsp+0..15], lane 1 -> [rsp+16..31]. */
/* movaps requires both state+96 addresses to be 16-byte aligned. */
movaps xmm1, XMMWORD PTR [rdx+96] |
|||
movaps xmm2, XMMWORD PTR [r8+96] |
|||
movaps XMMWORD PTR [rsp], xmm1 |
|||
movaps XMMWORD PTR [rsp+16], xmm2 |
|||
|
|||
mov r8d, r15d |
|||
/* xmm7 = lane 1 { [48]^[16], [56]^[24] }. */
punpcklqdq xmm7, xmm0 |
|||
movq xmm0, rcx |
|||
mov rcx, QWORD PTR [rdx+56] |
|||
xor rcx, QWORD PTR [rdx+24] |
|||
movq xmm9, rax |
|||
mov QWORD PTR [rsp+128], rsi |
|||
mov rax, QWORD PTR [rdx+48] |
|||
xor rax, QWORD PTR [rdx+16] |
|||
/* xmm9 = lane 1 { [80]^[64], [88]^[72] }. */
punpcklqdq xmm9, xmm0 |
|||
movq xmm0, rcx |
|||
mov rcx, QWORD PTR [rdx+88] |
|||
xor rcx, QWORD PTR [rdx+72] |
|||
movq xmm8, rax |
|||
mov QWORD PTR [rsp+136], rdi |
|||
mov rax, QWORD PTR [rdx+80] |
|||
xor rax, QWORD PTR [rdx+64] |
|||
/* xmm8 = lane 2 { [48]^[16], [56]^[24] }. */
punpcklqdq xmm8, xmm0 |
|||
/* r8d = lane 2 first offset. */
and r8d, 2097136 |
|||
movq xmm0, rcx |
|||
/* r11d = loop counter: 524288 (0x80000) iterations. */
mov r11d, 524288 |
|||
movq xmm10, rax |
|||
/* xmm10 = lane 2 { [80]^[64], [88]^[72] }. */
punpcklqdq xmm10, xmm0 |
|||
|
|||
/* Keep both memory bases in xmm14 (lane 1) and xmm15 (lane 2) because */
/* rsi and rdi get reused as scratch inside the loop. */
movq xmm14, QWORD PTR [rsp+128] |
|||
movq xmm15, QWORD PTR [rsp+136] |
|||
|
|||
/* Double-hash template, start of one loop iteration: AES round and chunk */
/* shuffle for both lanes, then setup for lane 1's dword stage. */
/* On entry: lane 1 uses rbx (offset), rsi (base), r14/r12 (key halves), */
/* xmm7/xmm9 (carried values); lane 2 uses r8, rdi, r15/r13, xmm8/xmm10. */
ALIGN(64) |
|||
FN_PREFIX(CryptonightR_template_double_mainloop): |
|||
/* ---- lane 1 ---- */
movdqu xmm6, XMMWORD PTR [rbx+rsi] |
|||
movq xmm0, r12 |
|||
mov ecx, ebx |
|||
/* xmm3 = { r14, r12 }: lane 1 round key. */
movq xmm3, r14 |
|||
punpcklqdq xmm3, xmm0 |
|||
/* ebx walks the neighbouring chunks: ^16, then ^32, then ^48. */
xor ebx, 16 |
|||
aesenc xmm6, xmm3 |
|||
movq xmm4, r15 |
|||
movdqu xmm0, XMMWORD PTR [rbx+rsi] |
|||
pxor xmm6, xmm0 |
|||
xor ebx, 48 |
|||
paddq xmm0, xmm7 |
|||
movdqu xmm1, XMMWORD PTR [rbx+rsi] |
|||
pxor xmm6, xmm1 |
|||
/* [^32] = old[^16] + xmm7 */
movdqu XMMWORD PTR [rbx+rsi], xmm0 |
|||
paddq xmm1, xmm3 |
|||
xor ebx, 16 |
|||
mov eax, ebx |
|||
xor rax, 32 |
|||
movdqu xmm0, XMMWORD PTR [rbx+rsi] |
|||
pxor xmm6, xmm0 |
|||
/* rdx = low qword of lane 1's result (multiplier used in part2). */
movq rdx, xmm6 |
|||
/* [^48] = old[^32] + xmm3, [^16] = old[^48] + xmm9 */
movdqu XMMWORD PTR [rbx+rsi], xmm1 |
|||
paddq xmm0, xmm9 |
|||
movdqu XMMWORD PTR [rax+rsi], xmm0 |
|||
/* Write result ^ xmm7 back to the chunk read first. */
movdqa xmm0, xmm6 |
|||
pxor xmm0, xmm7 |
|||
movdqu XMMWORD PTR [rcx+rsi], xmm0 |
|||
/* rsi stops being the lane 1 base here and becomes lane 1's next offset; */
/* the base is still available in xmm14. */
mov esi, edx |
|||
/* ---- lane 2 (same sequence with r8/rdi/xmm4/xmm5/xmm8/xmm10) ---- */
movdqu xmm5, XMMWORD PTR [r8+rdi] |
|||
and esi, 2097136 |
|||
mov ecx, r8d |
|||
movq xmm0, r13 |
|||
/* xmm4 = { r15, r13 }: lane 2 round key. */
punpcklqdq xmm4, xmm0 |
|||
xor r8d, 16 |
|||
aesenc xmm5, xmm4 |
|||
movdqu xmm0, XMMWORD PTR [r8+rdi] |
|||
pxor xmm5, xmm0 |
|||
xor r8d, 48 |
|||
paddq xmm0, xmm8 |
|||
movdqu xmm1, XMMWORD PTR [r8+rdi] |
|||
pxor xmm5, xmm1 |
|||
movdqu XMMWORD PTR [r8+rdi], xmm0 |
|||
paddq xmm1, xmm4 |
|||
xor r8d, 16 |
|||
mov eax, r8d |
|||
xor rax, 32 |
|||
movdqu xmm0, XMMWORD PTR [r8+rdi] |
|||
pxor xmm5, xmm0 |
|||
movdqu XMMWORD PTR [r8+rdi], xmm1 |
|||
paddq xmm0, xmm10 |
|||
movdqu XMMWORD PTR [rax+rdi], xmm0 |
|||
movdqa xmm0, xmm5 |
|||
pxor xmm0, xmm8 |
|||
movdqu XMMWORD PTR [rcx+rdi], xmm0 |
|||
/* rdi = low qword of lane 2's result; ebp keeps its low dword. */
movq rdi, xmm5 |
|||
/* rcx = lane 1 base again; fetch the two qwords at lane 1's new offset. */
movq rcx, xmm14 |
|||
mov ebp, edi |
|||
mov r8, QWORD PTR [rcx+rsi] |
|||
mov r10, QWORD PTR [rcx+rsi+8] |
|||
lea r9, QWORD PTR [rcx+rsi] |
|||
xor esi, 16 |
|||
|
|||
/* Park every register the next stage overwrites (including the stack */
/* pointer) in xmm registers or stack slots. */
movq xmm0, rsp |
|||
movq xmm1, rsi |
|||
movq xmm2, rdi |
|||
movq xmm11, rbp |
|||
movq xmm12, r15 |
|||
movq xmm13, rdx |
|||
mov [rsp+104], rcx |
|||
mov [rsp+112], r9 |
|||
|
|||
/* Load lane 1's four working dwords (copied from its state+96 in part1). */
mov ebx, DWORD PTR [rsp+16] |
|||
mov esi, DWORD PTR [rsp+20] |
|||
mov edi, DWORD PTR [rsp+24] |
|||
mov ebp, DWORD PTR [rsp+28] |
|||
|
|||
/* r8 ^= (ebx + esi) | ((edi + ebp) << 32) */
lea eax, [ebx+esi] |
|||
lea edx, [edi+ebp] |
|||
shl rdx, 32 |
|||
or rax, rdx |
|||
xor r8, rax |
|||
|
|||
/* rsp is a data register from here until part2 restores it from xmm0. */
/* esp, r15d, eax, edx and r9d receive dwords of xmm3/xmm7/xmm9; nothing in */
/* this file reads them before part2 overwrites them. */
/* NOTE(review): presumably inputs for code spliced in ahead of the part2 */
/* label at run time - confirm against the code generator. */
movd esp, xmm3 |
|||
pextrd r15d, xmm3, 2 |
|||
movd eax, xmm7 |
|||
movd edx, xmm9 |
|||
pextrd r9d, xmm9, 2 |
|||
|
|||
/* Double-hash template: finish lane 1 (multiply, second shuffle, */
/* write-back), then set up lane 2's dword stage. */
/* rsp does not hold the stack pointer on entry; it is restored from xmm0. */
FN_PREFIX(CryptonightR_template_double_part2): |
|||
|
|||
/* r14 ^= edi | (ebp << 32) */
mov eax, edi |
|||
mov edx, ebp |
|||
shl rdx, 32 |
|||
or rax, rdx |
|||
xor r14, rax |
|||
|
|||
/* r12 ^= ebx | (esi << 32) */
mov eax, ebx |
|||
mov edx, esi |
|||
shl rdx, 32 |
|||
or rax, rdx |
|||
xor r12, rax |
|||
|
|||
/* Restore the stack pointer and save lane 1's four dwords to their slot. */
movq rsp, xmm0 |
|||
mov DWORD PTR [rsp+16], ebx |
|||
mov DWORD PTR [rsp+20], esi |
|||
mov DWORD PTR [rsp+24], edi |
|||
mov DWORD PTR [rsp+28], ebp |
|||
|
|||
/* Restore the registers parked at the end of the mainloop section. */
movq rsi, xmm1 |
|||
movq rdi, xmm2 |
|||
movq rbp, xmm11 |
|||
movq r15, xmm12 |
|||
movq rdx, xmm13 |
|||
mov rcx, [rsp+104] |
|||
mov r9, [rsp+112] |
|||
|
|||
/* rdx:rax = r8 * rdx (unsigned); rbx keeps the pre-multiply r8. */
mov rbx, r8 |
|||
mov rax, r8 |
|||
mul rdx |
|||
/* ebp = lane 2's next offset (masked low dword of its result). */
and ebp, 2097136 |
|||
mov r8, rax |
|||
/* Second shuffle for lane 1; rsi enters as offset^16 and rcx is the base. */
movdqu xmm1, XMMWORD PTR [rcx+rsi] |
|||
pxor xmm6, xmm1 |
|||
xor esi, 48 |
|||
paddq xmm1, xmm7 |
|||
movdqu xmm2, XMMWORD PTR [rsi+rcx] |
|||
pxor xmm6, xmm2 |
|||
paddq xmm2, xmm3 |
|||
/* [^32] = old[^16] + xmm7 */
movdqu XMMWORD PTR [rsi+rcx], xmm1 |
|||
xor esi, 16 |
|||
mov eax, esi |
|||
/* rsi becomes the lane 1 base again for the next iteration. */
mov rsi, rcx |
|||
movdqu xmm0, XMMWORD PTR [rax+rcx] |
|||
pxor xmm6, xmm0 |
|||
/* [^48] = old[^32] + xmm3 */
movdqu XMMWORD PTR [rax+rcx], xmm2 |
|||
paddq xmm0, xmm9 |
|||
/* Low product half -> r12, high half -> r14. */
add r12, r8 |
|||
xor rax, 32 |
|||
add r14, rdx |
|||
/* Age lane 1's carried values: xmm9 <- xmm7, xmm7 <- xmm6. */
movdqa xmm9, xmm7 |
|||
movdqa xmm7, xmm6 |
|||
/* [^16] = old[^48] + previous xmm9 */
movdqu XMMWORD PTR [rax+rcx], xmm0 |
|||
/* Store lane 1's accumulators to its chunk (address kept in r9). */
mov QWORD PTR [r9+8], r12 |
|||
xor r12, r10 |
|||
mov QWORD PTR [r9], r14 |
|||
/* Switch to lane 2: rcx = lane 2 base. */
movq rcx, xmm15 |
|||
xor r14, rbx |
|||
mov r10d, ebp |
|||
/* ebx = lane 1's offset for the next iteration. */
mov ebx, r14d |
|||
xor ebp, 16 |
|||
and ebx, 2097136 |
|||
mov r8, QWORD PTR [r10+rcx] |
|||
mov r9, QWORD PTR [r10+rcx+8] |
|||
|
|||
/* Park the registers the next stage overwrites. */
movq xmm0, rsp |
|||
movq xmm1, rbx |
|||
movq xmm2, rsi |
|||
movq xmm11, rdi |
|||
movq xmm12, rbp |
|||
movq xmm13, r15 |
|||
mov [rsp+104], rcx |
|||
mov [rsp+112], r9 |
|||
|
|||
/* Load lane 2's four working dwords (copied from its state+96 in part1). */
mov ebx, DWORD PTR [rsp] |
|||
mov esi, DWORD PTR [rsp+4] |
|||
mov edi, DWORD PTR [rsp+8] |
|||
mov ebp, DWORD PTR [rsp+12] |
|||
|
|||
/* r8 ^= (ebx + esi) | ((edi + ebp) << 32) */
lea eax, [ebx+esi] |
|||
lea edx, [edi+ebp] |
|||
shl rdx, 32 |
|||
or rax, rdx |
|||
|
|||
xor r8, rax |
|||
/* xmm3 keeps this value; part3 XORs it into r15 after the write-back. */
movq xmm3, r8 |
|||
|
|||
/* rsp is a data register again until part3 restores it from xmm0. */
/* NOTE(review): the five registers loaded here are not read in this file */
/* before being overwritten; presumably inputs for code spliced in ahead */
/* of the part3 label - confirm against the code generator. */
movd esp, xmm4 |
|||
pextrd r15d, xmm4, 2 |
|||
movd eax, xmm8 |
|||
movd edx, xmm10 |
|||
pextrd r9d, xmm10, 2 |
|||
|
|||
/* Double-hash template: finish lane 2 (multiply, second shuffle, */
/* write-back) and close the loop. */
/* rsp does not hold the stack pointer on entry; it is restored from xmm0. */
FN_PREFIX(CryptonightR_template_double_part3): |
|||
|
|||
/* r15 was parked in xmm13 by part2. */
movq r15, xmm13 |
|||
|
|||
/* r15 ^= edi | (ebp << 32) */
mov eax, edi |
|||
mov edx, ebp |
|||
shl rdx, 32 |
|||
or rax, rdx |
|||
xor r15, rax |
|||
|
|||
/* r13 ^= ebx | (esi << 32) */
mov eax, ebx |
|||
mov edx, esi |
|||
shl rdx, 32 |
|||
or rax, rdx |
|||
xor r13, rax |
|||
|
|||
/* Restore the stack pointer and save lane 2's four dwords to their slot. */
movq rsp, xmm0 |
|||
mov DWORD PTR [rsp], ebx |
|||
mov DWORD PTR [rsp+4], esi |
|||
mov DWORD PTR [rsp+8], edi |
|||
mov DWORD PTR [rsp+12], ebp |
|||
|
|||
/* Restore the registers parked at the end of part2. */
movq rbx, xmm1 |
|||
movq rsi, xmm2 |
|||
movq rdi, xmm11 |
|||
movq rbp, xmm12 |
|||
mov rcx, [rsp+104] |
|||
mov r9, [rsp+112] |
|||
|
|||
/* rdx:rax = r8 * rdi (unsigned); rdi holds the low qword of lane 2's */
/* AES result at this point. */
mov rax, r8 |
|||
mul rdi |
|||
/* rdi becomes the lane 2 base again for the next iteration. */
mov rdi, rcx |
|||
mov r8, rax |
|||
/* Second shuffle for lane 2; rbp enters as offset^16 and rcx is the base. */
movdqu xmm1, XMMWORD PTR [rbp+rcx] |
|||
pxor xmm5, xmm1 |
|||
xor ebp, 48 |
|||
paddq xmm1, xmm8 |
|||
/* Low product half -> r13, high half -> r15 (below). */
add r13, r8 |
|||
movdqu xmm2, XMMWORD PTR [rbp+rcx] |
|||
pxor xmm5, xmm2 |
|||
add r15, rdx |
|||
/* [^32] = old[^16] + xmm8 */
movdqu XMMWORD PTR [rbp+rcx], xmm1 |
|||
paddq xmm2, xmm4 |
|||
xor ebp, 16 |
|||
mov eax, ebp |
|||
xor rax, 32 |
|||
movdqu xmm0, XMMWORD PTR [rbp+rcx] |
|||
pxor xmm5, xmm0 |
|||
/* [^48] = old[^32] + xmm4, [^16] = old[^48] + xmm10 */
movdqu XMMWORD PTR [rbp+rcx], xmm2 |
|||
paddq xmm0, xmm10 |
|||
movdqu XMMWORD PTR [rax+rcx], xmm0 |
|||
/* rax = the value part2 saved in xmm3. */
movq rax, xmm3 |
|||
/* Age lane 2's carried values: xmm10 <- xmm8, xmm8 <- xmm5. */
movdqa xmm10, xmm8 |
|||
/* Store lane 2's accumulators to its chunk at base + r10. */
mov QWORD PTR [r10+rcx], r15 |
|||
movdqa xmm8, xmm5 |
|||
xor r15, rax |
|||
mov QWORD PTR [r10+rcx+8], r13 |
|||
/* r8d = lane 2's offset for the next iteration. */
mov r8d, r15d |
|||
xor r13, r9 |
|||
and r8d, 2097136 |
|||
dec r11d |
|||
jnz FN_PREFIX(CryptonightR_template_double_mainloop) |
|||
|
|||
/* Double-hash template, epilogue: undo everything double_part1 set up. */
FN_PREFIX(CryptonightR_template_double_part4): |
|||
|
|||
/* 400 = 320 bytes of locals + 7 pushes * 8 + 24, i.e. the [rsp+24] slot */
/* that double_part1 wrote at entry. */
mov rbx, QWORD PTR [rsp+400] |
|||
movaps xmm6, XMMWORD PTR [rsp+160] |
|||
movaps xmm7, XMMWORD PTR [rsp+176] |
|||
movaps xmm8, XMMWORD PTR [rsp+192] |
|||
movaps xmm9, XMMWORD PTR [rsp+208] |
|||
movaps xmm10, XMMWORD PTR [rsp+224] |
|||
movaps xmm11, XMMWORD PTR [rsp+240] |
|||
movaps xmm12, XMMWORD PTR [rsp+256] |
|||
movaps xmm13, XMMWORD PTR [rsp+272] |
|||
movaps xmm14, XMMWORD PTR [rsp+288] |
|||
movaps xmm15, XMMWORD PTR [rsp+304] |
|||
add rsp, 320 |
|||
/* Pops mirror the push order of double_part1. */
pop r15 |
|||
pop r14 |
|||
pop r13 |
|||
pop r12 |
|||
pop rdi |
|||
pop rsi |
|||
pop rbp |
|||
ret 0 |
|||
/* End marker of the double-hash template; no code follows it. */
FN_PREFIX(CryptonightR_template_double_end): |
@ -0,0 +1,410 @@ |
|||
/* Double-hash main loop, "sandybridge" variant: prologue and setup. */
/* Input: rcx and rdx each point to a state structure (lane 1 and lane 2); */
/* offsets 0..111 and the pointer at offset 224 are read from both. */
/* This fragment has no entry label and no ret of its own; it is meant to */
/* be embedded in a function defined elsewhere. */
/* rax keeps the entry stack pointer so the first saves can be addressed */
/* relative to it. */
mov rax, rsp |
|||
push rbx |
|||
push rbp |
|||
push rsi |
|||
push rdi |
|||
push r12 |
|||
push r13 |
|||
push r14 |
|||
push r15 |
|||
/* After 8 pushes and this sub, rsp = entry rsp - 248. */
sub rsp, 184 |
|||
|
|||
/* Save the caller's MXCSR at [rsp+272] (entry rsp + 24) and load 24448 = */
/* 0x5F80: all SSE exceptions masked, rounding control = round toward */
/* +infinity. */
stmxcsr DWORD PTR [rsp+272] |
|||
mov DWORD PTR [rsp+276], 24448 |
|||
ldmxcsr DWORD PTR [rsp+276] |
|||
|
|||
/* Lane 1 (rcx): r13 = memory base, r10 = [32]^[0], r11 = [40]^[8]. */
/* Lane 2 (rdx, copied to r9): rsi = memory base, rdi = [32]^[0], */
/* rbp = [40]^[8]. r14d = loop counter, 524288 (0x80000) iterations. */
mov r13, QWORD PTR [rcx+224] |
|||
mov r9, rdx |
|||
mov r10, QWORD PTR [rcx+32] |
|||
mov r8, rcx |
|||
xor r10, QWORD PTR [rcx] |
|||
mov r14d, 524288 |
|||
mov r11, QWORD PTR [rcx+40] |
|||
xor r11, QWORD PTR [rcx+8] |
|||
mov rsi, QWORD PTR [rdx+224] |
|||
mov rdx, QWORD PTR [rcx+56] |
|||
xor rdx, QWORD PTR [rcx+24] |
|||
mov rdi, QWORD PTR [r9+32] |
|||
xor rdi, QWORD PTR [r9] |
|||
mov rbp, QWORD PTR [r9+40] |
|||
xor rbp, QWORD PTR [r9+8] |
|||
movq xmm0, rdx |
|||
/* Save xmm6..xmm15. rax-88, rax-104 and rax-120 are the same addresses */
/* as rsp+160, rsp+144 and rsp+128. */
movaps XMMWORD PTR [rax-88], xmm6 |
|||
movaps XMMWORD PTR [rax-104], xmm7 |
|||
movaps XMMWORD PTR [rax-120], xmm8 |
|||
movaps XMMWORD PTR [rsp+112], xmm9 |
|||
movaps XMMWORD PTR [rsp+96], xmm10 |
|||
movaps XMMWORD PTR [rsp+80], xmm11 |
|||
movaps XMMWORD PTR [rsp+64], xmm12 |
|||
movaps XMMWORD PTR [rsp+48], xmm13 |
|||
movaps XMMWORD PTR [rsp+32], xmm14 |
|||
movaps XMMWORD PTR [rsp+16], xmm15 |
|||
mov rdx, r10 |
|||
movq xmm4, QWORD PTR [r8+96] |
|||
/* edx = lane 1 first offset; 2097136 = 0x1FFFF0 keeps bits 4..20. */
and edx, 2097136 |
|||
mov rax, QWORD PTR [rcx+48] |
|||
/* xmm13 stays zero for the whole loop and is used to clear registers. */
xorps xmm13, xmm13 |
|||
xor rax, QWORD PTR [rcx+16] |
|||
mov rcx, QWORD PTR [rcx+88] |
|||
xor rcx, QWORD PTR [r8+72] |
|||
movq xmm5, QWORD PTR [r8+104] |
|||
movq xmm7, rax |
|||
|
|||
/* xmm14 = 1 << 52 in both qwords: one unit of the double exponent field. */
mov eax, 1 |
|||
shl rax, 52 |
|||
movq xmm14, rax |
|||
punpcklqdq xmm14, xmm14 |
|||
|
|||
/* xmm12 = 1023 << 52 in both qwords: the bit pattern of the double 1.0. */
mov eax, 1023 |
|||
shl rax, 52 |
|||
movq xmm12, rax |
|||
punpcklqdq xmm12, xmm12 |
|||
|
|||
mov rax, QWORD PTR [r8+80] |
|||
xor rax, QWORD PTR [r8+64] |
|||
/* xmm7 = lane 1 { [48]^[16], [56]^[24] }. */
punpcklqdq xmm7, xmm0 |
|||
movq xmm0, rcx |
|||
mov rcx, QWORD PTR [r9+56] |
|||
xor rcx, QWORD PTR [r9+24] |
|||
movq xmm3, rax |
|||
mov rax, QWORD PTR [r9+48] |
|||
xor rax, QWORD PTR [r9+16] |
|||
/* xmm3 = lane 1 { [80]^[64], [88]^[72] }. */
punpcklqdq xmm3, xmm0 |
|||
movq xmm0, rcx |
|||
/* [rsp] keeps the lane 1 base because r13 is reused inside the loop. */
mov QWORD PTR [rsp], r13 |
|||
mov rcx, QWORD PTR [r9+88] |
|||
xor rcx, QWORD PTR [r9+72] |
|||
movq xmm6, rax |
|||
mov rax, QWORD PTR [r9+80] |
|||
xor rax, QWORD PTR [r9+64] |
|||
/* xmm6 = lane 2 { [48]^[16], [56]^[24] }. */
punpcklqdq xmm6, xmm0 |
|||
movq xmm0, rcx |
|||
/* Lane 1's two accumulators live at [rsp+256] and [rsp+264] (entry rsp */
/* + 8 and + 16) across iterations. */
mov QWORD PTR [rsp+256], r10 |
|||
mov rcx, rdi |
|||
mov QWORD PTR [rsp+264], r11 |
|||
movq xmm8, rax |
|||
/* ecx = lane 2 first offset. */
and ecx, 2097136 |
|||
/* xmm8 = lane 2 { [80]^[64], [88]^[72] }. */
punpcklqdq xmm8, xmm0 |
|||
/* xmm4 = { lane 1 [96], lane 2 [96] }, xmm5 = { lane 1 [104], lane 2 [104] }. */
movq xmm0, QWORD PTR [r9+96] |
|||
punpcklqdq xmm4, xmm0 |
|||
movq xmm0, QWORD PTR [r9+104] |
|||
/* Preload the first 16-byte chunk of each lane: xmm11 (lane 2, address */
/* in r8) and xmm15 (lane 1, address in r9). */
lea r8, QWORD PTR [rcx+rsi] |
|||
movdqu xmm11, XMMWORD PTR [r8] |
|||
punpcklqdq xmm5, xmm0 |
|||
lea r9, QWORD PTR [rdx+r13] |
|||
movdqu xmm15, XMMWORD PTR [r9] |
|||
|
|||
/* Double-hash main loop body, "sandybridge" variant. Each iteration does */
/* for both lanes: one AES round, a rotate-and-add shuffle of the three */
/* neighbouring 16-byte chunks, an integer division and square root done */
/* in double precision with integer fix-ups, a 64x64 multiply, a second */
/* shuffle and the write-back. */
/* Lane 1: edx = offset, r13 = base, r10/r11 = key halves, xmm7/xmm3 = */
/* carried values, xmm15 = preloaded chunk, r9 = its address. */
/* Lane 2: ecx = offset, rsi = base, rdi/rbp = key halves, xmm6/xmm8 = */
/* carried values, xmm11 = preloaded chunk, r8 = its address. */
/* xmm4/xmm5 carry values produced by the division and square root of the */
/* previous iteration, lane 1 in the low qword and lane 2 in the high one. */
ALIGN(64) |
|||
main_loop_double_sandybridge: |
|||
/* ---- lane 1: AES round on the preloaded chunk ---- */
movdqu xmm9, xmm15 |
|||
mov eax, edx |
|||
mov ebx, edx |
|||
xor eax, 16 |
|||
xor ebx, 32 |
|||
xor edx, 48 |
|||
|
|||
/* xmm2 = { r10, r11 }: lane 1 round key. */
movq xmm0, r11 |
|||
movq xmm2, r10 |
|||
punpcklqdq xmm2, xmm0 |
|||
aesenc xmm9, xmm2 |
|||
|
|||
/* Shuffle: [^32] = old[^16] + xmm7, [^48] = old[^32] + xmm2, */
/* [^16] = old[^48] + xmm3. */
movdqu xmm0, XMMWORD PTR [rax+r13] |
|||
movdqu xmm1, XMMWORD PTR [rbx+r13] |
|||
paddq xmm0, xmm7 |
|||
paddq xmm1, xmm2 |
|||
movdqu XMMWORD PTR [rbx+r13], xmm0 |
|||
movdqu xmm0, XMMWORD PTR [rdx+r13] |
|||
movdqu XMMWORD PTR [rdx+r13], xmm1 |
|||
paddq xmm0, xmm3 |
|||
movdqu XMMWORD PTR [rax+r13], xmm0 |
|||
|
|||
/* r11 = low qword of the AES result, edx = next lane 1 offset; */
/* write result ^ xmm7 back to the chunk that was read. */
movq r11, xmm9 |
|||
mov edx, r11d |
|||
and edx, 2097136 |
|||
movdqa xmm0, xmm9 |
|||
pxor xmm0, xmm7 |
|||
movdqu XMMWORD PTR [r9], xmm0 |
|||
|
|||
lea rbx, QWORD PTR [rdx+r13] |
|||
mov r10, QWORD PTR [rdx+r13] |
|||
|
|||
/* ---- lane 2: AES round on the preloaded chunk ---- */
movdqu xmm10, xmm11 |
|||
movq xmm0, rbp |
|||
/* xmm11 = { rdi, rbp }: lane 2 round key. */
movq xmm11, rdi |
|||
punpcklqdq xmm11, xmm0 |
|||
aesenc xmm10, xmm11 |
|||
|
|||
mov eax, ecx |
|||
mov r12d, ecx |
|||
xor eax, 16 |
|||
xor r12d, 32 |
|||
xor ecx, 48 |
|||
|
|||
/* Shuffle: [^32] = old[^16] + xmm6, [^48] = old[^32] + xmm11, */
/* [^16] = old[^48] + xmm8. */
movdqu xmm0, XMMWORD PTR [rax+rsi] |
|||
paddq xmm0, xmm6 |
|||
movdqu xmm1, XMMWORD PTR [r12+rsi] |
|||
movdqu XMMWORD PTR [r12+rsi], xmm0 |
|||
paddq xmm1, xmm11 |
|||
movdqu xmm0, XMMWORD PTR [rcx+rsi] |
|||
movdqu XMMWORD PTR [rcx+rsi], xmm1 |
|||
paddq xmm0, xmm8 |
|||
movdqu XMMWORD PTR [rax+rsi], xmm0 |
|||
|
|||
/* ecx = next lane 2 offset. */
movq rcx, xmm10 |
|||
and ecx, 2097136 |
|||
|
|||
movdqa xmm0, xmm10 |
|||
pxor xmm0, xmm6 |
|||
movdqu XMMWORD PTR [r8], xmm0 |
|||
mov r12, QWORD PTR [rcx+rsi] |
|||
|
|||
mov r9, QWORD PTR [rbx+8] |
|||
|
|||
/* r8d = r15d = lane 1 offset ^ 16. */
xor edx, 16 |
|||
mov r8d, edx |
|||
mov r15d, edx |
|||
|
|||
/* ---- lane 1: multiply ---- */
/* r10 ^= xmm4.low ^ (xmm5.low << 32), then rdx:rax = r10 * r11. */
movq rdx, xmm5 |
|||
shl rdx, 32 |
|||
movq rax, xmm4 |
|||
xor rdx, rax |
|||
xor r10, rdx |
|||
mov rax, r10 |
|||
mul r11 |
|||
/* r11d = offset ^ 32. */
mov r11d, r8d |
|||
xor r11d, 48 |
|||
/* xmm0 = { high, low } of the product; rdx and rax are additionally */
/* XORed with the two qwords of chunk [^32]. */
movq xmm0, rdx |
|||
xor rdx, [r11+r13] |
|||
movq xmm1, rax |
|||
xor rax, [r11+r13+8] |
|||
punpcklqdq xmm0, xmm1 |
|||
|
|||
/* pxor with a memory operand: the chunk address must be 16-byte aligned. */
pxor xmm0, XMMWORD PTR [r8+r13] |
|||
xor r8d, 32 |
|||
movdqu xmm1, XMMWORD PTR [r11+r13] |
|||
/* Second shuffle: [^32] = (old[^16] ^ product) + xmm7, */
/* [^48] = old[^32] + xmm2, [^16] = old[^48] + xmm3. */
paddq xmm0, xmm7 |
|||
paddq xmm1, xmm2 |
|||
movdqu XMMWORD PTR [r11+r13], xmm0 |
|||
movdqu xmm0, XMMWORD PTR [r8+r13] |
|||
movdqu XMMWORD PTR [r8+r13], xmm1 |
|||
paddq xmm0, xmm3 |
|||
movdqu XMMWORD PTR [r15+r13], xmm0 |
|||
|
|||
/* Update lane 1's accumulators kept at [rsp+256]/[rsp+264], store them */
/* to the chunk at rbx, and derive the next offset (saved at [rsp+8]). */
mov r11, QWORD PTR [rsp+256] |
|||
add r11, rdx |
|||
mov rdx, QWORD PTR [rsp+264] |
|||
add rdx, rax |
|||
mov QWORD PTR [rbx], r11 |
|||
xor r11, r10 |
|||
mov QWORD PTR [rbx+8], rdx |
|||
xor rdx, r9 |
|||
mov QWORD PTR [rsp+256], r11 |
|||
and r11d, 2097136 |
|||
mov QWORD PTR [rsp+264], rdx |
|||
mov QWORD PTR [rsp+8], r11 |
|||
/* Preload lane 1's chunk for the next iteration (address kept in r15). */
lea r15, QWORD PTR [r11+r13] |
|||
movdqu xmm15, XMMWORD PTR [r11+r13] |
|||
/* r13 now points at lane 2's current chunk; the lane 1 base is reloaded */
/* from [rsp] at the bottom of the loop. */
lea r13, QWORD PTR [rsi+rcx] |
|||
/* ---- division for both lanes ---- */
/* r10 = lane 2 multiplier: r12 ^ xmm4.high ^ (xmm5.high << 32). */
movdqa xmm0, xmm5 |
|||
psrldq xmm0, 8 |
|||
movaps xmm2, xmm13 |
|||
movq r10, xmm0 |
|||
psllq xmm5, 1 |
|||
shl r10, 32 |
|||
/* r11 / r8 = high qwords of the lane 1 / lane 2 AES results: dividends. */
movdqa xmm0, xmm9 |
|||
psrldq xmm0, 8 |
|||
movdqa xmm1, xmm10 |
|||
movq r11, xmm0 |
|||
psrldq xmm1, 8 |
|||
movq r8, xmm1 |
|||
psrldq xmm4, 8 |
|||
movaps xmm0, xmm13 |
|||
movq rax, xmm4 |
|||
xor r10, rax |
|||
movaps xmm1, xmm13 |
|||
xor r10, r12 |
|||
/* Dividends are halved (rounding up) before conversion to double. */
lea rax, QWORD PTR [r11+1] |
|||
shr rax, 1 |
|||
/* xmm3 = { lane 1 result.low, lane 2 result.low }; */
/* divisors = (that + 2 * previous xmm5) with 0x80000001 ORed in. */
movdqa xmm3, xmm9 |
|||
punpcklqdq xmm3, xmm10 |
|||
paddq xmm5, xmm3 |
|||
movq rdx, xmm5 |
|||
psrldq xmm5, 8 |
|||
cvtsi2sd xmm2, rax |
|||
/* -2147483647 = 0x80000001; the 32-bit OR also clears the upper half of */
/* rdx, so the divisor is a 32-bit odd value with its top bit set. */
or edx, -2147483647 |
|||
lea rax, QWORD PTR [r8+1] |
|||
shr rax, 1 |
|||
movq r9, xmm5 |
|||
cvtsi2sd xmm0, rax |
|||
or r9d, -2147483647 |
|||
cvtsi2sd xmm1, rdx |
|||
unpcklpd xmm2, xmm0 |
|||
movaps xmm0, xmm13 |
|||
cvtsi2sd xmm0, r9 |
|||
unpcklpd xmm1, xmm0 |
|||
divpd xmm2, xmm1 |
|||
/* Adding 1 << 52 to the bit pattern bumps the exponent, doubling each */
/* quotient to undo the halving of the dividends. */
paddq xmm2, xmm14 |
|||
cvttsd2si rax, xmm2 |
|||
psrldq xmm2, 8 |
|||
/* rbx = lane 1 quotient, r11 = remainder; a negative remainder means */
/* the quotient is one too large and is corrected in div_fix_1. */
mov rbx, rax |
|||
imul rax, rdx |
|||
sub r11, rax |
|||
js div_fix_1_sandybridge |
|||
div_fix_1_ret_sandybridge: |
|||
|
|||
/* Same for lane 2: rdx = quotient, r8 = remainder. */
cvttsd2si rdx, xmm2 |
|||
mov rax, rdx |
|||
imul rax, r9 |
|||
movd xmm2, r11d |
|||
movd xmm4, ebx |
|||
sub r8, rax |
|||
js div_fix_2_sandybridge |
|||
div_fix_2_ret_sandybridge: |
|||
|
|||
/* xmm4 = per lane { quotient (low dword), remainder (high dword) }. */
movd xmm1, r8d |
|||
movd xmm0, edx |
|||
punpckldq xmm2, xmm1 |
|||
punpckldq xmm4, xmm0 |
|||
punpckldq xmm4, xmm2 |
|||
/* ---- square root for both lanes ---- */
/* Input = (result.low + division result) >> 12, reinterpreted as the */
/* mantissa of a double by adding the bit pattern 1023 << 52. */
paddq xmm3, xmm4 |
|||
movdqa xmm0, xmm3 |
|||
psrlq xmm0, 12 |
|||
paddq xmm0, xmm12 |
|||
sqrtpd xmm1, xmm0 |
|||
movq r9, xmm1 |
|||
/* xmm5 = sqrt bit patterns >> 19 for both lanes. */
movdqa xmm5, xmm1 |
|||
psrlq xmm5, 19 |
|||
/* 524287 = 0x7FFFF: if the low 19 bits are all zero the result is */
/* recomputed with integer arithmetic in the fix-up. */
test r9, 524287 |
|||
je sqrt_fix_1_sandybridge |
|||
sqrt_fix_1_ret_sandybridge: |
|||
|
|||
/* r9 = low qword of lane 2's AES result (multiplier below). */
movq r9, xmm10 |
|||
psrldq xmm1, 8 |
|||
movq r8, xmm1 |
|||
test r8, 524287 |
|||
je sqrt_fix_2_sandybridge |
|||
sqrt_fix_2_ret_sandybridge: |
|||
|
|||
/* ---- lane 2: multiply and second shuffle ---- */
mov r12d, ecx |
|||
mov r8d, ecx |
|||
xor r12d, 16 |
|||
xor r8d, 32 |
|||
xor ecx, 48 |
|||
/* rdx:rax = r10 * r9 (unsigned); xmm3 = { high, low }. */
mov rax, r10 |
|||
mul r9 |
|||
movq xmm0, rax |
|||
movq xmm3, rdx |
|||
punpcklqdq xmm3, xmm0 |
|||
|
|||
movdqu xmm0, XMMWORD PTR [r12+rsi] |
|||
pxor xmm0, xmm3 |
|||
movdqu xmm1, XMMWORD PTR [r8+rsi] |
|||
xor rdx, [r8+rsi] |
|||
xor rax, [r8+rsi+8] |
|||
movdqu xmm3, XMMWORD PTR [rcx+rsi] |
|||
/* [^32] = (old[^16] ^ product) + xmm6, [^48] = old[^32] + xmm11, */
/* [^16] = old[^48] + xmm8. */
paddq xmm0, xmm6 |
|||
paddq xmm1, xmm11 |
|||
paddq xmm3, xmm8 |
|||
movdqu XMMWORD PTR [r8+rsi], xmm0 |
|||
movdqu XMMWORD PTR [rcx+rsi], xmm1 |
|||
movdqu XMMWORD PTR [r12+rsi], xmm3 |
|||
|
|||
/* Update lane 2's accumulators rdi/rbp, store them to the chunk at r13 */
/* and derive the next lane 2 offset in ecx. */
add rdi, rdx |
|||
mov QWORD PTR [r13], rdi |
|||
xor rdi, r10 |
|||
mov ecx, edi |
|||
and ecx, 2097136 |
|||
lea r8, QWORD PTR [rcx+rsi] |
|||
|
|||
/* rdx = old second qword of the chunk, read before it is overwritten. */
mov rdx, QWORD PTR [r13+8] |
|||
add rbp, rax |
|||
mov QWORD PTR [r13+8], rbp |
|||
/* Preload lane 2's chunk for the next iteration. */
movdqu xmm11, XMMWORD PTR [rcx+rsi] |
|||
xor rbp, rdx |
|||
/* Reload lane 1's base, offset and accumulators from the stack and age */
/* the carried values: xmm3 <- xmm7 <- xmm9 and xmm8 <- xmm6 <- xmm10. */
mov r13, QWORD PTR [rsp] |
|||
movdqa xmm3, xmm7 |
|||
mov rdx, QWORD PTR [rsp+8] |
|||
movdqa xmm8, xmm6 |
|||
mov r10, QWORD PTR [rsp+256] |
|||
movdqa xmm7, xmm9 |
|||
mov r11, QWORD PTR [rsp+264] |
|||
movdqa xmm6, xmm10 |
|||
mov r9, r15 |
|||
dec r14d |
|||
jne main_loop_double_sandybridge |
|||
|
|||
/* Epilogue of the sandybridge double-hash loop: restore MXCSR, xmm6..xmm15 */
/* and the pushed registers, then jump over the fix-up routines. */
/* Restore the caller's MXCSR saved by the prologue. */
ldmxcsr DWORD PTR [rsp+272] |
|||
movaps xmm13, XMMWORD PTR [rsp+48] |
|||
/* r11 = rsp + 184 = the stack pointer right after the prologue's pushes; */
/* r11-24 .. r11-120 are the same slots as rsp+160 .. rsp+64. */
lea r11, QWORD PTR [rsp+184] |
|||
movaps xmm6, XMMWORD PTR [r11-24] |
|||
movaps xmm7, XMMWORD PTR [r11-40] |
|||
movaps xmm8, XMMWORD PTR [r11-56] |
|||
movaps xmm9, XMMWORD PTR [r11-72] |
|||
movaps xmm10, XMMWORD PTR [r11-88] |
|||
movaps xmm11, XMMWORD PTR [r11-104] |
|||
movaps xmm12, XMMWORD PTR [r11-120] |
|||
movaps xmm14, XMMWORD PTR [rsp+32] |
|||
movaps xmm15, XMMWORD PTR [rsp+16] |
|||
mov rsp, r11 |
|||
/* Pops mirror the push order of the prologue. */
pop r15 |
|||
pop r14 |
|||
pop r13 |
|||
pop r12 |
|||
pop rdi |
|||
pop rsi |
|||
pop rbp |
|||
pop rbx |
|||
/* No ret here: control continues after the end label of this fragment. */
jmp cnv2_double_mainloop_asm_sandybridge_endp |
|||
|
|||
/* Out-of-line correction paths for the sandybridge double-hash loop. Each */
/* one is entered by a conditional jump from the loop and jumps back to */
/* the matching *_ret label. */

/* Lane 1 division: the remainder in r11 went negative, so the quotient */
/* in rbx is one too large. Decrement it and add the divisor (rdx) back. */
div_fix_1_sandybridge: |
|||
dec rbx |
|||
add r11, rdx |
|||
jmp div_fix_1_ret_sandybridge |
|||
|
|||
/* Lane 2 division: same correction with quotient rdx, remainder r8 and */
/* divisor r9. */
div_fix_2_sandybridge: |
|||
dec rdx |
|||
add r8, r9 |
|||
jmp div_fix_2_ret_sandybridge |
|||
|
|||
/* Lane 1 square root: taken when the low 19 bits of the sqrtpd result */
/* (r9) are all zero. Recomputes the low qword of xmm5 in integer */
/* arithmetic and keeps lane 2's value in the high qword. */
/* r8 = lane 1 square-root input before the >> 12 (low qword of xmm3). */
sqrt_fix_1_sandybridge: |
|||
movq r8, xmm3 |
|||
/* xmm0 = lane 2's value (high qword of xmm5), re-attached at the end. */
movdqa xmm0, xmm5 |
|||
psrldq xmm0, 8 |
|||
dec r9 |
|||
/* r11 = 0xFFFFFC0200000000: the 32-bit mov zero-extends -1022 and the */
/* shift moves it to the upper half. */
mov r11d, -1022 |
|||
shl r11, 32 |
|||
mov rax, r9 |
|||
shr r9, 19 |
|||
shr rax, 20 |
|||
mov rdx, r9 |
|||
sub rdx, rax |
|||
lea rdx, [rdx+r11+1] |
|||
add rax, r11 |
|||
imul rdx, rax |
|||
/* The borrow of this compare (rdx below r8, unsigned) is added to r9. */
sub rdx, r8 |
|||
adc r9, 0 |
|||
movq xmm5, r9 |
|||
punpcklqdq xmm5, xmm0 |
|||
jmp sqrt_fix_1_ret_sandybridge |
|||
|
|||
/* Lane 2 square root: same computation on r8, using the high qword of */
/* xmm3 as the input and rbx as the scratch constant register. The result */
/* replaces the high qword of xmm5; its low qword is preserved. */
sqrt_fix_2_sandybridge: |
|||
psrldq xmm3, 8 |
|||
movq r11, xmm3 |
|||
dec r8 |
|||
mov ebx, -1022 |
|||
shl rbx, 32 |
|||
mov rax, r8 |
|||
shr r8, 19 |
|||
shr rax, 20 |
|||
mov rdx, r8 |
|||
sub rdx, rax |
|||
lea rdx, [rdx+rbx+1] |
|||
add rax, rbx |
|||
imul rdx, rax |
|||
sub rdx, r11 |
|||
adc r8, 0 |
|||
movq xmm0, r8 |
|||
punpcklqdq xmm5, xmm0 |
|||
jmp sqrt_fix_2_ret_sandybridge |
|||
|
|||
/* End of the fragment; the epilogue jumps here. */
cnv2_double_mainloop_asm_sandybridge_endp: |
@ -0,0 +1,180 @@ |
|||
/* Single-hash main loop, "bulldozer" variant: prologue and setup. */
/* Input: rcx points to a state structure; offsets 0..111 and the pointer */
/* at offset 224 are read. The fragment has no entry label and no ret of */
/* its own; it is meant to be embedded in a function defined elsewhere. */
/* rbx, rbp and rsi go into the slots just above the return address. */
mov QWORD PTR [rsp+16], rbx |
|||
mov QWORD PTR [rsp+24], rbp |
|||
mov QWORD PTR [rsp+32], rsi |
|||
push rdi |
|||
push r12 |
|||
push r13 |
|||
push r14 |
|||
push r15 |
|||
/* 64 bytes of locals: MXCSR scratch at +0/+4, xmm8/xmm7/xmm6 at +16/+32/+48. */
sub rsp, 64 |
|||
|
|||
/* Save the caller's MXCSR and load 24448 = 0x5F80: all SSE exceptions */
/* masked, rounding control = round toward +infinity. */
stmxcsr DWORD PTR [rsp] |
|||
mov DWORD PTR [rsp+4], 24448 |
|||
ldmxcsr DWORD PTR [rsp+4] |
|||
|
|||
mov rax, QWORD PTR [rcx+48] |
|||
mov r9, rcx |
|||
xor rax, QWORD PTR [rcx+16] |
|||
/* ebp = loop counter: 524288 (0x80000) iterations. */
mov ebp, 524288 |
|||
/* r8 = [32]^[0], r11 = [40]^[8]: the two halves of the round key. */
mov r8, QWORD PTR [rcx+32] |
|||
xor r8, QWORD PTR [rcx] |
|||
mov r11, QWORD PTR [rcx+40] |
|||
mov r10, r8 |
|||
mov rdx, QWORD PTR [rcx+56] |
|||
movq xmm3, rax |
|||
xor rdx, QWORD PTR [rcx+24] |
|||
xor r11, QWORD PTR [rcx+8] |
|||
/* rbx = memory base used by every access in the loop. */
mov rbx, QWORD PTR [rcx+224] |
|||
mov rax, QWORD PTR [r9+80] |
|||
xor rax, QWORD PTR [r9+64] |
|||
movq xmm0, rdx |
|||
mov rcx, QWORD PTR [rcx+88] |
|||
xor rcx, QWORD PTR [r9+72] |
|||
mov rdi, QWORD PTR [r9+104] |
|||
/* r10d = first offset; 2097136 = 0x1FFFF0 keeps bits 4..20. */
and r10d, 2097136 |
|||
movaps XMMWORD PTR [rsp+48], xmm6 |
|||
movq xmm4, rax |
|||
movaps XMMWORD PTR [rsp+32], xmm7 |
|||
movaps XMMWORD PTR [rsp+16], xmm8 |
|||
/* xmm8 stays zero and is used to clear xmm1 before each sqrtsd. */
xorps xmm8, xmm8 |
|||
/* Only the low 12 bits of rax survive the 52-bit shift, and the 16-bit */
/* mov sets all of them, so rax = 1023 << 52 whatever it held before. */
mov ax, 1023 |
|||
shl rax, 52 |
|||
/* NOTE(review): xmm7 is not read anywhere in the visible loop, which */
/* rebuilds the same constant in rdi instead. */
movq xmm7, rax |
|||
mov r15, QWORD PTR [r9+96] |
|||
/* xmm3 = { [48]^[16], [56]^[24] }, xmm4 = { [80]^[64], [88]^[72] }. */
punpcklqdq xmm3, xmm0 |
|||
movq xmm0, rcx |
|||
punpcklqdq xmm4, xmm0 |
|||
|
|||
/* Single-hash main loop body, "bulldozer" variant. */
/* On entry: r10 = offset, rbx = memory base, r8/r11 = key halves and */
/* accumulators, xmm3/xmm4 = carried values, r15 = previous division */
/* result, rdi = previous square-root result. */
/* All chunk accesses use movdqa, so rbx + offset must be 16-byte aligned. */
ALIGN(64) |
|||
cnv2_main_loop_bulldozer: |
|||
movdqa xmm5, XMMWORD PTR [r10+rbx] |
|||
/* xmm6 = { r8, r11 }: round key. */
movq xmm6, r8 |
|||
pinsrq xmm6, r11, 1 |
|||
lea rdx, QWORD PTR [r10+rbx] |
|||
/* r9 = 2 * rdi (part of the divisor), rdi <<= 32 (mixed into rsi below). */
lea r9, QWORD PTR [rdi+rdi] |
|||
shl rdi, 32 |
|||
|
|||
/* Neighbouring chunks: rcx = ^16, rax = ^32, r10 = ^48. */
mov ecx, r10d |
|||
mov eax, r10d |
|||
xor ecx, 16 |
|||
xor eax, 32 |
|||
xor r10d, 48 |
|||
aesenc xmm5, xmm6 |
|||
movdqa xmm2, XMMWORD PTR [rcx+rbx] |
|||
movdqa xmm1, XMMWORD PTR [rax+rbx] |
|||
movdqa xmm0, XMMWORD PTR [r10+rbx] |
|||
/* Shuffle: [^16] = old[^48] + xmm4, [^32] = old[^16] + xmm3, */
/* [^48] = old[^32] + xmm6. */
paddq xmm2, xmm3 |
|||
paddq xmm1, xmm6 |
|||
paddq xmm0, xmm4 |
|||
movdqa XMMWORD PTR [rcx+rbx], xmm0 |
|||
movdqa XMMWORD PTR [rax+rbx], xmm2 |
|||
movdqa XMMWORD PTR [r10+rbx], xmm1 |
|||
|
|||
movaps xmm1, xmm8 |
|||
/* rsi = r15 ^ (previous rdi << 32); XORed with memory further down. */
mov rsi, r15 |
|||
xor rsi, rdi |
|||
|
|||
/* rdi = 1023 << 52: the bit pattern of the double 1.0. */
mov edi, 1023 |
|||
shl rdi, 52 |
|||
|
|||
/* r14 = low qword of the AES result, rax = high qword (dividend). */
movq r14, xmm5 |
|||
pextrq rax, xmm5, 1 |
|||
|
|||
/* Write result ^ xmm3 back and derive the next offset in r10. */
movdqa xmm0, xmm5 |
|||
pxor xmm0, xmm3 |
|||
mov r10, r14 |
|||
and r10d, 2097136 |
|||
movdqa XMMWORD PTR [rdx], xmm0 |
|||
xor rsi, QWORD PTR [r10+rbx] |
|||
lea r12, QWORD PTR [r10+rbx] |
|||
mov r13, QWORD PTR [r10+rbx+8] |
|||
|
|||
/* Divisor = (2 * previous sqrt + result.low) truncated to 32 bits with */
/* 0x80000001 ORed in; the 32-bit operations clear the upper half of r9. */
add r9d, r14d |
|||
or r9d, -2147483647 |
|||
xor edx, edx |
|||
div r9 |
|||
/* r15 = (quotient truncated to 32 bits) + (remainder << 32). */
mov eax, eax |
|||
shl rdx, 32 |
|||
lea r15, [rax+rdx] |
|||
/* Square-root input = (result.low + r15) >> 12, turned into a double */
/* bit pattern by adding 1023 << 52. */
lea rax, [r14+r15] |
|||
shr rax, 12 |
|||
add rax, rdi |
|||
movq xmm0, rax |
|||
sqrtsd xmm1, xmm0 |
|||
movq rdi, xmm1 |
|||
/* 524287 = 0x7FFFF: if the low 19 bits are all zero the result is */
/* recomputed with integer arithmetic in sqrt_fixup_bulldozer. */
test rdi, 524287 |
|||
je sqrt_fixup_bulldozer |
|||
shr rdi, 19 |
|||
|
|||
sqrt_fixup_bulldozer_ret: |
|||
/* rdx:rax = rsi * r14 (unsigned); xmm0 = { high, low }. */
mov rax, rsi |
|||
mul r14 |
|||
movq xmm1, rax |
|||
movq xmm0, rdx |
|||
punpcklqdq xmm0, xmm1 |
|||
|
|||
/* Neighbours of the new chunk: r9 = ^16, rcx = ^32, r10 = ^48. */
mov r9d, r10d |
|||
mov ecx, r10d |
|||
xor r9d, 16 |
|||
xor ecx, 32 |
|||
xor r10d, 48 |
|||
movdqa xmm1, XMMWORD PTR [rcx+rbx] |
|||
/* The product halves are additionally XORed with chunk [^32]. */
xor rdx, [rcx+rbx] |
|||
xor rax, [rcx+rbx+8] |
|||
movdqa xmm2, XMMWORD PTR [r9+rbx] |
|||
pxor xmm2, xmm0 |
|||
/* Second shuffle: [^16] = old[^48] + xmm4, */
/* [^32] = (old[^16] ^ product) + xmm3, [^48] = old[^32] + xmm6. */
paddq xmm4, XMMWORD PTR [r10+rbx] |
|||
paddq xmm2, xmm3 |
|||
paddq xmm1, xmm6 |
|||
movdqa XMMWORD PTR [r9+rbx], xmm4 |
|||
movdqa XMMWORD PTR [rcx+rbx], xmm2 |
|||
movdqa XMMWORD PTR [r10+rbx], xmm1 |
|||
|
|||
/* Age the carried values (xmm4 <- xmm3, xmm3 <- xmm5 below), update and */
/* store the accumulators, and derive the next offset in r10. */
movdqa xmm4, xmm3 |
|||
add r8, rdx |
|||
add r11, rax |
|||
mov QWORD PTR [r12], r8 |
|||
xor r8, rsi |
|||
mov QWORD PTR [r12+8], r11 |
|||
mov r10, r8 |
|||
xor r11, r13 |
|||
and r10d, 2097136 |
|||
movdqa xmm3, xmm5 |
|||
dec ebp |
|||
jne cnv2_main_loop_bulldozer |
|||
|
|||
/* Epilogue of the bulldozer loop: restore MXCSR, xmm6..xmm8 and the saved */
/* registers, then jump over the fix-up routine. */
ldmxcsr DWORD PTR [rsp] |
|||
movaps xmm6, XMMWORD PTR [rsp+48] |
|||
/* r11 = rsp + 64 = the stack pointer right after the prologue's pushes. */
/* r11+56 = 5 pushes * 8 + 16, i.e. the [rsp+16] slot written at entry. */
lea r11, QWORD PTR [rsp+64] |
|||
mov rbx, QWORD PTR [r11+56] |
|||
mov rbp, QWORD PTR [r11+64] |
|||
mov rsi, QWORD PTR [r11+72] |
|||
/* r11-48 is the same slot as rsp+16. */
movaps xmm8, XMMWORD PTR [r11-48] |
|||
movaps xmm7, XMMWORD PTR [rsp+32] |
|||
mov rsp, r11 |
|||
pop r15 |
|||
pop r14 |
|||
pop r13 |
|||
pop r12 |
|||
pop rdi |
|||
/* No ret here: control continues after the end label of this fragment. */
jmp cnv2_main_loop_bulldozer_endp |
|||
|
|||
/* Square-root correction for the bulldozer loop, taken when the low 19 */
/* bits of the sqrtsd result (rdi) are all zero. Recomputes rdi with */
/* integer arithmetic and returns to sqrt_fixup_bulldozer_ret. */
sqrt_fixup_bulldozer: |
|||
/* r9 = square-root input before the >> 12: result.low + r15. */
movq r9, xmm5 |
|||
add r9, r15 |
|||
dec rdi |
|||
/* rdx = 0xFFFFFC0200000000: the 32-bit mov zero-extends -1022 and the */
/* shift moves it to the upper half. */
mov edx, -1022 |
|||
shl rdx, 32 |
|||
mov rax, rdi |
|||
shr rdi, 19 |
|||
shr rax, 20 |
|||
mov rcx, rdi |
|||
sub rcx, rax |
|||
lea rcx, [rcx+rdx+1] |
|||
add rax, rdx |
|||
imul rcx, rax |
|||
/* The borrow of this compare (rcx below r9, unsigned) is added to rdi. */
sub rcx, r9 |
|||
adc rdi, 0 |
|||
jmp sqrt_fixup_bulldozer_ret |
|||
|
|||
/* End of the fragment; the epilogue jumps here. */
cnv2_main_loop_bulldozer_endp: |
@ -0,0 +1,186 @@ |
|||
/* Single-hash main loop, "ivybridge" variant: prologue and setup. */
/* Input: rcx points to a state structure; offsets 0..111 and the pointer */
/* at offset 224 are read. The fragment has no entry label and no ret of */
/* its own; it is meant to be embedded in a function defined elsewhere. */
mov QWORD PTR [rsp+24], rbx |
|||
push rbp |
|||
push rsi |
|||
push rdi |
|||
push r12 |
|||
push r13 |
|||
push r14 |
|||
push r15 |
|||
/* 80 bytes of locals: MXCSR scratch at +0/+4, a zero qword at +16, */
/* xmm8/xmm7/xmm6 at +32/+48/+64. */
sub rsp, 80 |
|||
|
|||
/* Save the caller's MXCSR and load 24448 = 0x5F80: all SSE exceptions */
/* masked, rounding control = round toward +infinity. */
stmxcsr DWORD PTR [rsp] |
|||
mov DWORD PTR [rsp+4], 24448 |
|||
ldmxcsr DWORD PTR [rsp+4] |
|||
|
|||
mov rax, QWORD PTR [rcx+48] |
|||
mov r9, rcx |
|||
xor rax, QWORD PTR [rcx+16] |
|||
/* esi = loop counter: 524288 (0x80000) iterations. */
mov esi, 524288 |
|||
/* r8 = [32]^[0], r11 = [40]^[8]: the two halves of the round key. */
mov r8, QWORD PTR [rcx+32] |
|||
/* r13d = 0x80000001, ORed into every divisor in the loop. */
mov r13d, -2147483647 |
|||
xor r8, QWORD PTR [rcx] |
|||
mov r11, QWORD PTR [rcx+40] |
|||
mov r10, r8 |
|||
mov rdx, QWORD PTR [rcx+56] |
|||
movq xmm4, rax |
|||
xor rdx, QWORD PTR [rcx+24] |
|||
xor r11, QWORD PTR [rcx+8] |
|||
/* rbx = memory base used by every access in the loop. */
mov rbx, QWORD PTR [rcx+224] |
|||
mov rax, QWORD PTR [r9+80] |
|||
xor rax, QWORD PTR [r9+64] |
|||
movq xmm0, rdx |
|||
mov rcx, QWORD PTR [rcx+88] |
|||
xor rcx, QWORD PTR [r9+72] |
|||
movq xmm3, QWORD PTR [r9+104] |
|||
movaps XMMWORD PTR [rsp+64], xmm6 |
|||
movaps XMMWORD PTR [rsp+48], xmm7 |
|||
movaps XMMWORD PTR [rsp+32], xmm8 |
|||
/* r10d = first offset; 2097136 = 0x1FFFF0 keeps bits 4..20. */
and r10d, 2097136 |
|||
movq xmm5, rax |
|||
|
|||
/* Zero the qword at [rsp+16]; the loop subtracts it from xmm3 after each */
/* sqrtsd. Only these 8 bytes are initialised, the 16-byte psubq also */
/* reads [rsp+24..31], but only the low qword of xmm3 is used afterwards. */
xor eax, eax |
|||
mov QWORD PTR [rsp+16], rax |
|||
|
|||
/* Only the low 12 bits of rax survive the 52-bit shift, and the 16-bit */
/* mov sets all of them: xmm8 = 1023 << 52, the bit pattern of double 1.0. */
mov ax, 1023 |
|||
shl rax, 52 |
|||
movq xmm8, rax |
|||
mov r15, QWORD PTR [r9+96] |
|||
/* xmm4 = { [48]^[16], [56]^[24] }, xmm5 = { [80]^[64], [88]^[72] }. */
punpcklqdq xmm4, xmm0 |
|||
movq xmm0, rcx |
|||
punpcklqdq xmm5, xmm0 |
|||
/* Preload the first 16-byte chunk. */
movdqu xmm6, XMMWORD PTR [r10+rbx] |
|||
|
|||
/* Single-hash main loop body, "ivybridge" variant. */
/* On entry: r10 = offset, rbx = memory base, xmm6 = preloaded chunk, */
/* r8/r11 = key halves and accumulators, xmm4/xmm5 = carried values, */
/* r15 = previous division result, xmm3 (low qword) = previous square-root */
/* result, r13d = 0x80000001, rsi = loop counter. */
ALIGN(64) |
|||
main_loop_ivybridge: |
|||
lea rdx, QWORD PTR [r10+rbx] |
|||
/* Neighbouring chunks: rcx = ^16, rax = ^32, r10 = ^48. */
mov ecx, r10d |
|||
mov eax, r10d |
|||
mov rdi, r15 |
|||
xor ecx, 16 |
|||
xor eax, 32 |
|||
xor r10d, 48 |
|||
/* xmm7 = { r8, r11 }: round key. */
movq xmm0, r11 |
|||
movq xmm7, r8 |
|||
punpcklqdq xmm7, xmm0 |
|||
aesenc xmm6, xmm7 |
|||
/* rbp = low qword of the AES result, r9 = next offset. */
movq rbp, xmm6 |
|||
mov r9, rbp |
|||
and r9d, 2097136 |
|||
movdqu xmm2, XMMWORD PTR [rcx+rbx] |
|||
movdqu xmm1, XMMWORD PTR [rax+rbx] |
|||
movdqu xmm0, XMMWORD PTR [r10+rbx] |
|||
/* Shuffle: [^16] = old[^48] + xmm5, [^32] = old[^16] + xmm4, */
/* [^48] = old[^32] + xmm7. */
paddq xmm1, xmm7 |
|||
paddq xmm0, xmm5 |
|||
paddq xmm2, xmm4 |
|||
movdqu XMMWORD PTR [rcx+rbx], xmm0 |
|||
movdqu XMMWORD PTR [rax+rbx], xmm2 |
|||
movdqu XMMWORD PTR [r10+rbx], xmm1 |
|||
/* r10 = next offset ^ 32. */
mov r10, r9 |
|||
xor r10d, 32 |
|||
/* rdi = r15 ^ (previous sqrt << 32); XORed with memory below. */
movq rcx, xmm3 |
|||
mov rax, rcx |
|||
shl rax, 32 |
|||
xor rdi, rax |
|||
/* Write result ^ xmm4 back to the chunk that was read. */
movdqa xmm0, xmm6 |
|||
pxor xmm0, xmm4 |
|||
movdqu XMMWORD PTR [rdx], xmm0 |
|||
xor rdi, QWORD PTR [r9+rbx] |
|||
lea r14, QWORD PTR [r9+rbx] |
|||
mov r12, QWORD PTR [r14+8] |
|||
xor edx, edx |
|||
/* Divisor = (2 * previous sqrt + result.low) truncated to 32 bits with */
/* 0x80000001 ORed in; dividend = high qword of the AES result. */
lea r9d, DWORD PTR [ecx+ecx] |
|||
add r9d, ebp |
|||
movdqa xmm0, xmm6 |
|||
psrldq xmm0, 8 |
|||
or r9d, r13d |
|||
movq rax, xmm0 |
|||
div r9 |
|||
xorps xmm3, xmm3 |
|||
/* rdx = r15 = (quotient truncated to 32 bits) + (remainder << 32). */
mov eax, eax |
|||
shl rdx, 32 |
|||
add rdx, rax |
|||
/* r9 = square-root input before the shift: result.low + division result. */
lea r9, QWORD PTR [rdx+rbp] |
|||
mov r15, rdx |
|||
mov rax, r9 |
|||
shr rax, 12 |
|||
movq xmm0, rax |
|||
paddq xmm0, xmm8 |
|||
sqrtsd xmm3, xmm0 |
|||
/* Subtracts the qword zeroed in the prologue, so the low lane is */
/* unchanged; the memory operand must be 16-byte aligned. */
/* NOTE(review): the reason for this no-op subtraction is not evident */
/* from this file. */
psubq xmm3, XMMWORD PTR [rsp+16] |
|||
movq rdx, xmm3 |
|||
/* 524287 = 0x7FFFF: if the low 19 bits are all zero the result is */
/* recomputed with integer arithmetic in sqrt_fixup_ivybridge. */
test edx, 524287 |
|||
je sqrt_fixup_ivybridge |
|||
psrlq xmm3, 19 |
|||
sqrt_fixup_ivybridge_ret: |
|||
|
|||
/* rcx = next offset ^ 32; rdx:rax = rdi * rbp (unsigned). */
mov ecx, r10d |
|||
mov rax, rdi |
|||
mul rbp |
|||
/* xmm2 = { high, low } of the product; rdx and rax are additionally */
/* XORed with chunk [^32] before being added to the accumulators. */
movq xmm2, rdx |
|||
xor rdx, [rcx+rbx] |
|||
add r8, rdx |
|||
mov QWORD PTR [r14], r8 |
|||
xor r8, rdi |
|||
/* edi = offset for the next iteration. */
mov edi, r8d |
|||
and edi, 2097136 |
|||
movq xmm0, rax |
|||
xor rax, [rcx+rbx+8] |
|||
add r11, rax |
|||
mov QWORD PTR [r14+8], r11 |
|||
punpcklqdq xmm2, xmm0 |
|||
|
|||
/* r9 = offset ^ 16, r10 = offset ^ 48 (r10 held offset ^ 32). */
mov r9d, r10d |
|||
xor r9d, 48 |
|||
xor r10d, 16 |
|||
/* pxor with a memory operand: the chunk address must be 16-byte aligned. */
pxor xmm2, XMMWORD PTR [r9+rbx] |
|||
movdqu xmm0, XMMWORD PTR [r10+rbx] |
|||
paddq xmm0, xmm5 |
|||
movdqu xmm1, XMMWORD PTR [rcx+rbx] |
|||
paddq xmm2, xmm4 |
|||
paddq xmm1, xmm7 |
|||
/* Age the carried values: xmm5 <- xmm4, xmm4 <- xmm6. */
movdqa xmm5, xmm4 |
|||
/* Second shuffle: [^16] = old[^48] + old xmm5, */
/* [^32] = (old[^16] ^ product) + old xmm4, [^48] = old[^32] + xmm7. */
movdqu XMMWORD PTR [r9+rbx], xmm0 |
|||
movdqa xmm4, xmm6 |
|||
movdqu XMMWORD PTR [rcx+rbx], xmm2 |
|||
movdqu XMMWORD PTR [r10+rbx], xmm1 |
|||
/* Preload the chunk for the next iteration. */
movdqu xmm6, [rdi+rbx] |
|||
mov r10d, edi |
|||
xor r11, r12 |
|||
dec rsi |
|||
jne main_loop_ivybridge |
|||
|
|||
/* Epilogue of the ivybridge loop: restore MXCSR, rbx, xmm6..xmm8 and the */
/* pushed registers, then jump over the fix-up routine. */
ldmxcsr DWORD PTR [rsp] |
|||
/* 160 = 80 bytes of locals + 7 pushes * 8 + 24, i.e. the [rsp+24] slot */
/* written at entry. */
mov rbx, QWORD PTR [rsp+160] |
|||
movaps xmm6, XMMWORD PTR [rsp+64] |
|||
movaps xmm7, XMMWORD PTR [rsp+48] |
|||
movaps xmm8, XMMWORD PTR [rsp+32] |
|||
add rsp, 80 |
|||
/* Pops mirror the push order of the prologue. */
pop r15 |
|||
pop r14 |
|||
pop r13 |
|||
pop r12 |
|||
pop rdi |
|||
pop rsi |
|||
pop rbp |
|||
/* No ret here: control continues after the end label of this fragment. */
jmp cnv2_main_loop_ivybridge_endp |
|||
|
|||
/* Square-root correction for the ivybridge loop, taken when the low 19 */
/* bits of the sqrtsd result (rdx) are all zero. Recomputes the value with */
/* integer arithmetic, leaves it in xmm3 and returns to */
/* sqrt_fixup_ivybridge_ret. r9 holds the square-root input before the */
/* >> 12. r13 is borrowed as scratch and restored to 0x80000001. */
sqrt_fixup_ivybridge: |
|||
dec rdx |
|||
/* r13 = 0xFFFFFC0200000000: the 32-bit mov zero-extends -1022 and the */
/* shift moves it to the upper half. */
mov r13d, -1022 |
|||
shl r13, 32 |
|||
mov rax, rdx |
|||
shr rdx, 19 |
|||
shr rax, 20 |
|||
mov rcx, rdx |
|||
sub rcx, rax |
|||
add rax, r13 |
|||
/* Subtracting NOT r13 is the same as adding r13 + 1. */
not r13 |
|||
sub rcx, r13 |
|||
/* Restore the divisor mask the main loop expects in r13d. */
mov r13d, -2147483647 |
|||
imul rcx, rax |
|||
/* The borrow of this compare (rcx below r9, unsigned) is added to rdx. */
sub rcx, r9 |
|||
adc rdx, 0 |
|||
movq xmm3, rdx |
|||
jmp sqrt_fixup_ivybridge_ret |
|||
|
|||
/* End of the fragment; the epilogue jumps here. */
cnv2_main_loop_ivybridge_endp: |
@ -0,0 +1,179 @@ |
|||
/* Setup for the ryzen main loop: save callee state, switch MXCSR and load
   the working registers from the structure addressed by rcx. */
/* rbx/rbp/rsi are parked at [rsp+16], [rsp+24], [rsp+32], i.e. above the
   return address. NOTE(review): assumes the caller reserved that space. */
mov QWORD PTR [rsp+16], rbx |
|||
mov QWORD PTR [rsp+24], rbp |
|||
mov QWORD PTR [rsp+32], rsi |
|||
push rdi |
|||
push r12 |
|||
push r13 |
|||
push r14 |
|||
push r15 |
|||
/* 64 bytes of locals: [rsp] old MXCSR, [rsp+4] new MXCSR,
   [rsp+16]/[rsp+32]/[rsp+48] save slots for xmm8/xmm7/xmm6. */
sub rsp, 64 |
|||
|
|||
/* Save the current MXCSR, then load 24448 (0x5F80).
   NOTE(review): 0x5F80 is the 0x1F80 reset value plus bit 14, which looks
   like a rounding-control change -- confirm against the MXCSR layout. */
stmxcsr DWORD PTR [rsp] |
|||
mov DWORD PTR [rsp+4], 24448 |
|||
ldmxcsr DWORD PTR [rsp+4] |
|||
|
|||
/* rax = [rcx+48] ^ [rcx+16]; becomes the low half of xmm3. */
mov rax, QWORD PTR [rcx+48] |
|||
/* r9 keeps the structure pointer because rcx is overwritten below. */
mov r9, rcx |
|||
xor rax, QWORD PTR [rcx+16] |
|||
/* ebp = 524288 (0x80000): iteration counter, decremented at the loop end. */
mov ebp, 524288 |
|||
/* r8 = [rcx+32] ^ [rcx]. */
mov r8, QWORD PTR [rcx+32] |
|||
xor r8, QWORD PTR [rcx] |
|||
/* r11 = [rcx+40] ^ [rcx+8] (xor completed a few lines down). */
mov r11, QWORD PTR [rcx+40] |
|||
/* r10 = r8, masked below to form the first offset relative to rbx. */
mov r10, r8 |
|||
/* rdx = [rcx+56] ^ [rcx+24]; becomes the high half of xmm3. */
mov rdx, QWORD PTR [rcx+56] |
|||
movq xmm3, rax |
|||
xor rdx, QWORD PTR [rcx+24] |
|||
xor r11, QWORD PTR [rcx+8] |
|||
/* rbx = base pointer for every [reg+rbx] access in the loop. */
mov rbx, QWORD PTR [rcx+224] |
|||
/* rax = [r9+80] ^ [r9+64]; becomes the low half of xmm4. */
mov rax, QWORD PTR [r9+80] |
|||
xor rax, QWORD PTR [r9+64] |
|||
movq xmm0, rdx |
|||
/* rcx = [rcx+88] ^ [r9+72]; becomes the high half of xmm4. */
mov rcx, QWORD PTR [rcx+88] |
|||
xor rcx, QWORD PTR [r9+72] |
|||
mov rdi, QWORD PTR [r9+104] |
|||
/* 2097136 = 0x1FFFF0: keeps the offset 16-byte aligned and below 2 MiB. */
and r10d, 2097136 |
|||
/* xmm6..xmm8 are saved here and restored in the epilogue. */
movaps XMMWORD PTR [rsp+48], xmm6 |
|||
movq xmm4, rax |
|||
movaps XMMWORD PTR [rsp+32], xmm7 |
|||
movaps XMMWORD PTR [rsp+16], xmm8 |
|||
/* xmm8 = 0 for the whole loop. */
xorps xmm8, xmm8 |
|||
/* Only ax is written, but the shift by 52 discards everything except the
   low 12 bits, so rax = 1023 << 52 = 0x3FF0000000000000 regardless of the
   stale upper bits. The value is kept in xmm7. */
mov ax, 1023 |
|||
shl rax, 52 |
|||
movq xmm7, rax |
|||
mov r15, QWORD PTR [r9+96] |
|||
/* xmm3 = { rax(first pair), rdx } */
punpcklqdq xmm3, xmm0 |
|||
movq xmm0, rcx |
|||
/* xmm4 = { rax(second pair), rcx } */
punpcklqdq xmm4, xmm0 |
|||
|
|||
/* One iteration per pass; runs until ebp (set to 524288 in the setup)
   reaches zero. Register roles visible in this block:
     rbx  base pointer, r10 current 16-byte-aligned offset
     r8/r11  64-bit state pair, xmm3/xmm4  values carried between iterations
     rdi/r15 inputs to the divide/sqrt step, xmm7 = 1023<<52, xmm8 = 0 */
ALIGN(64) |
|||
main_loop_ryzen: |
|||
/* xmm5 = 16 bytes at [rbx+r10]; xmm6 = { r8, r11 } is the round key. */
movdqa xmm5, XMMWORD PTR [r10+rbx] |
|||
movq xmm0, r11 |
|||
movq xmm6, r8 |
|||
punpcklqdq xmm6, xmm0 |
|||
/* rdx remembers the address of the current chunk for the write-back below. */
lea rdx, QWORD PTR [r10+rbx] |
|||
/* r9 = 2*rdi; rdi is then moved into the upper 32 bits. */
lea r9, QWORD PTR [rdi+rdi] |
|||
shl rdi, 32 |
|||
|
|||
/* Offsets of the three neighbouring 16-byte chunks: r10^16, r10^32, r10^48. */
mov ecx, r10d |
|||
mov eax, r10d |
|||
xor ecx, 16 |
|||
xor eax, 32 |
|||
xor r10d, 48 |
|||
aesenc xmm5, xmm6 |
|||
movdqa xmm2, XMMWORD PTR [rcx+rbx] |
|||
movdqa xmm1, XMMWORD PTR [rax+rbx] |
|||
movdqa xmm0, XMMWORD PTR [r10+rbx] |
|||
paddq xmm2, xmm3 |
|||
paddq xmm1, xmm6 |
|||
paddq xmm0, xmm4 |
|||
/* Rotate the neighbours while adding:
     [^16] <- [^48] + xmm4, [^32] <- [^16] + xmm3, [^48] <- [^32] + xmm6 */
movdqa XMMWORD PTR [rcx+rbx], xmm0 |
|||
movdqa XMMWORD PTR [rax+rbx], xmm2 |
|||
movdqa XMMWORD PTR [r10+rbx], xmm1 |
|||
|
|||
movaps xmm1, xmm8 |
|||
/* rsi = r15 ^ (rdi << 32), later xored with the first qword of the next chunk. */
mov rsi, r15 |
|||
xor rsi, rdi |
|||
/* r14 = low 64 bits of the AES result. */
movq r14, xmm5 |
|||
movdqa xmm0, xmm5 |
|||
pxor xmm0, xmm3 |
|||
/* Next offset = low bits of the AES result masked with 0x1FFFF0. */
mov r10, r14 |
|||
and r10d, 2097136 |
|||
/* Write (AES result ^ xmm3) back to the chunk that was read at loop entry. */
movdqa XMMWORD PTR [rdx], xmm0 |
|||
xor rsi, QWORD PTR [r10+rbx] |
|||
/* r12 = address of the new chunk, r13 = its second qword (used at loop end). */
lea r12, QWORD PTR [r10+rbx] |
|||
mov r13, QWORD PTR [r10+rbx+8] |
|||
|
|||
/* Divisor r9d = (2*rdi_old + low32(r14)) | 0x80000001: bit 31 and bit 0 are
   forced on, so it is never zero. The 32-bit ops clear the top half of r9. */
add r9d, r14d |
|||
or r9d, -2147483647 |
|||
xor edx, edx |
|||
/* rax = high 64 bits of the AES result = dividend (rdx:rax with rdx = 0). */
movdqa xmm0, xmm5 |
|||
psrldq xmm0, 8 |
|||
movq rax, xmm0 |
|||
|
|||
div r9 |
|||
movq xmm0, rax |
|||
movq xmm1, rdx |
|||
/* Low qword of xmm0 = low32(quotient) | low32(remainder) << 32. */
punpckldq xmm0, xmm1 |
|||
movq r15, xmm0 |
|||
paddq xmm0, xmm5 |
|||
/* xmm2 keeps the pre-shift value; sqrt_fixup_ryzen reads it. */
movdqa xmm2, xmm0 |
|||
/* (value >> 12) + (1023 << 52) is reinterpreted as a double for sqrtsd. */
psrlq xmm0, 12 |
|||
paddq xmm0, xmm7 |
|||
sqrtsd xmm1, xmm0 |
|||
movq rdi, xmm1 |
|||
/* 524287 = 0x7FFFF: if the low 19 bits of the raw result are all zero the
   out-of-line fixup path is taken; otherwise rdi = result >> 19. */
test rdi, 524287 |
|||
je sqrt_fixup_ryzen |
|||
shr rdi, 19 |
|||
|
|||
/* Both the fast path and the fixup path continue here with rdi final. */
sqrt_fixup_ryzen_ret: |
|||
/* rdx:rax = rsi * r14 (unsigned 128-bit product); xmm0 = { high, low }. */
mov rax, rsi |
|||
mul r14 |
|||
movq xmm1, rax |
|||
movq xmm0, rdx |
|||
punpcklqdq xmm0, xmm1 |
|||
|
|||
/* Neighbours of the new chunk: r9 = ^16, rcx = ^32, r10 = ^48. */
mov r9d, r10d |
|||
mov ecx, r10d |
|||
xor r9d, 16 |
|||
xor ecx, 32 |
|||
xor r10d, 48 |
|||
movdqa xmm1, XMMWORD PTR [rcx+rbx] |
|||
/* Product halves are xored with the (still unmodified) ^32 chunk. */
xor rdx, [rcx+rbx] |
|||
xor rax, [rcx+rbx+8] |
|||
movdqa xmm2, XMMWORD PTR [r9+rbx] |
|||
/* ^16 chunk is xored with { high, low } of the product before the add. */
pxor xmm2, xmm0 |
|||
paddq xmm4, XMMWORD PTR [r10+rbx] |
|||
paddq xmm2, xmm3 |
|||
paddq xmm1, xmm6 |
|||
/* [^16] <- [^48] + xmm4, [^32] <- ([^16] ^ product) + xmm3,
   [^48] <- [^32] + xmm6 */
movdqa XMMWORD PTR [r9+rbx], xmm4 |
|||
movdqa XMMWORD PTR [rcx+rbx], xmm2 |
|||
movdqa XMMWORD PTR [r10+rbx], xmm1 |
|||
|
|||
/* Carry values forward: xmm4 <- xmm3 here, xmm3 <- AES result below. */
movdqa xmm4, xmm3 |
|||
add r8, rdx |
|||
add r11, rax |
|||
/* Store the updated pair into the new chunk, then fold in what was read
   from it earlier (rsi, r13) to form the next state. */
mov QWORD PTR [r12], r8 |
|||
xor r8, rsi |
|||
mov QWORD PTR [r12+8], r11 |
|||
mov r10, r8 |
|||
xor r11, r13 |
|||
and r10d, 2097136 |
|||
movdqa xmm3, xmm5 |
|||
dec ebp |
|||
jne main_loop_ryzen |
|||
|
|||
/* Epilogue: undo everything the setup did, then jump past the out-of-line
   fixup code to the end label. */
/* Restore the MXCSR value saved by stmxcsr in the setup. */
ldmxcsr DWORD PTR [rsp] |
|||
movaps xmm6, XMMWORD PTR [rsp+48] |
|||
/* r11 = stack pointer as it was before `sub rsp, 64`. Five pushes (40 bytes)
   sit above it, so r11+56/64/72 are the entry-time [rsp+16]/[rsp+24]/[rsp+32]
   slots where rbx/rbp/rsi were stored. */
lea r11, QWORD PTR [rsp+64] |
|||
mov rbx, QWORD PTR [r11+56] |
|||
mov rbp, QWORD PTR [r11+64] |
|||
mov rsi, QWORD PTR [r11+72] |
|||
/* [r11-48] is the same slot as [rsp+16], where xmm8 was saved. */
movaps xmm8, XMMWORD PTR [r11-48] |
|||
movaps xmm7, XMMWORD PTR [rsp+32] |
|||
mov rsp, r11 |
|||
/* Pops mirror the push order of the setup. */
pop r15 |
|||
pop r14 |
|||
pop r13 |
|||
pop r12 |
|||
pop rdi |
|||
jmp cnv2_main_loop_ryzen_endp |
|||
|
|||
/* Out-of-line path taken when the low 19 bits of the raw sqrtsd result are
   zero. On entry rdi holds the raw result bits and xmm2 the value that was
   fed (after shifting) into sqrtsd. Leaves the final value in rdi and jumps
   back into the loop; clobbers rax, rcx, rdx and r9.
   NOTE(review): this looks like an exact integer correction of the rounded
   square root -- confirm against the reference implementation. */
sqrt_fixup_ryzen: |
|||
/* r9 = low qword of xmm2 (pre-shift sqrt input). */
movq r9, xmm2 |
|||
dec rdi |
|||
/* rdx = 0xFFFFFC02 << 32 (the 32-bit move zero-extends -1022 first). */
mov edx, -1022 |
|||
shl rdx, 32 |
|||
/* rdi = (raw-1) >> 19, rax = (raw-1) >> 20. */
mov rax, rdi |
|||
shr rdi, 19 |
|||
shr rax, 20 |
|||
mov rcx, rdi |
|||
sub rcx, rax |
|||
/* rcx = (rdi - rax + rdx + 1) * (rax + rdx), all modulo 2^64. */
lea rcx, [rcx+rdx+1] |
|||
add rax, rdx |
|||
imul rcx, rax |
|||
/* The borrow of rcx - r9 decides whether rdi is bumped by one. */
sub rcx, r9 |
|||
adc rdi, 0 |
|||
jmp sqrt_fixup_ryzen_ret |
|||
|
|||
/* Jump target of the epilogue; execution continues in the including file. */
cnv2_main_loop_ryzen_endp: |
@ -0,0 +1,54 @@ |
|||
/* Preamble for the cnv2 main-loop entry points: alignment and symbol-name
   macros, assembler mode, section selection and exported symbols. */
/* ALIGN(x) ignores its argument; it expands to `.align 6` on Apple targets
   and `.align 64` elsewhere.
   NOTE(review): presumably the Apple assembler takes a power-of-two
   exponent (2^6 = 64) -- confirm against the assembler documentation. */
#ifdef __APPLE__ |
|||
# define ALIGN(x) .align 6 |
|||
#else |
|||
# define ALIGN(x) .align 64 |
|||
#endif |
|||
/* Intel operand order, register names without the % prefix. */
.intel_syntax noprefix |
|||
/* FN_PREFIX(fn) prepends an underscore to the symbol name on Apple targets
   and leaves it unchanged elsewhere. */
#ifdef __APPLE__ |
|||
# define FN_PREFIX(fn) _ ## fn |
|||
.text |
|||
#else |
|||
# define FN_PREFIX(fn) fn |
|||
.section .text |
|||
#endif |
|||
/* Symbols exported to the linker; each is defined further down this file. */
.global FN_PREFIX(cnv2_mainloop_ivybridge_asm) |
|||
.global FN_PREFIX(cnv2_mainloop_ryzen_asm) |
|||
.global FN_PREFIX(cnv2_mainloop_bulldozer_asm) |
|||
.global FN_PREFIX(cnv2_double_mainloop_sandybridge_asm) |
|||
|
|||
/* Entry point that wraps the included ivybridge loop body. */
ALIGN(64) |
|||
FN_PREFIX(cnv2_mainloop_ivybridge_asm): |
|||
/* Reserve 48 bytes of stack and copy rdi into rcx before the body runs.
   NOTE(review): presumably adapts the caller's first-argument register to
   the rcx the included body expects -- confirm against the .inc file. */
sub rsp, 48 |
|||
mov rcx, rdi |
|||
#include "cn2/cnv2_main_loop_ivybridge.inc" |
|||
add rsp, 48 |
|||
ret 0 |
|||
/* 3735929054 = 0xDEADC0DE. Sits after `ret`, so it is never executed.
   NOTE(review): looks like an end-of-function marker -- confirm its user. */
mov eax, 3735929054 |
|||
|
|||
/* Entry point that wraps the included ryzen loop body. */
ALIGN(64) |
|||
FN_PREFIX(cnv2_mainloop_ryzen_asm): |
|||
/* Reserve 48 bytes of stack and copy rdi into rcx; the included body reads
   its input structure through rcx and stores rbx/rbp/rsi at
   [rsp+16]..[rsp+32], which falls inside this reserved area. */
sub rsp, 48 |
|||
mov rcx, rdi |
|||
#include "cn2/cnv2_main_loop_ryzen.inc" |
|||
add rsp, 48 |
|||
ret 0 |
|||
/* 3735929054 = 0xDEADC0DE. Sits after `ret`, so it is never executed.
   NOTE(review): looks like an end-of-function marker -- confirm its user. */
mov eax, 3735929054 |
|||
|
|||
/* Entry point that wraps the included bulldozer loop body. */
ALIGN(64) |
|||
FN_PREFIX(cnv2_mainloop_bulldozer_asm): |
|||
/* Reserve 48 bytes of stack and copy rdi into rcx before the body runs.
   NOTE(review): presumably adapts the caller's first-argument register to
   the rcx the included body expects -- confirm against the .inc file. */
sub rsp, 48 |
|||
mov rcx, rdi |
|||
#include "cn2/cnv2_main_loop_bulldozer.inc" |
|||
add rsp, 48 |
|||
ret 0 |
|||
/* 3735929054 = 0xDEADC0DE. Sits after `ret`, so it is never executed.
   NOTE(review): looks like an end-of-function marker -- confirm its user. */
mov eax, 3735929054 |
|||
|
|||
/* Entry point that wraps the included sandybridge double-loop body. */
ALIGN(64) |
|||
FN_PREFIX(cnv2_double_mainloop_sandybridge_asm): |
|||
/* Reserve 48 bytes of stack, then copy rdi into rcx and rsi into rdx.
   NOTE(review): presumably adapts the caller's first two argument registers
   to the rcx/rdx the included body expects -- confirm against the .inc. */
sub rsp, 48 |
|||
mov rcx, rdi |
|||
mov rdx, rsi |
|||
#include "cn2/cnv2_double_main_loop_sandybridge.inc" |
|||
add rsp, 48 |
|||
ret 0 |
|||
/* 3735929054 = 0xDEADC0DE. Sits after `ret`, so it is never executed.
   NOTE(review): looks like an end-of-function marker -- confirm its user. */
mov eax, 3735929054 |
@ -0,0 +1,31 @@ |
|||
/* Preamble for the variant of the entry points that includes the loop
   bodies from ../cn2/ without any register or stack adaptation. */
/* ALIGN(x) ignores its argument and always emits `.align 64`. */
#define ALIGN(x) .align 64 |
|||
/* Intel operand order, register names without the % prefix. */
.intel_syntax noprefix |
|||
.section .text |
|||
/* Symbols exported to the linker; each is defined further down this file. */
.global cnv2_mainloop_ivybridge_asm |
|||
.global cnv2_mainloop_ryzen_asm |
|||
.global cnv2_mainloop_bulldozer_asm |
|||
.global cnv2_double_mainloop_sandybridge_asm |
|||
|
|||
/* Entry point: the included ivybridge body runs directly on the caller's
   registers and stack, followed by the return. */
ALIGN(64) |
|||
cnv2_mainloop_ivybridge_asm: |
|||
#include "../cn2/cnv2_main_loop_ivybridge.inc" |
|||
ret 0 |
|||
/* 3735929054 = 0xDEADC0DE. Sits after `ret`, so it is never executed.
   NOTE(review): looks like an end-of-function marker -- confirm its user. */
mov eax, 3735929054 |
|||
|
|||
/* Entry point: the included ryzen body runs directly on the caller's
   registers and stack (it reads its input through rcx and uses
   [rsp+16]..[rsp+32] above the return address), followed by the return. */
ALIGN(64) |
|||
cnv2_mainloop_ryzen_asm: |
|||
#include "../cn2/cnv2_main_loop_ryzen.inc" |
|||
ret 0 |
|||
/* 3735929054 = 0xDEADC0DE. Sits after `ret`, so it is never executed.
   NOTE(review): looks like an end-of-function marker -- confirm its user. */
mov eax, 3735929054 |
|||
|
|||
/* Entry point: the included bulldozer body runs directly on the caller's
   registers and stack, followed by the return. */
ALIGN(64) |
|||
cnv2_mainloop_bulldozer_asm: |
|||
#include "../cn2/cnv2_main_loop_bulldozer.inc" |
|||
ret 0 |
|||
/* 3735929054 = 0xDEADC0DE. Sits after `ret`, so it is never executed.
   NOTE(review): looks like an end-of-function marker -- confirm its user. */
mov eax, 3735929054 |
|||
|
|||
/* Entry point: the included sandybridge double-loop body runs directly on
   the caller's registers and stack, followed by the return. */
ALIGN(64) |
|||
cnv2_double_mainloop_sandybridge_asm: |
|||
#include "../cn2/cnv2_double_main_loop_sandybridge.inc" |
|||
ret 0 |
|||
/* 3735929054 = 0xDEADC0DE. Sits after `ret`, so it is never executed.
   NOTE(review): looks like an end-of-function marker -- confirm its user. */
mov eax, 3735929054 |
@ -1,212 +0,0 @@ |
|||
/*
|
|||
* This program is free software: you can redistribute it and/or modify |
|||
* it under the terms of the GNU General Public License as published by |
|||
* the Free Software Foundation, either version 3 of the License, or |
|||
* any later version. |
|||
* |
|||
* This program is distributed in the hope that it will be useful, |
|||
* but WITHOUT ANY WARRANTY; without even the implied warranty of |
|||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
|||
* GNU General Public License for more details. |
|||
* |
|||
* You should have received a copy of the GNU General Public License |
|||
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|||
* |
|||
* Additional permission under GNU GPL version 3 section 7 |
|||
* |
|||
* If you modify this Program, or any covered work, by linking or combining |
|||
* it with OpenSSL (or a modified version of that library), containing parts |
|||
* covered by the terms of OpenSSL License and SSLeay License, the licensors |
|||
* of this Program grant you additional permission to convey the resulting work. |
|||
* |
|||
*/ |
|||
|
|||
/*
|
|||
* The original author of this AES implementation is Karl Malbrain. |
|||
*/ |
|||
|
|||
#ifdef __GNUC__ |
|||
#include <x86intrin.h> |
|||
#else |
|||
#include <intrin.h> |
|||
#endif // __GNUC__
|
|||
|
|||
#include <inttypes.h> |
|||
|
|||
/* Basic constants. WPOLY is the value xored in by the f2/f4/f8 multiply
   macros; RC_LENGTH evaluates to 5 * (16 / 4 - 2) = 10. */
#define TABLE_ALIGN 32 |
|||
#define WPOLY 0x011b |
|||
#define N_COLS 4 |
|||
#define AES_BLOCK_SIZE 16 |
|||
#define RC_LENGTH (5 * (AES_BLOCK_SIZE / 4 - 2)) |
|||
|
|||
/* ALIGN: per-compiler alignment attribute for the lookup tables.
   NOTE(review): the MSVC branch uses TABLE_ALIGN (32) while the GCC branch
   hard-codes 16 -- confirm whether the difference is intentional. */
#if defined(_MSC_VER) |
|||
#define ALIGN __declspec(align(TABLE_ALIGN)) |
|||
#elif defined(__GNUC__) |
|||
#define ALIGN __attribute__ ((aligned(16))) |
|||
#else |
|||
#define ALIGN |
|||
#endif |
|||
|
|||
/* rf1 selects the row index; word_in/word_out read and write the c-th
   32-bit word of a buffer through a uint32_t pointer cast. */
#define rf1(r,c) (r) |
|||
#define word_in(x,c) (*((uint32_t*)(x)+(c))) |
|||
#define word_out(x,c,v) (*((uint32_t*)(x)+(c)) = (v)) |
|||
|
|||
/* s() indexes a state array; si/so copy one word in/out; state_in and
   state_out do so for all four words. Note that state_in/state_out expand
   to several statements, not a single expression. */
#define s(x,c) x[c] |
|||
#define si(y,x,c) (s(y,c) = word_in(x, c)) |
|||
#define so(y,x,c) word_out(y, c, s(x,c)) |
|||
#define state_in(y,x) si(y,x,0); si(y,x,1); si(y,x,2); si(y,x,3) |
|||
#define state_out(y,x) so(y,x,0); so(y,x,1); so(y,x,2); so(y,x,3) |
|||
/* round(y,x,k): y[i] = k[i] ^ four t_fn lookups, taking byte 0 of x[i],
   byte 1 of x[i+1], byte 2 of x[i+2] and byte 3 of x[i+3] (indices mod 4).
   Expands to four statements and evaluates x and k repeatedly. */
#define round(y,x,k) \ |
|||
y[0] = (k)[0] ^ (t_fn[0][x[0] & 0xff] ^ t_fn[1][(x[1] >> 8) & 0xff] ^ t_fn[2][(x[2] >> 16) & 0xff] ^ t_fn[3][x[3] >> 24]); \ |
|||
y[1] = (k)[1] ^ (t_fn[0][x[1] & 0xff] ^ t_fn[1][(x[2] >> 8) & 0xff] ^ t_fn[2][(x[3] >> 16) & 0xff] ^ t_fn[3][x[0] >> 24]); \ |
|||
y[2] = (k)[2] ^ (t_fn[0][x[2] & 0xff] ^ t_fn[1][(x[3] >> 8) & 0xff] ^ t_fn[2][(x[0] >> 16) & 0xff] ^ t_fn[3][x[1] >> 24]); \ |
|||
y[3] = (k)[3] ^ (t_fn[0][x[3] & 0xff] ^ t_fn[1][(x[0] >> 8) & 0xff] ^ t_fn[2][(x[1] >> 16) & 0xff] ^ t_fn[3][x[2] >> 24]); |
|||
/* to_byte keeps the low 8 bits; bval extracts byte n of x. */
#define to_byte(x) ((x) & 0xff) |
|||
#define bval(x,n) to_byte((x) >> (8 * (n))) |
|||
|
|||
/* fwd_var(x,r,c) selects state word (r + c) mod 4 of x. */
#define fwd_var(x,r,c)\ |
|||
( r == 0 ? ( c == 0 ? s(x,0) : c == 1 ? s(x,1) : c == 2 ? s(x,2) : s(x,3))\ |
|||
: r == 1 ? ( c == 0 ? s(x,1) : c == 1 ? s(x,2) : c == 2 ? s(x,3) : s(x,0))\ |
|||
: r == 2 ? ( c == 0 ? s(x,2) : c == 1 ? s(x,3) : c == 2 ? s(x,0) : s(x,1))\ |
|||
: ( c == 0 ? s(x,3) : c == 1 ? s(x,0) : c == 2 ? s(x,1) : s(x,2))) |
|||
|
|||
/* fwd_rnd computes one output column: key word xor the four-table lookup
   over t_fn (t_use(f,n) pastes to t_fn). */
#define fwd_rnd(y,x,k,c) (s(y,c) = (k)[c] ^ four_tables(x,t_use(f,n),fwd_var,rf1,c)) |
|||
|
|||
#define sb_data(w) {\ |
|||
w(0x63), w(0x7c), w(0x77), w(0x7b), w(0xf2), w(0x6b), w(0x6f), w(0xc5),\ |
|||
w(0x30), w(0x01), w(0x67), w(0x2b), w(0xfe), w(0xd7), w(0xab), w(0x76),\ |
|||
w(0xca), w(0x82), w(0xc9), w(0x7d), w(0xfa), w(0x59), w(0x47), w(0xf0),\ |
|||
w(0xad), w(0xd4), w(0xa2), w(0xaf), w(0x9c), w(0xa4), w(0x72), w(0xc0),\ |
|||
w(0xb7), w(0xfd), w(0x93), w(0x26), w(0x36), w(0x3f), w(0xf7), w(0xcc),\ |
|||
w(0x34), w(0xa5), w(0xe5), w(0xf1), w(0x71), w(0xd8), w(0x31), w(0x15),\ |
|||
w(0x04), w(0xc7), w(0x23), w(0xc3), w(0x18), w(0x96), w(0x05), w(0x9a),\ |
|||
w(0x07), w(0x12), w(0x80), w(0xe2), w(0xeb), w(0x27), w(0xb2), w(0x75),\ |
|||
w(0x09), w(0x83), w(0x2c), w(0x1a), w(0x1b), w(0x6e), w(0x5a), w(0xa0),\ |
|||
w(0x52), w(0x3b), w(0xd6), w(0xb3), w(0x29), w(0xe3), w(0x2f), w(0x84),\ |
|||
w(0x53), w(0xd1), w(0x00), w(0xed), w(0x20), w(0xfc), w(0xb1), w(0x5b),\ |
|||
w(0x6a), w(0xcb), w(0xbe), w(0x39), w(0x4a), w(0x4c), w(0x58), w(0xcf),\ |
|||
w(0xd0), w(0xef), w(0xaa), w(0xfb), w(0x43), w(0x4d), w(0x33), w(0x85),\ |
|||
w(0x45), w(0xf9), w(0x02), w(0x7f), w(0x50), w(0x3c), w(0x9f), w(0xa8),\ |
|||
w(0x51), w(0xa3), w(0x40), w(0x8f), w(0x92), w(0x9d), w(0x38), w(0xf5),\ |
|||
w(0xbc), w(0xb6), w(0xda), w(0x21), w(0x10), w(0xff), w(0xf3), w(0xd2),\ |
|||
w(0xcd), w(0x0c), w(0x13), w(0xec), w(0x5f), w(0x97), w(0x44), w(0x17),\ |
|||
w(0xc4), w(0xa7), w(0x7e), w(0x3d), w(0x64), w(0x5d), w(0x19), w(0x73),\ |
|||
w(0x60), w(0x81), w(0x4f), w(0xdc), w(0x22), w(0x2a), w(0x90), w(0x88),\ |
|||
w(0x46), w(0xee), w(0xb8), w(0x14), w(0xde), w(0x5e), w(0x0b), w(0xdb),\ |
|||
w(0xe0), w(0x32), w(0x3a), w(0x0a), w(0x49), w(0x06), w(0x24), w(0x5c),\ |
|||
w(0xc2), w(0xd3), w(0xac), w(0x62), w(0x91), w(0x95), w(0xe4), w(0x79),\ |
|||
w(0xe7), w(0xc8), w(0x37), w(0x6d), w(0x8d), w(0xd5), w(0x4e), w(0xa9),\ |
|||
w(0x6c), w(0x56), w(0xf4), w(0xea), w(0x65), w(0x7a), w(0xae), w(0x08),\ |
|||
w(0xba), w(0x78), w(0x25), w(0x2e), w(0x1c), w(0xa6), w(0xb4), w(0xc6),\ |
|||
w(0xe8), w(0xdd), w(0x74), w(0x1f), w(0x4b), w(0xbd), w(0x8b), w(0x8a),\ |
|||
w(0x70), w(0x3e), w(0xb5), w(0x66), w(0x48), w(0x03), w(0xf6), w(0x0e),\ |
|||
w(0x61), w(0x35), w(0x57), w(0xb9), w(0x86), w(0xc1), w(0x1d), w(0x9e),\ |
|||
w(0xe1), w(0xf8), w(0x98), w(0x11), w(0x69), w(0xd9), w(0x8e), w(0x94),\ |
|||
w(0x9b), w(0x1e), w(0x87), w(0xe9), w(0xce), w(0x55), w(0x28), w(0xdf),\ |
|||
w(0x8c), w(0xa1), w(0x89), w(0x0d), w(0xbf), w(0xe6), w(0x42), w(0x68),\ |
|||
w(0x41), w(0x99), w(0x2d), w(0x0f), w(0xb0), w(0x54), w(0xbb), w(0x16) } |
|||
|
|||
#define rc_data(w) {\ |
|||
w(0x01), w(0x02), w(0x04), w(0x08), w(0x10),w(0x20), w(0x40), w(0x80),\ |
|||
w(0x1b), w(0x36) } |
|||
|
|||
/*
 * Word-packing and GF(2^8) multiplication helpers used to generate the AES
 * lookup tables at compile time.
 *
 * Every macro argument is parenthesized so that compound expressions such as
 * f2(a | b) expand with the intended precedence. Arguments are still
 * evaluated more than once, so they must be free of side effects.
 */

/* Pack four bytes into one 32-bit word, b0 in the least significant byte. */
#define bytes2word(b0, b1, b2, b3) (((uint32_t)(b3) << 24) | \
    ((uint32_t)(b2) << 16) | ((uint32_t)(b1) << 8) | (b0))

/* Identity and single-byte placement in byte lane 0..3. */
#define h0(x)   (x)
#define w0(p)   bytes2word((p), 0, 0, 0)
#define w1(p)   bytes2word(0, (p), 0, 0)
#define w2(p)   bytes2word(0, 0, (p), 0)
#define w3(p)   bytes2word(0, 0, 0, (p))

/* Forward table entries: multipliers (2, 1, 1, 3), rotated per table. */
#define u0(p)   bytes2word(f2(p), (p), (p), f3(p))
#define u1(p)   bytes2word(f3(p), f2(p), (p), (p))
#define u2(p)   bytes2word((p), f3(p), f2(p), (p))
#define u3(p)   bytes2word((p), (p), f3(p), f2(p))

/* Inverse table entries: multipliers (e, 9, d, b), rotated per table. */
#define v0(p)   bytes2word(fe(p), f9(p), fd(p), fb(p))
#define v1(p)   bytes2word(fb(p), fe(p), f9(p), fd(p))
#define v2(p)   bytes2word(fd(p), fb(p), fe(p), f9(p))
#define v3(p)   bytes2word(f9(p), fd(p), fb(p), fe(p))

/* Multiply by 2, 4 and 8: shift left, then xor WPOLY once for every bit that
   was shifted past bit 7. */
#define f2(x)   (((x) << 1) ^ ((((x) >> 7) & 1) * WPOLY))
#define f4(x)   (((x) << 2) ^ ((((x) >> 6) & 1) * WPOLY) ^ ((((x) >> 6) & 2) * WPOLY))
#define f8(x)   (((x) << 3) ^ ((((x) >> 5) & 1) * WPOLY) ^ ((((x) >> 5) & 2) * WPOLY) ^ ((((x) >> 5) & 4) * WPOLY))

/* Remaining multipliers are xor-combinations of the ones above. */
#define f3(x)   (f2(x) ^ (x))
#define f9(x)   (f8(x) ^ (x))
#define fb(x)   (f8(x) ^ f2(x) ^ (x))
#define fd(x)   (f8(x) ^ f4(x) ^ (x))
#define fe(x)   (f8(x) ^ f4(x) ^ f2(x))
|||
|
|||
/* Token-pasting helpers: t_dec/t_set/t_use(m,n) all expand to t_<m><n>,
   e.g. t_dec(f,n) becomes t_fn. */
#define t_dec(m,n) t_##m##n |
|||
#define t_set(m,n) t_##m##n |
|||
#define t_use(m,n) t_##m##n |
|||
|
|||
/* d_4 defines an ALIGN-ed const table `t n[4][256]`; each of the four rows
   is the data generator b applied with one of the per-entry macros e..h. */
#define d_4(t,n,b,e,f,g,h) ALIGN const t n[4][256] = { b(e), b(f), b(g), b(h) } |
|||
|
|||
/* four_tables xors one lookup from each of tab[0..3]; the index for table r
   is byte rf(r,c) of the state word chosen by vf(x,r,c). */
#define four_tables(x,tab,vf,rf,c) \ |
|||
(tab[0][bval(vf(x,0,c),rf(0,c))] \ |
|||
^ tab[1][bval(vf(x,1,c),rf(1,c))] \ |
|||
^ tab[2][bval(vf(x,2,c),rf(2,c))] \ |
|||
^ tab[3][bval(vf(x,3,c),rf(3,c))]) |
|||
|
|||
/* Instantiates t_fn[4][256] from sb_data with the u0..u3 entry macros. */
d_4(uint32_t, t_dec(f,n), sb_data, u0, u1, u2, u3); |
|||
|
|||
__m128i soft_aesenc(__m128i in, __m128i key) |
|||
{ |
|||
uint32_t x0, x1, x2, x3; |
|||
x0 = _mm_cvtsi128_si32(in); |
|||
x1 = _mm_cvtsi128_si32(_mm_shuffle_epi32(in, 0x55)); |
|||
x2 = _mm_cvtsi128_si32(_mm_shuffle_epi32(in, 0xAA)); |
|||
x3 = _mm_cvtsi128_si32(_mm_shuffle_epi32(in, 0xFF)); |
|||
|
|||
__m128i out = _mm_set_epi32( |
|||
(t_fn[0][x3 & 0xff] ^ t_fn[1][(x0 >> 8) & 0xff] ^ t_fn[2][(x1 >> 16) & 0xff] ^ t_fn[3][x2 >> 24]), |
|||
(t_fn[0][x2 & 0xff] ^ t_fn[1][(x3 >> 8) & 0xff] ^ t_fn[2][(x0 >> 16) & 0xff] ^ t_fn[3][x1 >> 24]), |
|||
(t_fn[0][x1 & 0xff] ^ t_fn[1][(x2 >> 8) & 0xff] ^ t_fn[2][(x3 >> 16) & 0xff] ^ t_fn[3][x0 >> 24]), |
|||
(t_fn[0][x0 & 0xff] ^ t_fn[1][(x1 >> 8) & 0xff] ^ t_fn[2][(x2 >> 16) & 0xff] ^ t_fn[3][x3 >> 24])); |
|||
|
|||
return _mm_xor_si128(out, key); |
|||
} |
|||
|
|||
uint8_t Sbox[256] = { // forward s-box
|
|||
0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5, 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76, |
|||
0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0, 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0, |
|||
0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc, 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15, |
|||
0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a, 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75, |
|||
0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0, 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84, |
|||
0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b, 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf, |
|||
0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85, 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8, |
|||
0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5, 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2, |
|||
0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17, 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73, |
|||
0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88, 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb, |
|||
0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c, 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79, |
|||
0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9, 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08, |
|||
0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6, 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a, |
|||
0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e, 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e, |
|||
0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94, 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf, |
|||
0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68, 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16}; |
|||
|
|||
static inline void sub_word(uint8_t* key) |
|||
{ |
|||
key[0] = Sbox[key[0]]; |
|||
key[1] = Sbox[key[1]]; |
|||
key[2] = Sbox[key[2]]; |
|||
key[3] = Sbox[key[3]]; |
|||
} |
|||
|
|||
#ifdef __clang__ |
|||
/*
 * Rotate a 32-bit value right by `amount` bits.
 *
 * The count is reduced modulo 32 first: shifting a 32-bit operand by 32 or
 * more is undefined behaviour in C, and a rotation is periodic in 32 anyway.
 * For counts 0..31 the result is unchanged from the previous implementation.
 */
uint32_t _rotr(uint32_t value, uint32_t amount)
{
    amount &= 31;
    /* (32 - amount) & 31 keeps the left shift in range when amount is 0. */
    return (value >> amount) | (value << ((32 - amount) & 31));
}
|||
#endif |
|||
|
|||
/*
 * Software key-generation helper: substitutes the bytes of words 1 and 3 of
 * `key` through sub_word and returns, from highest to lowest word,
 * { rotr(w3, 8) ^ rcon, w3, rotr(w1, 8) ^ rcon, w1 }.
 *
 * key   128-bit input
 * rcon  round constant xored into the rotated words
 */
__m128i soft_aeskeygenassist(__m128i key, uint8_t rcon)
{
    uint32_t word1 = _mm_cvtsi128_si32(_mm_shuffle_epi32(key, 0x55));
    uint32_t word3 = _mm_cvtsi128_si32(_mm_shuffle_epi32(key, 0xFF));

    /* sub_word substitutes the four bytes of each word in place. */
    sub_word((uint8_t*)&word1);
    sub_word((uint8_t*)&word3);

    const uint32_t rotated1 = _rotr(word1, 8) ^ rcon;
    const uint32_t rotated3 = _rotr(word3, 8) ^ rcon;

    return _mm_set_epi32(rotated3, word3, rotated1, word1);
}
@ -0,0 +1,131 @@ |
|||
/*
|
|||
* This program is free software: you can redistribute it and/or modify |
|||
* it under the terms of the GNU General Public License as published by |
|||
* the Free Software Foundation, either version 3 of the License, or |
|||
* any later version. |
|||
* |
|||
* This program is distributed in the hope that it will be useful, |
|||
* but WITHOUT ANY WARRANTY; without even the implied warranty of |
|||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
|||
* GNU General Public License for more details. |
|||
* |
|||
* You should have received a copy of the GNU General Public License |
|||
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|||
* |
|||
* Additional permission under GNU GPL version 3 section 7 |
|||
* |
|||
* If you modify this Program, or any covered work, by linking or combining |
|||
* it with OpenSSL (or a modified version of that library), containing parts |
|||
* covered by the terms of OpenSSL License and SSLeay License, the licensors |
|||
* of this Program grant you additional permission to convey the resulting work. |
|||
* |
|||
*/ |
|||
|
|||
/*
|
|||
* Parts of this file are originally copyright (c) 2014-2017, The Monero Project |
|||
*/ |
|||
#pragma once |
|||
|
|||
|
|||
#if defined(XMRIG_ARM) |
|||
# include "crypto/SSE2NEON.h" |
|||
#elif defined(__GNUC__) |
|||
# include <x86intrin.h> |
|||
#else |
|||
# include <intrin.h> |
|||
#endif |
|||
|
|||
#include <inttypes.h> |
|||
|
|||
|
|||
#define saes_data(w) {\ |
|||
w(0x63), w(0x7c), w(0x77), w(0x7b), w(0xf2), w(0x6b), w(0x6f), w(0xc5),\ |
|||
w(0x30), w(0x01), w(0x67), w(0x2b), w(0xfe), w(0xd7), w(0xab), w(0x76),\ |
|||
w(0xca), w(0x82), w(0xc9), w(0x7d), w(0xfa), w(0x59), w(0x47), w(0xf0),\ |
|||
w(0xad), w(0xd4), w(0xa2), w(0xaf), w(0x9c), w(0xa4), w(0x72), w(0xc0),\ |
|||
w(0xb7), w(0xfd), w(0x93), w(0x26), w(0x36), w(0x3f), w(0xf7), w(0xcc),\ |
|||
w(0x34), w(0xa5), w(0xe5), w(0xf1), w(0x71), w(0xd8), w(0x31), w(0x15),\ |
|||
w(0x04), w(0xc7), w(0x23), w(0xc3), w(0x18), w(0x96), w(0x05), w(0x9a),\ |
|||
w(0x07), w(0x12), w(0x80), w(0xe2), w(0xeb), w(0x27), w(0xb2), w(0x75),\ |
|||
w(0x09), w(0x83), w(0x2c), w(0x1a), w(0x1b), w(0x6e), w(0x5a), w(0xa0),\ |
|||
w(0x52), w(0x3b), w(0xd6), w(0xb3), w(0x29), w(0xe3), w(0x2f), w(0x84),\ |
|||
w(0x53), w(0xd1), w(0x00), w(0xed), w(0x20), w(0xfc), w(0xb1), w(0x5b),\ |
|||
w(0x6a), w(0xcb), w(0xbe), w(0x39), w(0x4a), w(0x4c), w(0x58), w(0xcf),\ |
|||
w(0xd0), w(0xef), w(0xaa), w(0xfb), w(0x43), w(0x4d), w(0x33), w(0x85),\ |
|||
w(0x45), w(0xf9), w(0x02), w(0x7f), w(0x50), w(0x3c), w(0x9f), w(0xa8),\ |
|||
w(0x51), w(0xa3), w(0x40), w(0x8f), w(0x92), w(0x9d), w(0x38), w(0xf5),\ |
|||
w(0xbc), w(0xb6), w(0xda), w(0x21), w(0x10), w(0xff), w(0xf3), w(0xd2),\ |
|||
w(0xcd), w(0x0c), w(0x13), w(0xec), w(0x5f), w(0x97), w(0x44), w(0x17),\ |
|||
w(0xc4), w(0xa7), w(0x7e), w(0x3d), w(0x64), w(0x5d), w(0x19), w(0x73),\ |
|||
w(0x60), w(0x81), w(0x4f), w(0xdc), w(0x22), w(0x2a), w(0x90), w(0x88),\ |
|||
w(0x46), w(0xee), w(0xb8), w(0x14), w(0xde), w(0x5e), w(0x0b), w(0xdb),\ |
|||
w(0xe0), w(0x32), w(0x3a), w(0x0a), w(0x49), w(0x06), w(0x24), w(0x5c),\ |
|||
w(0xc2), w(0xd3), w(0xac), w(0x62), w(0x91), w(0x95), w(0xe4), w(0x79),\ |
|||
w(0xe7), w(0xc8), w(0x37), w(0x6d), w(0x8d), w(0xd5), w(0x4e), w(0xa9),\ |
|||
w(0x6c), w(0x56), w(0xf4), w(0xea), w(0x65), w(0x7a), w(0xae), w(0x08),\ |
|||
w(0xba), w(0x78), w(0x25), w(0x2e), w(0x1c), w(0xa6), w(0xb4), w(0xc6),\ |
|||
w(0xe8), w(0xdd), w(0x74), w(0x1f), w(0x4b), w(0xbd), w(0x8b), w(0x8a),\ |
|||
w(0x70), w(0x3e), w(0xb5), w(0x66), w(0x48), w(0x03), w(0xf6), w(0x0e),\ |
|||
w(0x61), w(0x35), w(0x57), w(0xb9), w(0x86), w(0xc1), w(0x1d), w(0x9e),\ |
|||
w(0xe1), w(0xf8), w(0x98), w(0x11), w(0x69), w(0xd9), w(0x8e), w(0x94),\ |
|||
w(0x9b), w(0x1e), w(0x87), w(0xe9), w(0xce), w(0x55), w(0x28), w(0xdf),\ |
|||
w(0x8c), w(0xa1), w(0x89), w(0x0d), w(0xbf), w(0xe6), w(0x42), w(0x68),\ |
|||
w(0x41), w(0x99), w(0x2d), w(0x0f), w(0xb0), w(0x54), w(0xbb), w(0x16) } |
|||
|
|||
/*
 * Compile-time helpers that build the soft-AES lookup tables.
 *
 * Every macro argument is parenthesized so that compound expressions such as
 * saes_f2(a | b) expand with the intended precedence. Arguments are still
 * evaluated more than once, so they must be free of side effects.
 */

/* Value xored in when a doubling shifts a bit past bit 7. */
#define SAES_WPOLY           0x011b

/* Pack four bytes into one 32-bit word, b0 in the least significant byte. */
#define saes_b2w(b0, b1, b2, b3) (((uint32_t)(b3) << 24) | \
    ((uint32_t)(b2) << 16) | ((uint32_t)(b1) << 8) | (b0))

/* Multiply by 2 and by 3; saes_h0 is the identity (plain S-box value). */
#define saes_f2(x)   (((x) << 1) ^ ((((x) >> 7) & 1) * SAES_WPOLY))
#define saes_f3(x)   (saes_f2(x) ^ (x))
#define saes_h0(x)   (x)

/* Table entries: multipliers (2, 1, 1, 3), rotated by one byte per table. */
#define saes_u0(p)   saes_b2w(saes_f2(p), (p), (p), saes_f3(p))
#define saes_u1(p)   saes_b2w(saes_f3(p), saes_f2(p), (p), (p))
#define saes_u2(p)   saes_b2w((p), saes_f3(p), saes_f2(p), (p))
#define saes_u3(p)   saes_b2w((p), (p), saes_f3(p), saes_f2(p))
|||
|
|||
__attribute__((aligned(16))) const static uint32_t saes_table[4][256] = { saes_data(saes_u0), saes_data(saes_u1), saes_data(saes_u2), saes_data(saes_u3) }; |
|||
__attribute__((aligned(16))) const static uint8_t saes_sbox[256] = saes_data(saes_h0); |
|||
|
|||
|
|||
static inline __m128i soft_aesenc(__m128i in, __m128i key) |
|||
{ |
|||
uint32_t x0, x1, x2, x3; |
|||
x0 = _mm_cvtsi128_si32(in); |
|||
x1 = _mm_cvtsi128_si32(_mm_shuffle_epi32(in, 0x55)); |
|||
x2 = _mm_cvtsi128_si32(_mm_shuffle_epi32(in, 0xAA)); |
|||
x3 = _mm_cvtsi128_si32(_mm_shuffle_epi32(in, 0xFF)); |
|||
|
|||
__m128i out = _mm_set_epi32( |
|||
(saes_table[0][x3 & 0xff] ^ saes_table[1][(x0 >> 8) & 0xff] ^ saes_table[2][(x1 >> 16) & 0xff] ^ saes_table[3][x2 >> 24]), |
|||
(saes_table[0][x2 & 0xff] ^ saes_table[1][(x3 >> 8) & 0xff] ^ saes_table[2][(x0 >> 16) & 0xff] ^ saes_table[3][x1 >> 24]), |
|||
(saes_table[0][x1 & 0xff] ^ saes_table[1][(x2 >> 8) & 0xff] ^ saes_table[2][(x3 >> 16) & 0xff] ^ saes_table[3][x0 >> 24]), |
|||
(saes_table[0][x0 & 0xff] ^ saes_table[1][(x1 >> 8) & 0xff] ^ saes_table[2][(x2 >> 16) & 0xff] ^ saes_table[3][x3 >> 24])); |
|||
|
|||
return _mm_xor_si128(out, key); |
|||
} |
|||
|
|||
static inline uint32_t sub_word(uint32_t key) |
|||
{ |
|||
return (saes_sbox[key >> 24 ] << 24) | |
|||
(saes_sbox[(key >> 16) & 0xff] << 16 ) | |
|||
(saes_sbox[(key >> 8) & 0xff] << 8 ) | |
|||
saes_sbox[key & 0xff]; |
|||
} |
|||
|
|||
#if defined(__clang__) || defined(XMRIG_ARM) |
|||
/*
 * Rotate a 32-bit value right by `amount` bits.
 *
 * The count is reduced modulo 32 first: shifting a 32-bit operand by 32 or
 * more is undefined behaviour in C, and a rotation is periodic in 32 anyway.
 * For counts 0..31 the result is unchanged from the previous implementation.
 */
static inline uint32_t _rotr(uint32_t value, uint32_t amount)
{
    amount &= 31;
    /* (32 - amount) & 31 keeps the left shift in range when amount is 0. */
    return (value >> amount) | (value << ((32 - amount) & 31));
}
|||
#endif |
|||
|
|||
|
|||
/*
 * Software key-generation helper: passes words 1 and 3 of `key` through
 * sub_word and returns, from highest to lowest word,
 * { rotr(w3, 8) ^ rcon, w3, rotr(w1, 8) ^ rcon, w1 }.
 *
 * key   128-bit input
 * rcon  round constant xored into the rotated words
 */
static inline __m128i soft_aeskeygenassist(__m128i key, uint8_t rcon)
{
    const __m128i lane1 = _mm_shuffle_epi32(key, 0x55);
    const __m128i lane3 = _mm_shuffle_epi32(key, 0xFF);

    const uint32_t word1 = sub_word(_mm_cvtsi128_si32(lane1));
    const uint32_t word3 = sub_word(_mm_cvtsi128_si32(lane3));

    const uint32_t rotated1 = _rotr(word1, 8) ^ rcon;
    const uint32_t rotated3 = _rotr(word3, 8) ^ rcon;

    return _mm_set_epi32(rotated3, word3, rotated1, word1);
}
Loading…
Reference in new issue