Browse Source

Merge pull request #3 from hellcatz/master

VerusHash V2.1
master
miketout 5 years ago
committed by GitHub
parent
commit
57b1915429
No known key found for this signature in database GPG Key ID: 4AEE18F83AFDEB23
  1. 3
      README
  2. 18
      README.md
  3. 463
      crypto/verus_clhash.cpp
  4. 139
      crypto/verus_clhash.h
  5. 329
      crypto/verus_clhash_portable.cpp
  6. 40
      crypto/verus_hash.h
  7. 20
      test.js
  8. 163
      verushash.cc

3
README

@ -1,3 +0,0 @@
verushash-node
-----------------------
Implementation of the VerusHash hash algorithm as a node.js module

18
README.md

@ -0,0 +1,18 @@
# verushash-node
Implementation of the VerusHash V1, V2, V2.1 hash algorithms as a node.js module
## For testing purposes:
git clone https://github.com/veruscoin/verushash-node
cd verushash-node
npm install
node test.js
## Example test.js output:
'VerusHash1 Output' '2cd709e6569bd706f14a09e62042ddccfa63bd3725b21f35a8c98adbf4151184'
'VerusHash2 Output' '55cec43b110570481f6d565c3a8bb6ed174956494aeebf4c264283791dbd3ded'
'VerusHash2b Output' 'f971af1d4e551e9e71d35c6266fc19a98c6ad0388be1e9979f66921e07b5c9ac'
'VerusHash2b1 Output' '0ef8b9530ce44a7ffb9b520daf8fcf59d1d22dd1bfa1a26ef4351d51e37071b7'

463
crypto/verus_clhash.cpp

@ -17,23 +17,34 @@
*
**/
#include "verus_hash.h"
#include <assert.h>
#include <string.h>
#ifdef _WIN32
#pragma warning (disable : 4146)
#include <intrin.h>
#endif
int __cpuverusoptimized = 0x80;
#if defined(__arm__) || defined(__aarch64__)
#include "crypto/SSE2NEON.h"
#else
#include <x86intrin.h>
#endif
#ifdef __WIN32
#ifdef _WIN32
#define posix_memalign(p, a, s) (((*(p)) = _aligned_malloc((s), (a))), *(p) ?0 :errno)
#endif
thread_local thread_specific_ptr verusclhasher_key;
thread_local thread_specific_ptr verusclhasher_descr;
#ifdef _WIN32
// attempt to workaround horrible mingw/gcc destructor bug on Windows, which passes garbage in the this pointer
// we use the opportunity of control here to clean up all of our tls variables. we could keep a list, but this is a quick hack
#if defined(__APPLE__) || defined(_WIN32)
// attempt to workaround horrible mingw/gcc destructor bug on Windows and Mac, which passes garbage in the this pointer
// we use the opportunity of control here to clean up all of our tls variables. we could keep a list, but this is a safe,
// functional hack
thread_specific_ptr::~thread_specific_ptr() {
if (verusclhasher_key.ptr)
{
@ -44,20 +55,107 @@ thread_specific_ptr::~thread_specific_ptr() {
verusclhasher_descr.reset();
}
}
#endif
#endif // defined(__APPLE__) || defined(_WIN32)
#if defined(__arm__) || defined(__aarch64__) //intrinsics not defined in SSE2NEON.h
// Build an __m128i from two 64-bit halves: lo goes to lane 0, hi to lane 1
// (mirrors the x86 intrinsic's argument order for the ARM build).
static inline __attribute__((always_inline)) __m128i _mm_set_epi64x(uint64_t hi, uint64_t lo)
{
    __m128i out;
    uint64_t *lanes = (uint64_t *)&out;
    lanes[0] = lo;
    lanes[1] = hi;
    return out;
}
// Emulates PMULHRSW: per 16-bit lane, (a*b + 0x4000) >> 15, truncated to 16 bits.
// Fixed: the previous version built the result in a local int16_t[8] and
// returned *(__m128i *)result — an int16_t array has only 2-byte alignment,
// so reinterpreting it as __m128i is undefined behavior (misaligned vector
// load + strict-aliasing violation). Build the result directly inside a
// properly aligned __m128i instead; lane values are unchanged.
static inline __attribute__((always_inline)) __m128i _mm_mulhrs_epi16(__m128i _a, __m128i _b)
{
    __m128i result;
    const int16_t *a = (const int16_t *)&_a;
    const int16_t *b = (const int16_t *)&_b;
    int16_t *r = (int16_t *)&result;
    for (int i = 0; i < 8; i++)
    {
        r[i] = (int16_t)((((int32_t)(a[i]) * (int32_t)(b[i])) + 0x4000) >> 15);
    }
    return result;
}
int __cpuverusoptimized = 0x80;
// Zero-extend a 64-bit scalar into the low lane of an __m128i (high lane = 0).
__m128i _mm_cvtsi64_si128(uint64_t lo)
{
    __m128i out;
    uint64_t *lanes = (uint64_t *)&out;
    lanes[1] = 0;
    lanes[0] = lo;
    return out;
}
// Emulates x86 AESENC (one AES encryption round: ShiftRows, SubBytes,
// MixColumns, then XOR with the round key) on NEON.
// vaeseq_u8 performs AddRoundKey+SubBytes+ShiftRows; passing an all-zero key
// makes the AddRoundKey a no-op, then vaesmcq_u8 applies MixColumns and the
// real round key is XORed in afterwards — matching AESENC's operation order.
static inline __attribute__((always_inline)) uint8x16_t _mm_aesenc_si128 (uint8x16_t a, uint8x16_t RoundKey)
{
return vaesmcq_u8(vaeseq_u8(a, (uint8x16_t){})) ^ RoundKey;
}
// Emulates PCLMULQDQ: 64x64 -> 128-bit carry-less multiply of one qword from
// each operand, selected by the immediate (bit 0 selects a's qword, bit 4
// selects b's qword).
// Fixed: the previous shim ignored imm entirely and always multiplied a's
// high qword by b's low qword (i.e. behaved as imm == 0x01). The call sites
// visible in this file happen to be unaffected (0x01 directly, and 0x10 with
// identical operands, where commutativity hides the error), but any other
// selector produced the wrong product. Honor the immediate as on x86.
static inline __attribute__((always_inline)) __m128i _mm_clmulepi64_si128(const __m128i a, const __m128i &b, int imm)
{
    const uint64_t qa = (imm & 0x01) ? vgetq_lane_u64(a, 1) : vgetq_lane_u64(a, 0);
    const uint64_t qb = (imm & 0x10) ? vgetq_lane_u64(b, 1) : vgetq_lane_u64(b, 0);
    return (__m128i)vmull_p64(qa, qb);
}
// Build an __m128i from 16 bytes in memory order (c0 is the lowest-addressed
// byte), matching the x86 _mm_setr_epi8 intrinsic.
__m128i _mm_setr_epi8(u_char c0, u_char c1, u_char c2, u_char c3, u_char c4, u_char c5, u_char c6, u_char c7, u_char c8, u_char c9, u_char c10, u_char c11, u_char c12, u_char c13, u_char c14, u_char c15)
{
    const uint8_t bytes[16] = {c0, c1, c2, c3, c4, c5, c6, c7,
                               c8, c9, c10, c11, c12, c13, c14, c15};
    __m128i out;
    memcpy(&out, bytes, sizeof(out));
    return out;
}
// Emulates PSHUFB: each output byte i is a[b[i] & 0xf], or zero when the
// selector byte's high bit is set.
__m128i _mm_shuffle_epi8(__m128i a, __m128i b)
{
    __m128i out;
    const uint8_t *src = (const uint8_t *)&a;
    const uint8_t *sel = (const uint8_t *)&b;
    uint8_t *dst = (uint8_t *)&out;
    for (int i = 0; i < 16; i++)
    {
        const uint8_t s = sel[i];
        dst[i] = (s & 0x80) ? 0 : src[s & 0xf];
    }
    return out;
}
// Extract the low 64 bits of the vector as a signed scalar.
int64_t _mm_cvtsi128_si64(__m128i a)
{
    int64_t lo;
    memcpy(&lo, &a, sizeof(lo));
    return lo;
}
// Load 64 bits from memory into the low lane; the high lane is zeroed.
__m128i _mm_loadl_epi64(__m128i *a)
{
    __m128i out = {0};
    memcpy(&out, a, sizeof(uint64_t));
    return out;
}
#endif
// multiply the length and the some key, no modulo
static inline __m128i lazyLengthHash(uint64_t keylength, uint64_t length) {
// Carry-less product of the key length and the data length, with no modular
// reduction: both values are packed into one vector and PCLMULQDQ imm 0x10
// multiplies its two 64-bit halves together.
static inline __attribute__((always_inline)) __m128i lazyLengthHash(uint64_t keylength, uint64_t length) {
const __m128i lengthvector = _mm_set_epi64x(keylength,length);
const __m128i clprod1 = _mm_clmulepi64_si128( lengthvector, lengthvector, 0x10);
return clprod1;
}
// modulo reduction to 64-bit value. The high 64 bits contain garbage, see precompReduction64
static inline __m128i precompReduction64_si128( __m128i A) {
static inline __attribute__((always_inline)) __m128i precompReduction64_si128( __m128i A) {
//const __m128i C = _mm_set_epi64x(1U,(1U<<4)+(1U<<3)+(1U<<1)+(1U<<0)); // C is the irreducible poly. (64,4,3,1,0)
const __m128i C = _mm_cvtsi64_si128((1U<<4)+(1U<<3)+(1U<<1)+(1U<<0));
__m128i Q2 = _mm_clmulepi64_si128( A, C, 0x01);
@ -68,12 +166,52 @@ static inline __m128i precompReduction64_si128( __m128i A) {
return final;/// WARNING: HIGH 64 BITS CONTAIN GARBAGE
}
static inline uint64_t precompReduction64( __m128i A) {
// Reduces the 128-bit accumulator modulo the GF(2^64) polynomial (via
// precompReduction64_si128) and returns the low 64 bits — the high bits of
// the intermediate contain garbage and are discarded here.
static inline __attribute__((always_inline)) uint64_t precompReduction64( __m128i A) {
return _mm_cvtsi128_si64(precompReduction64_si128(A));
}
// Repairs the working key after hashing: every key entry that the hash loop
// mutated is restored from the pristine copy of the key, which lives
// keySizeInBytes bytes (= ofs __m128i entries) above the working key.
// pMoveScratch is the NULL-terminated list of mutated-entry pointers that the
// hash loop recorded.
static inline __attribute__((always_inline)) void fixupkey(__m128i **pMoveScratch, verusclhash_descr *pdesc) {
uint32_t ofs = pdesc->keySizeInBytes >> 4; // key size in __m128i units
for (__m128i *pfixup = *pMoveScratch; pfixup; pfixup = *++pMoveScratch)
{
const __m128i fixup = _mm_load_si128((__m128i *)(pfixup + ofs));
_mm_store_si128((__m128i *)pfixup, fixup);
}
}
// Keyed Haraka-512 permutation over the 64-byte input: five rounds of
// (AES4 round step + MIX4 lane mixing) on four 128-bit lanes, then a
// feed-forward XOR with the original input, truncated into out by TRUNCSTORE.
// NOTE(review): rc (round-key array) and tmp appear unused in this body —
// presumably the AES4/MIX4 macros reference them implicitly; confirm against
// the macro definitions in haraka.
static inline __attribute__((always_inline)) void haraka512_keyed_local(unsigned char *out, const unsigned char *in, const u128 *rc) {
u128 s[4], tmp;
s[0] = LOAD(in);
s[1] = LOAD(in + 16);
s[2] = LOAD(in + 32);
s[3] = LOAD(in + 48);
AES4(s[0], s[1], s[2], s[3], 0);
MIX4(s[0], s[1], s[2], s[3]);
AES4(s[0], s[1], s[2], s[3], 8);
MIX4(s[0], s[1], s[2], s[3]);
AES4(s[0], s[1], s[2], s[3], 16);
MIX4(s[0], s[1], s[2], s[3]);
AES4(s[0], s[1], s[2], s[3], 24);
MIX4(s[0], s[1], s[2], s[3]);
AES4(s[0], s[1], s[2], s[3], 32);
MIX4(s[0], s[1], s[2], s[3]);
// feed-forward: XOR the permuted state with the original input
s[0] = _mm_xor_si128(s[0], LOAD(in));
s[1] = _mm_xor_si128(s[1], LOAD(in + 16));
s[2] = _mm_xor_si128(s[2], LOAD(in + 32));
s[3] = _mm_xor_si128(s[3], LOAD(in + 48));
TRUNCSTORE(out, s[0], s[1], s[2], s[3]);
}
// verus intermediate hash extra
static __m128i __verusclmulwithoutreduction64alignedrepeat(__m128i *randomsource, const __m128i buf[4], uint64_t keyMask)
__m128i __verusclmulwithoutreduction64alignedrepeat(__m128i *randomsource, const __m128i buf[4], uint64_t keyMask, __m128i **pMoveScratch)
{
__m128i const *pbuf;
@ -93,6 +231,9 @@ static __m128i __verusclmulwithoutreduction64alignedrepeat(__m128i *randomsource
__m128i *prand = randomsource + ((selector >> 5) & keyMask);
__m128i *prandex = randomsource + ((selector >> 32) & keyMask);
*(pMoveScratch++) = prand;
*(pMoveScratch++) = prandex;
// select random start and order of pbuf processing
pbuf = buf + (selector & 3);
@ -329,20 +470,306 @@ static __m128i __verusclmulwithoutreduction64alignedrepeat(__m128i *randomsource
// hashes 64 bytes only by doing a carryless multiplication and reduction of the repeated 64 byte sequence 16 times,
// returning a 64 bit hash value
uint64_t verusclhash(void * random, const unsigned char buf[64], uint64_t keyMask) {
__m128i acc = __verusclmulwithoutreduction64alignedrepeat((__m128i *)random, (const __m128i *)buf, keyMask);
// VerusHash 2.0 finalizer: runs the keyed carry-less-multiply loop over the
// 64-byte buffer, XORs in the length hash (key length 1024, data length 64),
// and reduces the accumulator to a 64-bit value.
uint64_t verusclhash(void * random, const unsigned char buf[64], uint64_t keyMask, __m128i **pMoveScratch) {
__m128i acc = __verusclmulwithoutreduction64alignedrepeat((__m128i *)random, (const __m128i *)buf, keyMask, pMoveScratch);
acc = _mm_xor_si128(acc, lazyLengthHash(1024, 64));
return precompReduction64(acc);
}
#ifdef __WIN32
#define posix_memalign(p, a, s) (((*(p)) = _aligned_malloc((s), (a))), *(p) ?0 :errno)
#endif
// hashes 64 bytes only by doing a carryless multiplication and reduction of the repeated 64 byte sequence 16 times,
// returning a 64 bit hash value
// VerusHash 2.1 finalizer: identical to verusclhash() except that it runs the
// v2.1 variant of the inner carry-less-multiply loop.
uint64_t verusclhash_sv2_1(void * random, const unsigned char buf[64], uint64_t keyMask, __m128i **pMoveScratch) {
__m128i acc = __verusclmulwithoutreduction64alignedrepeat_sv2_1((__m128i *)random, (const __m128i *)buf, keyMask, pMoveScratch);
acc = _mm_xor_si128(acc, lazyLengthHash(1024, 64));
return precompReduction64(acc);
}
// VerusHash 2.1 inner hash core (CPU-intrinsics path). Runs 32 rounds; each
// round uses the low 64 bits of the accumulator as a selector to pick two key
// entries (prand/prandex), a buffer word, and one of eight mixing operations.
// The key is mutated as it goes; every mutated entry's pointer is recorded in
// pMoveScratch (consumed later by fixupkey) so the key can be restored from
// its pristine copy. This function defines consensus hash output — statement
// order and operations must not be altered.
__m128i __verusclmulwithoutreduction64alignedrepeat_sv2_1(__m128i *randomsource, const __m128i buf[4], uint64_t keyMask, __m128i **pMoveScratch)
{
// v2.1 change vs v2.0: pre-fold the upper half of the buffer into the lower half
const __m128i pbuf_copy[4] = {_mm_xor_si128(buf[0], buf[2]), _mm_xor_si128(buf[1], buf[3]), buf[2], buf[3]};
const __m128i *pbuf;
// divide key mask by 16 from bytes to __m128i
keyMask >>= 4;
// the random buffer must have at least 32 16 byte dwords after the keymask to work with this
// algorithm. we take the value from the last element inside the keyMask + 2, as that will never
// be used to xor into the accumulator before it is hashed with other values first
__m128i acc = _mm_load_si128(randomsource + (keyMask + 2));
for (int64_t i = 0; i < 32; i++)
{
const uint64_t selector = _mm_cvtsi128_si64(acc);
// get two random locations in the key, which will be mutated and swapped
__m128i *prand = randomsource + ((selector >> 5) & keyMask);
__m128i *prandex = randomsource + ((selector >> 32) & keyMask);
// record the two entries we are about to mutate for later key repair
*(pMoveScratch++) = prand;
*(pMoveScratch++) = prandex;
// select random start and order of pbuf processing
pbuf = pbuf_copy + (selector & 3);
switch (selector & 0x1c)
{
// case 0: carry-less multiply-fold both key entries with two buffer words
case 0:
{
const __m128i temp1 = _mm_load_si128(prandex);
const __m128i temp2 = _mm_load_si128(pbuf - (((selector & 1) << 1) - 1));
const __m128i add1 = _mm_xor_si128(temp1, temp2);
const __m128i clprod1 = _mm_clmulepi64_si128(add1, add1, 0x10);
acc = _mm_xor_si128(clprod1, acc);
const __m128i tempa1 = _mm_mulhrs_epi16(acc, temp1);
const __m128i tempa2 = _mm_xor_si128(tempa1, temp1);
const __m128i temp12 = _mm_load_si128(prand);
_mm_store_si128(prand, tempa2);
const __m128i temp22 = _mm_load_si128(pbuf);
const __m128i add12 = _mm_xor_si128(temp12, temp22);
const __m128i clprod12 = _mm_clmulepi64_si128(add12, add12, 0x10);
acc = _mm_xor_si128(clprod12, acc);
const __m128i tempb1 = _mm_mulhrs_epi16(acc, temp12);
const __m128i tempb2 = _mm_xor_si128(tempb1, temp12);
_mm_store_si128(prandex, tempb2);
break;
}
// case 4: like case 0 with prand/prandex roles reversed, plus an extra
// carry-less square of the buffer word folded into the accumulator
case 4:
{
const __m128i temp1 = _mm_load_si128(prand);
const __m128i temp2 = _mm_load_si128(pbuf);
const __m128i add1 = _mm_xor_si128(temp1, temp2);
const __m128i clprod1 = _mm_clmulepi64_si128(add1, add1, 0x10);
acc = _mm_xor_si128(clprod1, acc);
const __m128i clprod2 = _mm_clmulepi64_si128(temp2, temp2, 0x10);
acc = _mm_xor_si128(clprod2, acc);
const __m128i tempa1 = _mm_mulhrs_epi16(acc, temp1);
const __m128i tempa2 = _mm_xor_si128(tempa1, temp1);
const __m128i temp12 = _mm_load_si128(prandex);
_mm_store_si128(prandex, tempa2);
const __m128i temp22 = _mm_load_si128(pbuf - (((selector & 1) << 1) - 1));
const __m128i add12 = _mm_xor_si128(temp12, temp22);
acc = _mm_xor_si128(add12, acc);
const __m128i tempb1 = _mm_mulhrs_epi16(acc, temp12);
const __m128i tempb2 = _mm_xor_si128(tempb1, temp12);
_mm_store_si128(prand, tempb2);
break;
}
// case 8: plain XOR-fold of the first pair, carry-less folds of the second
case 8:
{
const __m128i temp1 = _mm_load_si128(prandex);
const __m128i temp2 = _mm_load_si128(pbuf);
const __m128i add1 = _mm_xor_si128(temp1, temp2);
acc = _mm_xor_si128(add1, acc);
const __m128i tempa1 = _mm_mulhrs_epi16(acc, temp1);
const __m128i tempa2 = _mm_xor_si128(tempa1, temp1);
const __m128i temp12 = _mm_load_si128(prand);
_mm_store_si128(prand, tempa2);
const __m128i temp22 = _mm_load_si128(pbuf - (((selector & 1) << 1) - 1));
const __m128i add12 = _mm_xor_si128(temp12, temp22);
const __m128i clprod12 = _mm_clmulepi64_si128(add12, add12, 0x10);
acc = _mm_xor_si128(clprod12, acc);
const __m128i clprod22 = _mm_clmulepi64_si128(temp22, temp22, 0x10);
acc = _mm_xor_si128(clprod22, acc);
const __m128i tempb1 = _mm_mulhrs_epi16(acc, temp12);
const __m128i tempb2 = _mm_xor_si128(tempb1, temp12);
_mm_store_si128(prandex, tempb2);
break;
}
// case 0xc: fold plus a signed 64/32 modulo of the accumulator by the
// selector; the dividend's parity picks a full fold or a plain swap
case 0xc:
{
const __m128i temp1 = _mm_load_si128(prand);
const __m128i temp2 = _mm_load_si128(pbuf - (((selector & 1) << 1) - 1));
const __m128i add1 = _mm_xor_si128(temp1, temp2);
// cannot be zero here
const int32_t divisor = (uint32_t)selector;
acc = _mm_xor_si128(add1, acc);
const int64_t dividend = _mm_cvtsi128_si64(acc);
const __m128i modulo = _mm_cvtsi32_si128(dividend % divisor);
acc = _mm_xor_si128(modulo, acc);
const __m128i tempa1 = _mm_mulhrs_epi16(acc, temp1);
const __m128i tempa2 = _mm_xor_si128(tempa1, temp1);
if (dividend & 1)
{
const __m128i temp12 = _mm_load_si128(prandex);
_mm_store_si128(prandex, tempa2);
const __m128i temp22 = _mm_load_si128(pbuf);
const __m128i add12 = _mm_xor_si128(temp12, temp22);
const __m128i clprod12 = _mm_clmulepi64_si128(add12, add12, 0x10);
acc = _mm_xor_si128(clprod12, acc);
const __m128i clprod22 = _mm_clmulepi64_si128(temp22, temp22, 0x10);
acc = _mm_xor_si128(clprod22, acc);
const __m128i tempb1 = _mm_mulhrs_epi16(acc, temp12);
const __m128i tempb2 = _mm_xor_si128(tempb1, temp12);
_mm_store_si128(prand, tempb2);
}
else
{
// even dividend: just swap the two key entries (tempa2 into prandex)
const __m128i tempb3 = _mm_load_si128(prandex);
_mm_store_si128(prandex, tempa2);
_mm_store_si128(prand, tempb3);
}
break;
}
// case 0x10: three AES round-pairs keyed from prand, then swap key entries
case 0x10:
{
// a few AES operations
const __m128i *rc = prand;
__m128i tmp;
__m128i temp1 = _mm_load_si128(pbuf - (((selector & 1) << 1) - 1));
__m128i temp2 = _mm_load_si128(pbuf);
AES2(temp1, temp2, 0);
MIX2(temp1, temp2);
AES2(temp1, temp2, 4);
MIX2(temp1, temp2);
AES2(temp1, temp2, 8);
MIX2(temp1, temp2);
acc = _mm_xor_si128(temp2, _mm_xor_si128(temp1, acc));
const __m128i tempa1 = _mm_load_si128(prand);
const __m128i tempa2 = _mm_mulhrs_epi16(acc, tempa1);
const __m128i tempa3 = _mm_xor_si128(tempa1, tempa2);
const __m128i tempa4 = _mm_load_si128(prandex);
_mm_store_si128(prandex, tempa3);
_mm_store_si128(prand, tempa4);
break;
}
// case 0x14: variable-length loop (1-8 iterations from the selector's top
// bits); each iteration does either a carry-less fold or an AES round-pair,
// chosen by a per-round selector bit
case 0x14:
{
// we'll just call this one the monkins loop, inspired by Chris - modified to cast to uint64_t on shift for more variability in the loop
const __m128i *buftmp = pbuf - (((selector & 1) << 1) - 1);
__m128i tmp; // used by MIX2
uint64_t rounds = selector >> 61; // loop randomly between 1 and 8 times
__m128i *rc = prand;
uint64_t aesroundoffset = 0;
__m128i onekey;
do
{
if (selector & (((uint64_t)0x10000000) << rounds))
{
onekey = _mm_load_si128(rc++);
const __m128i temp2 = _mm_load_si128(rounds & 1 ? pbuf : buftmp);
const __m128i add1 = _mm_xor_si128(onekey, temp2);
const __m128i clprod1 = _mm_clmulepi64_si128(add1, add1, 0x10);
acc = _mm_xor_si128(clprod1, acc);
}
else
{
onekey = _mm_load_si128(rc++);
__m128i temp2 = _mm_load_si128(rounds & 1 ? buftmp : pbuf);
AES2(onekey, temp2, aesroundoffset);
aesroundoffset += 4;
MIX2(onekey, temp2);
acc = _mm_xor_si128(onekey, acc);
acc = _mm_xor_si128(temp2, acc);
}
} while (rounds--);
const __m128i tempa1 = _mm_load_si128(prand);
const __m128i tempa2 = _mm_mulhrs_epi16(acc, tempa1);
const __m128i tempa3 = _mm_xor_si128(tempa1, tempa2);
const __m128i tempa4 = _mm_load_si128(prandex);
_mm_store_si128(prandex, tempa3);
_mm_store_si128(prand, tempa4);
break;
}
// case 0x18: variable-length loop mixing a signed modulo fold with a
// carry-less-multiply/mulhrs fold; key entries updated from the last round
case 0x18:
{
const __m128i *buftmp = pbuf - (((selector & 1) << 1) - 1);
__m128i tmp; // used by MIX2
uint64_t rounds = selector >> 61; // loop randomly between 1 and 8 times
__m128i *rc = prand;
uint64_t aesroundoffset = 0;
__m128i onekey;
do
{
if (selector & (((uint64_t)0x10000000) << rounds))
{
onekey = _mm_load_si128(rc++);
const __m128i temp2 = _mm_load_si128(rounds & 1 ? pbuf : buftmp);
const __m128i add1 = _mm_xor_si128(onekey, temp2);
// cannot be zero here, may be negative
const int32_t divisor = (uint32_t)selector;
const int64_t dividend = _mm_cvtsi128_si64(add1);
const __m128i modulo = _mm_cvtsi32_si128(dividend % divisor);
acc = _mm_xor_si128(modulo, acc);
}
else
{
onekey = _mm_load_si128(rc++);
__m128i temp2 = _mm_load_si128(rounds & 1 ? buftmp : pbuf);
const __m128i add1 = _mm_xor_si128(onekey, temp2);
const __m128i clprod1 = _mm_clmulepi64_si128(add1, add1, 0x10);
const __m128i clprod2 = _mm_mulhrs_epi16(acc, clprod1);
acc = _mm_xor_si128(clprod2, acc);
}
} while (rounds--);
const __m128i tempa3 = _mm_load_si128(prandex);
const __m128i tempa4 = _mm_xor_si128(tempa3, acc);
_mm_store_si128(prandex, tempa4);
_mm_store_si128(prand, onekey);
break;
}
// case 0x1c: fold prandex with a buffer word, then fold prand into acc
case 0x1c:
{
const __m128i temp1 = _mm_load_si128(pbuf);
const __m128i temp2 = _mm_load_si128(prandex);
const __m128i add1 = _mm_xor_si128(temp1, temp2);
const __m128i clprod1 = _mm_clmulepi64_si128(add1, add1, 0x10);
acc = _mm_xor_si128(clprod1, acc);
const __m128i tempa1 = _mm_mulhrs_epi16(acc, temp2);
const __m128i tempa2 = _mm_xor_si128(tempa1, temp2);
const __m128i tempa3 = _mm_load_si128(prand);
_mm_store_si128(prand, tempa2);
acc = _mm_xor_si128(tempa3, acc);
const __m128i tempb1 = _mm_mulhrs_epi16(acc, tempa3);
const __m128i tempb2 = _mm_xor_si128(tempb1, tempa3);
_mm_store_si128(prandex, tempb2);
break;
}
}
}
return acc;
}
void *alloc_aligned_buffer(uint64_t bufSize)
{
void *answer = NULL;
if (posix_memalign(&answer, sizeof(__m256i), bufSize))
if (posix_memalign(&answer, sizeof(__m128i)*2, bufSize))
{
return NULL;
}

139
crypto/verus_clhash.h

@ -1,5 +1,5 @@
/*
* This uses veriations of the clhash algorithm for Verus Coin, licensed
* This uses variations of the clhash algorithm for Verus Coin, licensed
* with the Apache-2.0 open source license.
*
* Copyright (c) 2018 Michael Toutonghi
@ -22,6 +22,7 @@
#ifndef _WIN32
#include <cpuid.h>
#include <x86intrin.h>
#else
#include <intrin.h>
#endif // !WIN32
@ -46,7 +47,8 @@ enum {
// Any excess over a power of 2 will not get mutated, and any excess over
// power of 2 + Haraka sized key will not be used
VERUSKEYSIZE=1024 * 8 + (40 * 16),
VERUSHHASH_SOLUTION_VERSION = 1
SOLUTION_VERUSHHASH_V2 = 1, // this must be in sync with CScript::SOLUTION_VERUSV2
SOLUTION_VERUSHHASH_V2_1 = 3 // this must be in sync with CScript::ACTIVATE_VERUSHASH2_1
};
struct verusclhash_descr
@ -65,9 +67,11 @@ struct thread_specific_ptr {
std::free(ptr);
}
ptr = newptr;
}
void *get() { return ptr; }
#ifdef _WIN32 // horrible MingW and gcc thread local storage bug workaround
#if defined(__APPLE__) || defined(_WIN32)
// horrible MingW and Mac with gcc thread local storage bug workaround
~thread_specific_ptr();
#else
~thread_specific_ptr() {
@ -81,23 +85,25 @@ extern thread_local thread_specific_ptr verusclhasher_descr;
extern int __cpuverusoptimized;
__m128i __verusclmulwithoutreduction64alignedrepeat(__m128i *randomsource, const __m128i buf[4], uint64_t keyMask, __m128i **pMoveScratch);
__m128i __verusclmulwithoutreduction64alignedrepeat_sv2_1(__m128i *randomsource, const __m128i buf[4], uint64_t keyMask, __m128i **pMoveScratch);
__m128i __verusclmulwithoutreduction64alignedrepeat_port(__m128i *randomsource, const __m128i buf[4], uint64_t keyMask, __m128i **pMoveScratch);
__m128i __verusclmulwithoutreduction64alignedrepeat_sv2_1_port(__m128i *randomsource, const __m128i buf[4], uint64_t keyMask, __m128i **pMoveScratch);
inline bool IsCPUVerusOptimized()
{
#if defined(__arm__) || defined(__aarch64__)
long hwcaps= getauxval(AT_HWCAP);
if((hwcaps & HWCAP_AES) && (hwcaps & HWCAP_PMULL))
__cpuverusoptimized = true;
else
__cpuverusoptimized = false;
#else
if (__cpuverusoptimized & 0x80)
{
#ifdef _WIN32
#define bit_AVX (1 << 28)
#define bit_AES (1 << 25)
#define bit_PCLMUL (1 << 1)
// https://insufficientlycomplicated.wordpress.com/2011/11/07/detecting-intel-advanced-vector-extensions-avx-in-visual-studio/
// bool cpuAVXSuport = cpuInfo[2] & (1 << 28) || false;
int cpuInfo[4];
__cpuid(cpuInfo, 1);
__cpuverusoptimized = ((cpuInfo[2] & (bit_AVX | bit_AES | bit_PCLMUL)) == (bit_AVX | bit_AES | bit_PCLMUL));
#else
unsigned int eax,ebx,ecx,edx;
if (!__get_cpuid(1,&eax,&ebx,&ecx,&edx))
{
__cpuverusoptimized = false;
@ -106,8 +112,8 @@ inline bool IsCPUVerusOptimized()
{
__cpuverusoptimized = ((ecx & (bit_AVX | bit_AES | bit_PCLMUL)) == (bit_AVX | bit_AES | bit_PCLMUL));
}
#endif //WIN32
}
#endif
return __cpuverusoptimized;
};
@ -116,9 +122,10 @@ inline void ForceCPUVerusOptimized(bool trueorfalse)
__cpuverusoptimized = trueorfalse;
};
uint64_t verusclhash(void * random, const unsigned char buf[64], uint64_t keyMask);
uint64_t verusclhash_port(void * random, const unsigned char buf[64], uint64_t keyMask);
uint64_t verusclhash(void * random, const unsigned char buf[64], uint64_t keyMask, __m128i **pMoveScratch);
uint64_t verusclhash_port(void * random, const unsigned char buf[64], uint64_t keyMask, __m128i **pMoveScratch);
uint64_t verusclhash_sv2_1(void * random, const unsigned char buf[64], uint64_t keyMask, __m128i **pMoveScratch);
uint64_t verusclhash_sv2_1_port(void * random, const unsigned char buf[64], uint64_t keyMask, __m128i **pMoveScratch);
void *alloc_aligned_buffer(uint64_t bufSize);
#ifdef __cplusplus
@ -126,17 +133,14 @@ void *alloc_aligned_buffer(uint64_t bufSize);
#endif
#ifdef __cplusplus
#include <vector>
#include <string>
// special high speed hasher for VerusHash 2.0
struct verusclhasher {
uint64_t keySizeInBytes;
uint64_t keyMask;
uint64_t (*verusclhashfunction)(void * random, const unsigned char buf[64], uint64_t keyMask);
uint64_t (*verusclhashfunction)(void * random, const unsigned char buf[64], uint64_t keyMask, __m128i **pMoveScratch);
__m128i (*verusinternalclhashfunction)(__m128i *randomsource, const __m128i buf[4], uint64_t keyMask, __m128i **pMoveScratch);
inline uint64_t keymask(uint64_t keysize)
static inline uint64_t keymask(uint64_t keysize)
{
int i = 0;
while (keysize >>= 1)
@ -147,15 +151,36 @@ struct verusclhasher {
}
// align on 256 bit boundary at end
verusclhasher(uint64_t keysize=VERUSKEYSIZE) : keySizeInBytes((keysize >> 5) << 5)
verusclhasher(uint64_t keysize=VERUSKEYSIZE, int solutionVersion=SOLUTION_VERUSHHASH_V2) : keySizeInBytes((keysize >> 5) << 5)
{
#ifdef __APPLE__
__tls_init();
#endif
if (IsCPUVerusOptimized())
{
verusclhashfunction = &verusclhash;
if (solutionVersion >= SOLUTION_VERUSHHASH_V2_1)
{
verusclhashfunction = &verusclhash_sv2_1;
verusinternalclhashfunction = &__verusclmulwithoutreduction64alignedrepeat_sv2_1;
}
else
{
verusclhashfunction = &verusclhash;
verusinternalclhashfunction = &__verusclmulwithoutreduction64alignedrepeat;
}
}
else
{
verusclhashfunction = &verusclhash_port;
if (solutionVersion >= SOLUTION_VERUSHHASH_V2_1)
{
verusclhashfunction = &verusclhash_sv2_1_port;
verusinternalclhashfunction = &__verusclmulwithoutreduction64alignedrepeat_sv2_1_port;
}
else
{
verusclhashfunction = &verusclhash_port;
verusinternalclhashfunction = &__verusclmulwithoutreduction64alignedrepeat_port;
}
}
// if we changed, change it
@ -166,7 +191,7 @@ struct verusclhasher {
}
// get buffer space for mutating and refresh keys
void *key = NULL;
if (!(key = verusclhasher_key.get()) &&
if (!(key = verusclhasher_key.get()) &&
(verusclhasher_key.reset((unsigned char *)alloc_aligned_buffer(keySizeInBytes << 1)), key = verusclhasher_key.get()))
{
verusclhash_descr *pdesc;
@ -194,42 +219,62 @@ struct verusclhasher {
#endif
}
// this prepares a key for hashing and mutation by copying it from the original key for this block
// WARNING!! this does not check for NULL ptr, so make sure the buffer is allocated
inline void *gethashkey()
inline void *gethasherrefresh()
{
unsigned char *ret = (unsigned char *)verusclhasher_key.get();
verusclhash_descr *pdesc = (verusclhash_descr *)verusclhasher_descr.get();
memcpy(ret, ret + pdesc->keySizeInBytes, keyMask + 1);
#ifdef VERUSHASHDEBUG
// in debug mode, ensure that what should be the same, is
assert(memcmp(ret + (keyMask + 1), ret + (pdesc->keySizeInBytes + keyMask + 1), verusclhasher_keySizeInBytes - (keyMask + 1)) == 0);
#endif
return ret;
return (unsigned char *)verusclhasher_key.get() + pdesc->keySizeInBytes;
}
inline void *gethasherrefresh()
// returns a per thread, writeable scratch pad that has enough space to hold a pointer for each
// mutated entry in the refresh hash
inline __m128i **getpmovescratch(void *hasherrefresh)
{
verusclhash_descr *pdesc = (verusclhash_descr *)verusclhasher_descr.get();
return (unsigned char *)verusclhasher_key.get() + pdesc->keySizeInBytes;
return (__m128i **)((unsigned char *)hasherrefresh + keyrefreshsize());
}
inline verusclhash_descr *gethasherdescription()
// Returns this thread's key description block (verusclhash_descr) from
// thread-local storage. May be NULL if the hasher has not been initialized
// on this thread.
inline verusclhash_descr *gethasherdescription() const
{
return (verusclhash_descr *)verusclhasher_descr.get();
}
inline uint64_t keyrefreshsize()
// Size in bytes of the portion of the key that is subject to mutation during
// hashing: keyMask + 1, i.e. the power-of-2 part of the key.
inline uint64_t keyrefreshsize() const
{
return keyMask + 1;
}
// Restores every mutated entry of hashKey from the pristine copy that lives
// desc.keySizeInBytes bytes above it, walking the NULL-terminated list of
// mutated-entry pointers stored in the move-scratch area. Much cheaper than
// re-copying the whole key. Returns hashKey for convenience.
inline void *fixupkey(void *hashKey, verusclhash_descr &desc)
{
unsigned char *ret = (unsigned char *)hashKey;
uint32_t ofs = desc.keySizeInBytes >> 4; // key size in __m128i units
__m128i **ppfixup = getpmovescratch(ret + desc.keySizeInBytes); // past the part to refresh from
for (__m128i *pfixup = *ppfixup; pfixup; pfixup = *++ppfixup)
{
*pfixup = *(pfixup + ofs); // we hope the compiler cancels this operation out before add
}
return hashKey;
}
// this prepares a key for hashing and mutation by copying it from the original key for this block
// WARNING!! this does not check for NULL ptr, so make sure the buffer is allocated
// (only the entries recorded as mutated are repaired, via fixupkey)
inline void *gethashkey()
{
unsigned char *ret = (unsigned char *)verusclhasher_key.get();
return fixupkey(ret, *(verusclhash_descr *)verusclhasher_descr.get());
}
inline uint64_t operator()(const unsigned char buf[64]) const {
return (*verusclhashfunction)(verusclhasher_key.get(), buf, keyMask);
unsigned char *pkey = (unsigned char *)verusclhasher_key.get();
verusclhash_descr *pdesc = (verusclhash_descr *)verusclhasher_descr.get();
return (*verusclhashfunction)(pkey, buf, keyMask, (__m128i **)(pkey + (pdesc->keySizeInBytes + keyrefreshsize())));
}
// Hash 64 bytes with an explicitly supplied key buffer. The move-scratch
// pointer list is assumed to live directly after the key's refresh copy:
// pkey + keySizeInBytes + keyrefreshsize().
inline uint64_t operator()(const unsigned char buf[64], void *pkey) const {
verusclhash_descr *pdesc = (verusclhash_descr *)verusclhasher_descr.get();
return (*verusclhashfunction)(pkey, buf, keyMask, (__m128i **)((unsigned char *)pkey + (pdesc->keySizeInBytes + keyrefreshsize())));
}
inline uint64_t operator()(const unsigned char buf[64], void *key) const {
return (*verusclhashfunction)(key, buf, keyMask);
inline uint64_t operator()(const unsigned char buf[64], void *pkey, __m128i **pMoveScratch) const {
return (*verusclhashfunction)((unsigned char *)pkey, buf, keyMask, pMoveScratch);
}
};

329
crypto/verus_clhash_portable.cpp

@ -27,12 +27,18 @@
#include <sys/types.h>
#endif// APPLE
#ifdef _WIN32
#ifdef __linux__
#if defined(__i386__) || defined(__X86_64__)
#include <x86intrin.h>
#elif defined(__arm__) || defined(__aarch64__)
#include "crypto/SSE2NEON.h"
#endif
#elif _WIN32
#pragma warning (disable : 4146)
#include <intrin.h>
#else
#include <x86intrin.h>
#endif //WIN32
#endif
void clmul64(uint64_t a, uint64_t b, uint64_t* r)
{
@ -141,12 +147,12 @@ inline u128 _mm_cvtsi64_si128_emu(uint64_t lo)
return result;
}
inline int64_t _mm_cvtsi128_si64_emu(__m128i &a)
// Portable emulation of _mm_cvtsi128_si64: return the low 64 bits of the
// vector as a signed scalar.
inline int64_t _mm_cvtsi128_si64_emu(const __m128i &a)
{
return *(int64_t *)&a;
}
inline int32_t _mm_cvtsi128_si32_emu(__m128i &a)
// Portable emulation of _mm_cvtsi128_si32: return the low 32 bits of the
// vector as a signed scalar.
inline int32_t _mm_cvtsi128_si32_emu(const __m128i &a)
{
return *(int32_t *)&a;
}
@ -324,7 +330,7 @@ static inline uint64_t precompReduction64_port( __m128i A) {
}
// verus intermediate hash extra
static __m128i __verusclmulwithoutreduction64alignedrepeat_port(__m128i *randomsource, const __m128i buf[4], uint64_t keyMask)
__m128i __verusclmulwithoutreduction64alignedrepeat_port(__m128i *randomsource, const __m128i buf[4], uint64_t keyMask, __m128i **pMoveScratch)
{
__m128i const *pbuf;
@ -344,6 +350,9 @@ static __m128i __verusclmulwithoutreduction64alignedrepeat_port(__m128i *randoms
__m128i *prand = randomsource + ((selector >> 5) & keyMask);
__m128i *prandex = randomsource + ((selector >> 32) & keyMask);
*pMoveScratch++ = prand;
*pMoveScratch++ = prandex;
// select random start and order of pbuf processing
pbuf = buf + (selector & 3);
@ -508,6 +517,8 @@ static __m128i __verusclmulwithoutreduction64alignedrepeat_port(__m128i *randoms
do
{
// note that due to compiler and CPUs, we expect this to do:
// if (selector & ((0x10000000 << rounds) & 0xffffffff) if rounds != 3 else selector & 0xffffffff80000000):
if (selector & (0x10000000 << rounds))
{
onekey = _mm_load_si128_emu(rc++);
@ -522,7 +533,9 @@ static __m128i __verusclmulwithoutreduction64alignedrepeat_port(__m128i *randoms
__m128i temp2 = _mm_load_si128_emu(rounds & 1 ? buftmp : pbuf);
const uint64_t roundidx = aesround++ << 2;
AES2_EMU(onekey, temp2, roundidx);
MIX2_EMU(onekey, temp2);
acc = _mm_xor_si128_emu(onekey, acc);
acc = _mm_xor_si128_emu(temp2, acc);
}
@ -579,13 +592,311 @@ static __m128i __verusclmulwithoutreduction64alignedrepeat_port(__m128i *randoms
return acc;
}
// verus intermediate hash extra
__m128i __verusclmulwithoutreduction64alignedrepeat_sv2_1_port(__m128i *randomsource, const __m128i buf[4], uint64_t keyMask, __m128i **pMoveScratch)
{
const __m128i pbuf_copy[4] = {_mm_xor_si128(buf[0],buf[2]), _mm_xor_si128(buf[1],buf[3]), buf[2], buf[3]};
const __m128i *pbuf;
// divide key mask by 16 from bytes to __m128i
keyMask >>= 4;
// the random buffer must have at least 32 16 byte dwords after the keymask to work with this
// algorithm. we take the value from the last element inside the keyMask + 2, as that will never
// be used to xor into the accumulator before it is hashed with other values first
__m128i acc = _mm_load_si128_emu(randomsource + (keyMask + 2));
for (int64_t i = 0; i < 32; i++)
{
const uint64_t selector = _mm_cvtsi128_si64_emu(acc);
// get two random locations in the key, which will be mutated and swapped
__m128i *prand = randomsource + ((selector >> 5) & keyMask);
__m128i *prandex = randomsource + ((selector >> 32) & keyMask);
*pMoveScratch++ = prand;
*pMoveScratch++ = prandex;
// select random start and order of pbuf processing
pbuf = pbuf_copy + (selector & 3);
switch (selector & 0x1c)
{
case 0:
{
const __m128i temp1 = _mm_load_si128_emu(prandex);
const __m128i temp2 = _mm_load_si128_emu(pbuf - (((selector & 1) << 1) - 1));
const __m128i add1 = _mm_xor_si128_emu(temp1, temp2);
const __m128i clprod1 = _mm_clmulepi64_si128_emu(add1, add1, 0x10);
acc = _mm_xor_si128_emu(clprod1, acc);
const __m128i tempa1 = _mm_mulhrs_epi16_emu(acc, temp1);
const __m128i tempa2 = _mm_xor_si128_emu(tempa1, temp1);
const __m128i temp12 = _mm_load_si128_emu(prand);
_mm_store_si128_emu(prand, tempa2);
const __m128i temp22 = _mm_load_si128_emu(pbuf);
const __m128i add12 = _mm_xor_si128_emu(temp12, temp22);
const __m128i clprod12 = _mm_clmulepi64_si128_emu(add12, add12, 0x10);
acc = _mm_xor_si128_emu(clprod12, acc);
const __m128i tempb1 = _mm_mulhrs_epi16_emu(acc, temp12);
const __m128i tempb2 = _mm_xor_si128_emu(tempb1, temp12);
_mm_store_si128_emu(prandex, tempb2);
break;
}
case 4:
{
const __m128i temp1 = _mm_load_si128_emu(prand);
const __m128i temp2 = _mm_load_si128_emu(pbuf);
const __m128i add1 = _mm_xor_si128_emu(temp1, temp2);
const __m128i clprod1 = _mm_clmulepi64_si128_emu(add1, add1, 0x10);
acc = _mm_xor_si128_emu(clprod1, acc);
const __m128i clprod2 = _mm_clmulepi64_si128_emu(temp2, temp2, 0x10);
acc = _mm_xor_si128_emu(clprod2, acc);
const __m128i tempa1 = _mm_mulhrs_epi16_emu(acc, temp1);
const __m128i tempa2 = _mm_xor_si128_emu(tempa1, temp1);
const __m128i temp12 = _mm_load_si128_emu(prandex);
_mm_store_si128_emu(prandex, tempa2);
const __m128i temp22 = _mm_load_si128_emu(pbuf - (((selector & 1) << 1) - 1));
const __m128i add12 = _mm_xor_si128_emu(temp12, temp22);
acc = _mm_xor_si128_emu(add12, acc);
const __m128i tempb1 = _mm_mulhrs_epi16_emu(acc, temp12);
const __m128i tempb2 = _mm_xor_si128_emu(tempb1, temp12);
_mm_store_si128_emu(prand, tempb2);
break;
}
case 8:
{
const __m128i temp1 = _mm_load_si128_emu(prandex);
const __m128i temp2 = _mm_load_si128_emu(pbuf);
const __m128i add1 = _mm_xor_si128_emu(temp1, temp2);
acc = _mm_xor_si128_emu(add1, acc);
const __m128i tempa1 = _mm_mulhrs_epi16_emu(acc, temp1);
const __m128i tempa2 = _mm_xor_si128_emu(tempa1, temp1);
const __m128i temp12 = _mm_load_si128_emu(prand);
_mm_store_si128_emu(prand, tempa2);
const __m128i temp22 = _mm_load_si128_emu(pbuf - (((selector & 1) << 1) - 1));
const __m128i add12 = _mm_xor_si128_emu(temp12, temp22);
const __m128i clprod12 = _mm_clmulepi64_si128_emu(add12, add12, 0x10);
acc = _mm_xor_si128_emu(clprod12, acc);
const __m128i clprod22 = _mm_clmulepi64_si128_emu(temp22, temp22, 0x10);
acc = _mm_xor_si128_emu(clprod22, acc);
const __m128i tempb1 = _mm_mulhrs_epi16_emu(acc, temp12);
const __m128i tempb2 = _mm_xor_si128_emu(tempb1, temp12);
_mm_store_si128_emu(prandex, tempb2);
break;
}
case 0xc:
{
const __m128i temp1 = _mm_load_si128_emu(prand);
const __m128i temp2 = _mm_load_si128_emu(pbuf - (((selector & 1) << 1) - 1));
const __m128i add1 = _mm_xor_si128_emu(temp1, temp2);
// cannot be zero here
const int32_t divisor = (uint32_t)selector;
acc = _mm_xor_si128_emu(add1, acc);
const int64_t dividend = _mm_cvtsi128_si64_emu(acc);
const __m128i modulo = _mm_cvtsi32_si128_emu(dividend % divisor);
acc = _mm_xor_si128_emu(modulo, acc);
const __m128i tempa1 = _mm_mulhrs_epi16_emu(acc, temp1);
const __m128i tempa2 = _mm_xor_si128_emu(tempa1, temp1);
if (dividend & 1)
{
const __m128i temp12 = _mm_load_si128_emu(prandex);
_mm_store_si128_emu(prandex, tempa2);
const __m128i temp22 = _mm_load_si128_emu(pbuf);
const __m128i add12 = _mm_xor_si128_emu(temp12, temp22);
const __m128i clprod12 = _mm_clmulepi64_si128_emu(add12, add12, 0x10);
acc = _mm_xor_si128_emu(clprod12, acc);
const __m128i clprod22 = _mm_clmulepi64_si128_emu(temp22, temp22, 0x10);
acc = _mm_xor_si128_emu(clprod22, acc);
const __m128i tempb1 = _mm_mulhrs_epi16_emu(acc, temp12);
const __m128i tempb2 = _mm_xor_si128_emu(tempb1, temp12);
_mm_store_si128_emu(prand, tempb2);
}
else
{
const __m128i tempb3 = _mm_load_si128_emu(prandex);
_mm_store_si128_emu(prandex, tempa2);
_mm_store_si128_emu(prand, tempb3);
}
break;
}
case 0x10:
{
// a few AES operations
const __m128i *rc = prand;
__m128i tmp;
__m128i temp1 = _mm_load_si128_emu(pbuf - (((selector & 1) << 1) - 1));
__m128i temp2 = _mm_load_si128_emu(pbuf);
AES2_EMU(temp1, temp2, 0);
MIX2_EMU(temp1, temp2);
AES2_EMU(temp1, temp2, 4);
MIX2_EMU(temp1, temp2);
AES2_EMU(temp1, temp2, 8);
MIX2_EMU(temp1, temp2);
acc = _mm_xor_si128_emu(temp1, acc);
acc = _mm_xor_si128_emu(temp2, acc);
const __m128i tempa1 = _mm_load_si128_emu(prand);
const __m128i tempa2 = _mm_mulhrs_epi16_emu(acc, tempa1);
const __m128i tempa3 = _mm_xor_si128_emu(tempa1, tempa2);
const __m128i tempa4 = _mm_load_si128_emu(prandex);
_mm_store_si128_emu(prandex, tempa3);
_mm_store_si128_emu(prand, tempa4);
break;
}
case 0x14:
{
// we'll just call this one the monkins loop, inspired by Chris
const __m128i *buftmp = pbuf - (((selector & 1) << 1) - 1);
__m128i tmp; // used by MIX2
uint64_t rounds = selector >> 61; // loop randomly between 1 and 8 times
__m128i *rc = prand;
uint64_t aesround = 0;
__m128i onekey;
do
{
// this is simplified over the original verus_clhash
if (selector & (((uint64_t)0x10000000) << rounds))
{
onekey = _mm_load_si128_emu(rc++);
const __m128i temp2 = _mm_load_si128_emu(rounds & 1 ? pbuf : buftmp);
const __m128i add1 = _mm_xor_si128_emu(onekey, temp2);
const __m128i clprod1 = _mm_clmulepi64_si128_emu(add1, add1, 0x10);
acc = _mm_xor_si128_emu(clprod1, acc);
}
else
{
onekey = _mm_load_si128_emu(rc++);
__m128i temp2 = _mm_load_si128_emu(rounds & 1 ? buftmp : pbuf);
const uint64_t roundidx = aesround++ << 2;
AES2_EMU(onekey, temp2, roundidx);
MIX2_EMU(onekey, temp2);
acc = _mm_xor_si128_emu(onekey, acc);
acc = _mm_xor_si128_emu(temp2, acc);
}
} while (rounds--);
const __m128i tempa1 = _mm_load_si128_emu(prand);
const __m128i tempa2 = _mm_mulhrs_epi16_emu(acc, tempa1);
const __m128i tempa3 = _mm_xor_si128_emu(tempa1, tempa2);
const __m128i tempa4 = _mm_load_si128_emu(prandex);
_mm_store_si128_emu(prandex, tempa3);
_mm_store_si128_emu(prand, tempa4);
break;
}
case 0x18:
{
const __m128i *buftmp = pbuf - (((selector & 1) << 1) - 1);
__m128i tmp; // used by MIX2
uint64_t rounds = selector >> 61; // loop randomly between 1 and 8 times
__m128i *rc = prand;
uint64_t aesroundoffset = 0;
__m128i onekey;
do
{
if (selector & (((uint64_t)0x10000000) << rounds))
{
onekey = _mm_load_si128_emu(rc++);
const __m128i temp2 = _mm_load_si128_emu(rounds & 1 ? pbuf : buftmp);
const __m128i add1 = _mm_xor_si128_emu(onekey, temp2);
// cannot be zero here, may be negative
const int32_t divisor = (uint32_t)selector;
const int64_t dividend = _mm_cvtsi128_si64_emu(add1);
const __m128i modulo = _mm_cvtsi32_si128_emu(dividend % divisor);
acc = _mm_xor_si128_emu(modulo, acc);
}
else
{
onekey = _mm_load_si128_emu(rc++);
__m128i temp2 = _mm_load_si128_emu(rounds & 1 ? buftmp : pbuf);
const __m128i add1 = _mm_xor_si128_emu(onekey, temp2);
const __m128i clprod1 = _mm_clmulepi64_si128_emu(add1, add1, 0x10);
const __m128i clprod2 = _mm_mulhrs_epi16_emu(acc, clprod1);
acc = _mm_xor_si128_emu(clprod2, acc);
}
} while (rounds--);
const __m128i tempa3 = _mm_load_si128_emu(prandex);
const __m128i tempa4 = _mm_xor_si128_emu(tempa3, acc);
_mm_store_si128_emu(prandex, tempa4);
_mm_store_si128_emu(prand, onekey);
break;
}
case 0x1c:
{
const __m128i temp1 = _mm_load_si128_emu(pbuf);
const __m128i temp2 = _mm_load_si128_emu(prandex);
const __m128i add1 = _mm_xor_si128_emu(temp1, temp2);
const __m128i clprod1 = _mm_clmulepi64_si128_emu(add1, add1, 0x10);
acc = _mm_xor_si128_emu(clprod1, acc);
const __m128i tempa1 = _mm_mulhrs_epi16_emu(acc, temp2);
const __m128i tempa2 = _mm_xor_si128_emu(tempa1, temp2);
const __m128i tempa3 = _mm_load_si128_emu(prand);
_mm_store_si128_emu(prand, tempa2);
acc = _mm_xor_si128_emu(tempa3, acc);
const __m128i tempb1 = _mm_mulhrs_epi16_emu(acc, tempa3);
const __m128i tempb2 = _mm_xor_si128_emu(tempb1, tempa3);
_mm_store_si128_emu(prandex, tempb2);
break;
}
}
}
return acc;
}
// hashes 64 bytes only by doing a carryless multiplication and reduction of the repeated 64 byte sequence 16 times,
// returning a 64 bit hash value
uint64_t verusclhash_port(void * random, const unsigned char buf[64], uint64_t keyMask, __m128i **pMoveScratch) {
    // view the key material and the 64-byte message as 128-bit lanes
    __m128i *key128 = (__m128i *)random;
    const __m128i *msg128 = (const __m128i *)buf;
    // accumulate the unreduced carryless product over the keyed rounds,
    // recording each key location touched into pMoveScratch for the caller
    __m128i unreduced = __verusclmulwithoutreduction64alignedrepeat_port(key128, msg128, keyMask, pMoveScratch);
    // fold in the (fixed 1024-bit / 64-byte) length hash, then reduce the
    // 128-bit accumulator down to the final 64-bit value
    unreduced = _mm_xor_si128_emu(unreduced, lazyLengthHash_port(1024, 64));
    return precompReduction64_port(unreduced);
}
// hashes 64 bytes only by doing a carryless multiplication and reduction of the repeated 64 byte sequence 16 times,
// returning a 64 bit hash value
//
// VerusHash v2.1 variant: identical outer structure to verusclhash_port, but
// dispatches to the sv2_1 inner loop, which takes pMoveScratch so the caller
// can refresh the key locations mutated during hashing.
uint64_t verusclhash_sv2_1_port(void * random, const unsigned char buf[64], uint64_t keyMask, __m128i **pMoveScratch) {
    __m128i * rs64 = (__m128i *)random;
    const __m128i * string = (const __m128i *) buf;
    // unreduced carryless accumulation over the keyed rounds (v2.1 round set)
    __m128i acc = __verusclmulwithoutreduction64alignedrepeat_sv2_1_port(rs64, string, keyMask, pMoveScratch);
    // mix in the fixed length hash (1024 bits keyed, 64 bytes of message),
    // then reduce the 128-bit accumulator to the 64-bit result
    acc = _mm_xor_si128_emu(acc, lazyLengthHash_port(1024, 64));
    return precompReduction64_port(acc);
}

40
crypto/verus_hash.h

@ -84,7 +84,7 @@ class CVerusHashV2
verusclhasher vclh;
CVerusHashV2() : vclh() {
CVerusHashV2(int solutionVerusion=SOLUTION_VERUSHHASH_V2) : vclh(VERUSKEYSIZE, solutionVerusion) {
// we must have allocated key space, or can't run
if (!verusclhasher_key.get())
{
@ -117,11 +117,11 @@ class CVerusHashV2
inline void FillExtra(const T *_data)
{
unsigned char *data = (unsigned char *)_data;
unsigned int pos = curPos;
unsigned int left = 32 - pos;
int pos = curPos;
int left = 32 - pos;
do
{
unsigned int len = left > sizeof(T) ? sizeof(T) : left;
int len = left > sizeof(T) ? sizeof(T) : left;
std::memcpy(curBuf + 32 + pos, data, len);
pos += len;
left -= len;
@ -146,13 +146,15 @@ class CVerusHashV2
{
unsigned char *key = (unsigned char *)verusclhasher_key.get();
verusclhash_descr *pdesc = (verusclhash_descr *)verusclhasher_descr.get();
int size = pdesc->keySizeInBytes;
int refreshsize = verusclhasher::keymask(size) + 1;
// skip keygen if it is the current key
if (pdesc->seed != *((uint256 *)seedBytes32))
{
// generate a new key by chain hashing with Haraka256 from the last curbuf
int n256blks = pdesc->keySizeInBytes >> 5;
int nbytesExtra = pdesc->keySizeInBytes & 0x1f;
unsigned char *pkey = key + pdesc->keySizeInBytes;
int n256blks = size >> 5;
int nbytesExtra = size & 0x1f;
unsigned char *pkey = key;
unsigned char *psrc = seedBytes32;
for (int i = 0; i < n256blks; i++)
{
@ -167,8 +169,14 @@ class CVerusHashV2
memcpy(pkey, buf, nbytesExtra);
}
pdesc->seed = *((uint256 *)seedBytes32);
memcpy(key + size, key, refreshsize);
}
memcpy(key, key + pdesc->keySizeInBytes, pdesc->keySizeInBytes);
else
{
memcpy(key, key + size, refreshsize);
}
memset((unsigned char *)key + (size + refreshsize), 0, size - refreshsize);
return (u128 *)key;
}
@ -210,22 +218,6 @@ class CVerusHashV2
// get the final hash with a mutated dynamic key for each hash result
(*haraka512KeyedFunction)(hash, curBuf, key + IntermediateTo128Offset(intermediate));
/*
// TEST BEGIN
// test against the portable version
uint256 testHash1 = *(uint256 *)hash, testHash2;
FillExtra((u128 *)curBuf);
u128 *hashKey = ((u128 *)vclh.gethashkey());
uint64_t temp = verusclhash_port(key, curBuf, vclh.keyMask);
FillExtra(&temp);
haraka512_keyed((unsigned char *)&testHash2, curBuf, hashKey + IntermediateTo128Offset(intermediate));
if (testHash1 != testHash2)
{
printf("Portable version failed! intermediate1: %lx, intermediate2: %lx\n", intermediate, temp);
}
// END TEST
*/
}
inline unsigned char *CurBuffer()

20
test.js

@ -12,7 +12,7 @@ var reverseHex = function (hex) {
};
var numWorkers = require('os').cpus().length;
numWorkers = 20;
numWorkers = 1; /* increase for multi-thread testing of data collision */
if (cluster.isMaster) {
@ -28,13 +28,15 @@ if (cluster.isMaster) {
} else {
var output = vh.hash(Buffer.from('Test1234','utf8'));
console.log(process.pid,'VerusHash1 Output', reverseHex(output.toString('hex')), '\n');
output = vh.init().update(Buffer.from('Test','utf8')).update(Buffer.from('123','utf8')).update(Buffer.from('4','utf8')).digest();
console.log(process.pid,'VerusHash1 Output', reverseHex(output.toString('hex')), '\n');
var output = vh.hash(Buffer.from('Test1234Test1234Test1234Test1234Test1234Test1234Test1234Test1234Test1234Test1234Test1234Test1234','utf8'));
console.log(process.pid,'VerusHash1 Output', reverseHex(output.toString('hex')), '\n');
output = vh.hash2(Buffer.from('Test1234','utf8'));
console.log(process.pid,'VerusHash2 Output', reverseHex(output.toString('hex')), '\n');
output = vh.init().update2(Buffer.from('Test','utf8')).update2(Buffer.from('123','utf8')).update2(Buffer.from('4','utf8')).digest2();
console.log(process.pid,'VerusHash2 Output', reverseHex(output.toString('hex')), '\n');
output = vh.hash2(Buffer.from('Test1234Test1234Test1234Test1234Test1234Test1234Test1234Test1234Test1234Test1234Test1234Test1234','utf8'));
console.log(process.pid,'VerusHash2 Output', reverseHex(output.toString('hex')), '\n');
output = vh.hash2b(Buffer.from('Test1234Test1234Test1234Test1234Test1234Test1234Test1234Test1234Test1234Test1234Test1234Test1234','utf8'));
console.log(process.pid,'VerusHash2b Output', reverseHex(output.toString('hex')), '\n');
output = vh.hash2b1(Buffer.from('Test1234Test1234Test1234Test1234Test1234Test1234Test1234Test1234Test1234Test1234Test1234Test1234','utf8'));
console.log(process.pid,'VerusHash2b1 Output', reverseHex(output.toString('hex')), '\n');
}

163
verushash.cc

@ -11,6 +11,8 @@ using namespace v8;
CVerusHash* vh;
CVerusHashV2* vh2;
CVerusHashV2* vh2b1;
bool initialized = false;
void initialize()
@ -20,8 +22,11 @@ void initialize()
CVerusHash::init();
CVerusHashV2::init();
}
vh = new CVerusHash();
vh2 = new CVerusHashV2();
vh2 = new CVerusHashV2(SOLUTION_VERUSHHASH_V2);
vh2b1 = new CVerusHashV2(SOLUTION_VERUSHHASH_V2_1);
initialized = true;
}
@ -30,69 +35,23 @@ void verusInit(const v8::FunctionCallbackInfo<Value>& args) {
args.GetReturnValue().Set(args.This());
}
void verusUpdate(const v8::FunctionCallbackInfo<Value>& args) {
void verusHash(const v8::FunctionCallbackInfo<Value>& args) {
Isolate* isolate = Isolate::GetCurrent();
HandleScope scope(isolate);
if (initialized == false){
isolate->ThrowException(
Exception::TypeError(String::NewFromUtf8(isolate, "call init() first!"))
);
}
if (args.Length() < 1) {
isolate->ThrowException(
Exception::TypeError(String::NewFromUtf8(isolate, "Wrong number of arguments"))
);
return;
}
Local<Object> buffer = args[0]->ToObject();
if(!node::Buffer::HasInstance(buffer)) {
MaybeLocal<Object> maybeBuffer = Nan::To<v8::Object>(args[0]);
Local<Object> buffer;
if (maybeBuffer.ToLocal(&buffer) != true) {
isolate->ThrowException(
Exception::TypeError(String::NewFromUtf8(isolate, "Invalid buffer objects."))
);
return;
}
const char *buff = node::Buffer::Data(buffer);
vh->Write((const unsigned char *)buff, node::Buffer::Length(buffer));
args.GetReturnValue().Set(args.This());
}
void verusDigest(const v8::FunctionCallbackInfo<Value>& args) {
Isolate* isolate = Isolate::GetCurrent();
HandleScope scope(isolate);
if (initialized == false){
isolate->ThrowException(
Exception::TypeError(String::NewFromUtf8(isolate, "call init() first!"))
);
}
char *result = new char[32];
vh->Finalize((unsigned char *)result);
args.GetReturnValue().Set(Nan::NewBuffer(result, 32).ToLocalChecked());
}
void verusReset(const v8::FunctionCallbackInfo<Value>& args) {
Isolate* isolate = Isolate::GetCurrent();
HandleScope scope(isolate);
if (initialized == false){
isolate->ThrowException(
Exception::TypeError(String::NewFromUtf8(isolate, "call init() first!"))
);
}
vh->Reset();
args.GetReturnValue().Set(args.This());
}
void verusHash(const v8::FunctionCallbackInfo<Value>& args) {
Isolate* isolate = Isolate::GetCurrent();
HandleScope scope(isolate);
if (args.Length() < 1) {
isolate->ThrowException(
Exception::TypeError(String::NewFromUtf8(isolate, "Wrong number of arguments"))
);
return;
}
Local<Object> buffer = args[0]->ToObject();
if(!node::Buffer::HasInstance(buffer)) {
isolate->ThrowException(
Exception::TypeError(String::NewFromUtf8(isolate, "Invalid buffer objects."))
@ -108,25 +67,26 @@ void verusHash(const v8::FunctionCallbackInfo<Value>& args) {
initialize();
}
verus_hash(result, buff, node::Buffer::Length(buffer));
args.GetReturnValue().Set(Nan::NewBuffer(result, 32).ToLocalChecked());
}
void verusUpdateV2(const v8::FunctionCallbackInfo<Value>& args) {
void verusHashV2(const v8::FunctionCallbackInfo<Value>& args) {
Isolate* isolate = Isolate::GetCurrent();
HandleScope scope(isolate);
if (initialized == false){
if (args.Length() < 1) {
isolate->ThrowException(
Exception::TypeError(String::NewFromUtf8(isolate, "call init() first!"))
Exception::TypeError(String::NewFromUtf8(isolate, "Wrong number of arguments"))
);
return;
}
if (args.Length() < 1) {
MaybeLocal<Object> maybeBuffer = Nan::To<v8::Object>(args[0]);
Local<Object> buffer;
if (maybeBuffer.ToLocal(&buffer) != true) {
isolate->ThrowException(
Exception::TypeError(String::NewFromUtf8(isolate, "Wrong number of arguments"))
Exception::TypeError(String::NewFromUtf8(isolate, "Invalid buffer objects."))
);
return;
}
Local<Object> buffer = args[0]->ToObject();
if(!node::Buffer::HasInstance(buffer)) {
isolate->ThrowException(
Exception::TypeError(String::NewFromUtf8(isolate, "Invalid buffer objects."))
@ -135,50 +95,20 @@ void verusUpdateV2(const v8::FunctionCallbackInfo<Value>& args) {
}
const char *buff = node::Buffer::Data(buffer);
vh2->Write((const unsigned char *)buff, node::Buffer::Length(buffer));
args.GetReturnValue().Set(args.This());
}
void verusDigestV2(const v8::FunctionCallbackInfo<Value>& args) {
Isolate* isolate = Isolate::GetCurrent();
HandleScope scope(isolate);
if (initialized == false){
isolate->ThrowException(
Exception::TypeError(String::NewFromUtf8(isolate, "call init() first!"))
);
}
char *result = new char[32];
vh2->Finalize((unsigned char *)result);
args.GetReturnValue().Set(Nan::NewBuffer(result, 32).ToLocalChecked());
}
void verusDigestV2b(const v8::FunctionCallbackInfo<Value>& args) {
Isolate* isolate = Isolate::GetCurrent();
HandleScope scope(isolate);
if (initialized == false){
isolate->ThrowException(
Exception::TypeError(String::NewFromUtf8(isolate, "call init() first!"))
);
if (initialized == false) {
initialize();
}
char *result = new char[32];
vh2->Finalize2b((unsigned char *)result);
args.GetReturnValue().Set(Nan::NewBuffer(result, 32).ToLocalChecked());
}
void verusResetV2(const v8::FunctionCallbackInfo<Value>& args) {
Isolate* isolate = Isolate::GetCurrent();
HandleScope scope(isolate);
if (initialized == false){
isolate->ThrowException(
Exception::TypeError(String::NewFromUtf8(isolate, "call init() first!"))
);
}
vh2->Reset();
args.GetReturnValue().Set(args.This());
vh2->Write((const unsigned char *)buff, node::Buffer::Length(buffer));
vh2->Finalize((unsigned char *)result);
args.GetReturnValue().Set(Nan::NewBuffer(result, 32).ToLocalChecked());
}
void verusHashV2(const v8::FunctionCallbackInfo<Value>& args) {
void verusHashV2b(const v8::FunctionCallbackInfo<Value>& args) {
Isolate* isolate = Isolate::GetCurrent();
HandleScope scope(isolate);
if (args.Length() < 1) {
@ -187,7 +117,14 @@ void verusHashV2(const v8::FunctionCallbackInfo<Value>& args) {
);
return;
}
Local<Object> buffer = args[0]->ToObject();
MaybeLocal<Object> maybeBuffer = Nan::To<v8::Object>(args[0]);
Local<Object> buffer;
if (maybeBuffer.ToLocal(&buffer) != true) {
isolate->ThrowException(
Exception::TypeError(String::NewFromUtf8(isolate, "Invalid buffer objects."))
);
return;
}
if(!node::Buffer::HasInstance(buffer)) {
isolate->ThrowException(
Exception::TypeError(String::NewFromUtf8(isolate, "Invalid buffer objects."))
@ -205,11 +142,11 @@ void verusHashV2(const v8::FunctionCallbackInfo<Value>& args) {
vh2->Reset();
vh2->Write((const unsigned char *)buff, node::Buffer::Length(buffer));
vh2->Finalize((unsigned char *)result);
vh2->Finalize2b((unsigned char *)result);
args.GetReturnValue().Set(Nan::NewBuffer(result, 32).ToLocalChecked());
}
void verusHashV2b(const v8::FunctionCallbackInfo<Value>& args) {
void verusHashV2b1(const v8::FunctionCallbackInfo<Value>& args) {
Isolate* isolate = Isolate::GetCurrent();
HandleScope scope(isolate);
if (args.Length() < 1) {
@ -218,7 +155,14 @@ void verusHashV2b(const v8::FunctionCallbackInfo<Value>& args) {
);
return;
}
Local<Object> buffer = args[0]->ToObject();
MaybeLocal<Object> maybeBuffer = Nan::To<v8::Object>(args[0]);
Local<Object> buffer;
if (maybeBuffer.ToLocal(&buffer) != true) {
isolate->ThrowException(
Exception::TypeError(String::NewFromUtf8(isolate, "Invalid buffer objects."))
);
return;
}
if(!node::Buffer::HasInstance(buffer)) {
isolate->ThrowException(
Exception::TypeError(String::NewFromUtf8(isolate, "Invalid buffer objects."))
@ -234,24 +178,19 @@ void verusHashV2b(const v8::FunctionCallbackInfo<Value>& args) {
initialize();
}
vh2->Reset();
vh2->Write((const unsigned char *)buff, node::Buffer::Length(buffer));
vh2->Finalize2b((unsigned char *)result);
vh2b1->Reset();
vh2b1->Write((const unsigned char *)buff, node::Buffer::Length(buffer));
vh2b1->Finalize2b((unsigned char *)result);
args.GetReturnValue().Set(Nan::NewBuffer(result, 32).ToLocalChecked());
}
void Init(Handle<Object> exports) {
NODE_SET_METHOD(exports, "init", verusInit);
NODE_SET_METHOD(exports, "update", verusUpdate);
NODE_SET_METHOD(exports, "digest", verusDigest);
NODE_SET_METHOD(exports, "reset", verusReset);
NODE_SET_METHOD(exports, "hash", verusHash);
NODE_SET_METHOD(exports, "update2", verusUpdateV2);
NODE_SET_METHOD(exports, "digest2", verusDigestV2);
NODE_SET_METHOD(exports, "digest2b", verusDigestV2b);
NODE_SET_METHOD(exports, "reset2", verusResetV2);
NODE_SET_METHOD(exports, "hash2", verusHashV2);
NODE_SET_METHOD(exports, "hash2b", verusHashV2b);
NODE_SET_METHOD(exports, "hash", verusHash); //VerusHash V1
NODE_SET_METHOD(exports, "hash2", verusHashV2); //VerusHash V2
NODE_SET_METHOD(exports, "hash2b", verusHashV2b); //VerusHash V2B
NODE_SET_METHOD(exports, "hash2b1", verusHashV2b1); //VerusHash V2B1
}
NODE_MODULE(verushash, Init)

Loading…
Cancel
Save