Support VerusHash V2

6 years ago · 664230d2b7
17 changed files with 3712 additions and 168 deletions
--- a/.gitignore
+++ b/.gitignore
@ -1,3 +1,4 @@
 .lock-wscript
 build/
 crypto.node
+.vscode/settings.json
--- a/binding.gyp
+++ b/binding.gyp
@ -8,10 +8,18 @@
                "crypto/common.h",
                "crypto/haraka.h",
                "crypto/haraka_portable.h",
+                "crypto/verus_clhash.h",
                "crypto/verus_hash.h",
                "crypto/haraka.c",
                "crypto/haraka_portable.c",
+                "crypto/tinyformat.h",
+                "crypto/uint256.cpp",
+                "crypto/uint256.h",
+                "crypto/utilstrencodings.cpp",
+                "crypto/utilstrencodings.h",
                "crypto/verus_hash.cpp",
+                "crypto/verus_clhash.cpp",
+                "crypto/verus_clhash_portable.cpp",
                "verushash.cc",
            ],
            "include_dirs": [
@ -31,6 +39,7 @@
                "-msse4.2",
                "-mssse3",
                "-mavx",
+                "-mpclmul",
                "-maes",
            ],
            "cflags": [
@ -44,6 +53,7 @@
                "-msse4.2",
                "-mssse3",
                "-mavx",
+                "-mpclmul",
                "-maes",
            ],
            "link_settings": {
--- a/crypto/haraka.c
+++ b/crypto/haraka.c
@ -140,6 +140,34 @@ void haraka256(unsigned char *out, const unsigned char *in) {
  STORE(out + 16, s[1]);
 }

+void haraka256_keyed(unsigned char *out, const unsigned char *in, const u128 *rc) {
+  __m128i s[2], tmp;
+
+  s[0] = LOAD(in);
+  s[1] = LOAD(in + 16);
+
+  AES2(s[0], s[1], 0);
+  MIX2(s[0], s[1]);
+
+  AES2(s[0], s[1], 4);
+  MIX2(s[0], s[1]);
+
+  AES2(s[0], s[1], 8);
+  MIX2(s[0], s[1]);
+
+  AES2(s[0], s[1], 12);
+  MIX2(s[0], s[1]);
+
+  AES2(s[0], s[1], 16);
+  MIX2(s[0], s[1]);
+
+  s[0] = _mm_xor_si128(s[0], LOAD(in));
+  s[1] = _mm_xor_si128(s[1], LOAD(in + 16));
+
+  STORE(out, s[0]);
+  STORE(out + 16, s[1]);
+}
+
 void haraka256_4x(unsigned char *out, const unsigned char *in) {
  __m128i s[4][2], tmp;

@ -397,6 +425,37 @@ void haraka512_zero(unsigned char *out, const unsigned char *in) {
  TRUNCSTORE(out, s[0], s[1], s[2], s[3]);
 }

+void haraka512_keyed(unsigned char *out, const unsigned char *in, const u128 *rc) {
+  u128 s[4], tmp;
+
+  s[0] = LOAD(in);
+  s[1] = LOAD(in + 16);
+  s[2] = LOAD(in + 32);
+  s[3] = LOAD(in + 48);
+
+  AES4(s[0], s[1], s[2], s[3], 0);
+  MIX4(s[0], s[1], s[2], s[3]);
+
+  AES4(s[0], s[1], s[2], s[3], 8);
+  MIX4(s[0], s[1], s[2], s[3]);
+
+  AES4(s[0], s[1], s[2], s[3], 16);
+  MIX4(s[0], s[1], s[2], s[3]);
+
+  AES4(s[0], s[1], s[2], s[3], 24);
+  MIX4(s[0], s[1], s[2], s[3]);
+
+  AES4(s[0], s[1], s[2], s[3], 32);
+  MIX4(s[0], s[1], s[2], s[3]);
+
+  s[0] = _mm_xor_si128(s[0], LOAD(in));
+  s[1] = _mm_xor_si128(s[1], LOAD(in + 16));
+  s[2] = _mm_xor_si128(s[2], LOAD(in + 32));
+  s[3] = _mm_xor_si128(s[3], LOAD(in + 48));
+
+  TRUNCSTORE(out, s[0], s[1], s[2], s[3]);
+}
+
 void haraka512_4x(unsigned char *out, const unsigned char *in) {
  u128 s[4][4], tmp;

--- a/crypto/haraka.h
+++ b/crypto/haraka.h
@ -104,10 +104,10 @@ extern u128 rc[40];
  s1 = _mm_unpacklo_epi32(s1, tmp);

 #define TRUNCSTORE(out, s0, s1, s2, s3) \
-  *(u64*)(out) = (u64*)(s0)[1]; \
-  *(u64*)(out + 8) = (u64*)(s1)[1]; \
-  *(u64*)(out + 16) = (u64*)(s2)[0]; \
-  *(u64*)(out + 24) = (u64*)(s3)[0];
+  *(u64*)(out) = *(((u64*)&(s0) + 1)); \
+  *(u64*)(out + 8) = *(((u64*)&(s1) + 1)); \
+  *(u64*)(out + 16) = *(((u64*)&(s2) + 0)); \
+  *(u64*)(out + 24) = *(((u64*)&(s3) + 0));

 void load_constants();
 void test_implementations();
@ -115,11 +115,13 @@ void test_implementations();
 void load_constants();

 void haraka256(unsigned char *out, const unsigned char *in);
+void haraka256_keyed(unsigned char *out, const unsigned char *in, const u128 *rc);
 void haraka256_4x(unsigned char *out, const unsigned char *in);
 void haraka256_8x(unsigned char *out, const unsigned char *in);

 void haraka512(unsigned char *out, const unsigned char *in);
 void haraka512_zero(unsigned char *out, const unsigned char *in);
+void haraka512_keyed(unsigned char *out, const unsigned char *in, const u128 *rc);
 void haraka512_4x(unsigned char *out, const unsigned char *in);
 void haraka512_8x(unsigned char *out, const unsigned char *in);

--- a/crypto/haraka_portable.c
+++ b/crypto/haraka_portable.c
@ -147,32 +147,32 @@ void tweak_constants(const unsigned char *pk_seed, const unsigned char *sk_seed,
    memcpy(rc, buf, 40*16);    
 }

-static void haraka_S_absorb(unsigned char *s, unsigned int r,
+static void haraka_S_absorb(unsigned char *s, 
                            const unsigned char *m, unsigned long long mlen,
                            unsigned char p)
 {
    unsigned long long i;
-    unsigned char t[r];
+    unsigned char t[32];

-    while (mlen >= r) {
+    while (mlen >= 32) {
        // XOR block to state
-        for (i = 0; i < r; ++i) {
+        for (i = 0; i < 32; ++i) {
            s[i] ^= m[i];
        }
        haraka512_perm(s, s);
-        mlen -= r;
-        m += r;
+        mlen -= 32;
+        m += 32;
    }

-    for (i = 0; i < r; ++i) {
+    for (i = 0; i < 32; ++i) {
        t[i] = 0;
    }
    for (i = 0; i < mlen; ++i) {
        t[i] = m[i];
    }
    t[i] = p;
-    t[r - 1] |= 128;
-    for (i = 0; i < r; ++i) {
+    t[32 - 1] |= 128;
+    for (i = 0; i < 32; ++i) {
        s[i] ^= t[i];
    }
 }
@ -199,7 +199,7 @@ void haraka_S(unsigned char *out, unsigned long long outlen,
    for (i = 0; i < 64; i++) {
        s[i] = 0;
    }
-    haraka_S_absorb(s, 32, in, inlen, 0x1F);
+    haraka_S_absorb(s, in, inlen, 0x1F);

    haraka_S_squeezeblocks(out, outlen / 32, s, 32);
    out += (outlen / 32) * 32;
@ -246,6 +246,40 @@ void haraka512_perm(unsigned char *out, const unsigned char *in)
    memcpy(out, s, 64);
 }

+void haraka512_perm_keyed(unsigned char *out, const unsigned char *in, const u128 *rc) 
+{
+    int i, j;
+
+    unsigned char s[64], tmp[16];
+
+    memcpy(s, in, 16);
+    memcpy(s + 16, in + 16, 16);
+    memcpy(s + 32, in + 32, 16);
+    memcpy(s + 48, in + 48, 16);
+
+    for (i = 0; i < 5; ++i) {
+        // aes round(s)
+        for (j = 0; j < 2; ++j) {
+            aesenc(s, (const unsigned char *)&rc[4*2*i + 4*j]);
+            aesenc(s + 16, (const unsigned char *)&rc[4*2*i + 4*j + 1]);
+            aesenc(s + 32, (const unsigned char *)&rc[4*2*i + 4*j + 2]);
+            aesenc(s + 48, (const unsigned char *)&rc[4*2*i + 4*j + 3]);
+        }
+
+        // mixing
+        unpacklo32(tmp, s, s + 16);
+        unpackhi32(s, s, s + 16);
+        unpacklo32(s + 16, s + 32, s + 48);
+        unpackhi32(s + 32, s + 32, s + 48);
+        unpacklo32(s + 48, s, s + 32);
+        unpackhi32(s, s, s + 32);
+        unpackhi32(s + 32, s + 16, tmp);
+        unpacklo32(s + 16, s + 16, tmp);
+    }
+
+    memcpy(out, s, 64);
+}
+
 void haraka512_port(unsigned char *out, const unsigned char *in)
 {
    int i;
@ -265,6 +299,25 @@ void haraka512_port(unsigned char *out, const unsigned char *in)
    memcpy(out + 24, buf + 48, 8);
 }

+void haraka512_port_keyed(unsigned char *out, const unsigned char *in, const u128 *rc)
+{
+    int i;
+
+    unsigned char buf[64];
+
+    haraka512_perm_keyed(buf, in, rc);
+    /* Feed-forward */
+    for (i = 0; i < 64; i++) {
+        buf[i] = buf[i] ^ in[i];
+    }
+
+    /* Truncated */
+    memcpy(out,      buf + 8, 8);
+    memcpy(out + 8,  buf + 24, 8);
+    memcpy(out + 16, buf + 32, 8);
+    memcpy(out + 24, buf + 48, 8);
+}
+
 void haraka512_perm_zero(unsigned char *out, const unsigned char *in) 
 {
    int i, j;
--- a/crypto/haraka_portable.h
+++ b/crypto/haraka_portable.h
@ -1,6 +1,54 @@
 #ifndef SPX_HARAKA_H
 #define SPX_HARAKA_H

+#include "immintrin.h"
+
+#define NUMROUNDS 5
+
+#ifdef _WIN32
+typedef unsigned long long u64;
+#else
+typedef unsigned long u64;
+#endif
+typedef __m128i u128;
+
+extern void aesenc(unsigned char *s, const unsigned char *rk);
+
+#define AES2_EMU(s0, s1, rci) \
+  aesenc((unsigned char *)&s0, (unsigned char *)&(rc[rci])); \
+  aesenc((unsigned char *)&s1, (unsigned char *)&(rc[rci + 1])); \
+  aesenc((unsigned char *)&s0, (unsigned char *)&(rc[rci + 2])); \
+  aesenc((unsigned char *)&s1, (unsigned char *)&(rc[rci + 3]));
+
+typedef unsigned int uint32_t;
+
+static inline __m128i _mm_unpacklo_epi32_emu(__m128i a, __m128i b)
+{
+    uint32_t result[4]; /* interleaved low 32-bit lanes: a0, b0, a1, b1 */
+    uint32_t *tmp1 = (uint32_t *)&a, *tmp2 = (uint32_t *)&b;
+    result[0] = tmp1[0];
+    result[1] = tmp2[0];
+    result[2] = tmp1[1];
+    result[3] = tmp2[1];
+    return _mm_loadu_si128((const __m128i *)result); /* unaligned load: result[] is only guaranteed 4-byte alignment */
+}
+
+static inline __m128i _mm_unpackhi_epi32_emu(__m128i a, __m128i b)
+{
+    uint32_t result[4]; /* interleaved high 32-bit lanes: a2, b2, a3, b3 */
+    uint32_t *tmp1 = (uint32_t *)&a, *tmp2 = (uint32_t *)&b;
+    result[0] = tmp1[2];
+    result[1] = tmp2[2];
+    result[2] = tmp1[3];
+    result[3] = tmp2[3];
+    return _mm_loadu_si128((const __m128i *)result); /* unaligned load: result[] is only guaranteed 4-byte alignment */
+}
+
+#define MIX2_EMU(s0, s1) \
+  tmp = _mm_unpacklo_epi32_emu(s0, s1); \
+  s1 = _mm_unpackhi_epi32_emu(s0, s1); \
+  s0 = tmp;
+
 /* load constants */
 void load_constants_port();

@ -18,6 +66,9 @@ void haraka512_perm(unsigned char *out, const unsigned char *in);
 /* Implementation of Haraka-512 */
 void haraka512_port(unsigned char *out, const unsigned char *in);

+/* Implementation of Haraka-512 using the caller-supplied round constants rc */
+void haraka512_port_keyed(unsigned char *out, const unsigned char *in, const u128 *rc);
+
 /* Applies the 512-bit Haraka permutation to in, using zero key. */
 void haraka512_perm_zero(unsigned char *out, const unsigned char *in);

--- a/crypto/tinyformat.h
+++ b/crypto/tinyformat.h
--- a/crypto/uint256.cpp
+++ b/crypto/uint256.cpp
@ -0,0 +1,146 @@
+// Copyright (c) 2009-2010 Satoshi Nakamoto
+// Copyright (c) 2009-2014 The Bitcoin Core developers
+// Distributed under the MIT software license, see the accompanying
+// file COPYING or http://www.opensource.org/licenses/mit-license.php.
+
+#include "uint256.h"
+
+#include "utilstrencodings.h"
+
+#include <stdio.h>
+#include <string.h>
+
+template <unsigned int BITS>
+base_blob<BITS>::base_blob(const std::vector<unsigned char>& vch)
+{
+    assert(vch.size() == sizeof(data));
+    memcpy(data, &vch[0], sizeof(data));
+}
+
+template <unsigned int BITS>
+std::string base_blob<BITS>::GetHex() const
+{
+    char psz[sizeof(data) * 2 + 1];
+    for (unsigned int i = 0; i < sizeof(data); i++)
+        sprintf(psz + i * 2, "%02x", data[sizeof(data) - i - 1]);
+    return std::string(psz, psz + sizeof(data) * 2);
+}
+
+template <unsigned int BITS>
+void base_blob<BITS>::SetHex(const char* psz)
+{
+    memset(data, 0, sizeof(data));
+
+    // skip leading spaces (cast first: <ctype.h> functions are undefined for negative char values)
+    while (isspace(static_cast<unsigned char>(*psz)))
+        psz++;
+
+    // skip 0x
+    if (psz[0] == '0' && tolower(static_cast<unsigned char>(psz[1])) == 'x')
+        psz += 2;
+
+    // hex string to uint: find the end of the digit run, then consume it last-to-first so the final digit pair lands in data[0]
+    const char* pbegin = psz;
+    while (::HexDigit(*psz) != -1)
+        psz++;
+    psz--;
+    unsigned char* p1 = (unsigned char*)data;
+    unsigned char* pend = p1 + WIDTH;
+    while (psz >= pbegin && p1 < pend) {
+        *p1 = ::HexDigit(*psz--);
+        if (psz >= pbegin) {
+            *p1 |= ((unsigned char)::HexDigit(*psz--) << 4);
+            p1++;
+        }
+    }
+}
+
+template <unsigned int BITS>
+void base_blob<BITS>::SetHex(const std::string& str)
+{
+    SetHex(str.c_str());
+}
+
+template <unsigned int BITS>
+std::string base_blob<BITS>::ToString() const
+{
+    return (GetHex());
+}
+
+// Explicit instantiations for base_blob<160>
+template base_blob<160>::base_blob(const std::vector<unsigned char>&);
+template std::string base_blob<160>::GetHex() const;
+template std::string base_blob<160>::ToString() const;
+template void base_blob<160>::SetHex(const char*);
+template void base_blob<160>::SetHex(const std::string&);
+
+// Explicit instantiations for base_blob<256>
+template base_blob<256>::base_blob(const std::vector<unsigned char>&);
+template std::string base_blob<256>::GetHex() const;
+template std::string base_blob<256>::ToString() const;
+template void base_blob<256>::SetHex(const char*);
+template void base_blob<256>::SetHex(const std::string&);
+
+static void inline HashMix(uint32_t& a, uint32_t& b, uint32_t& c)
+{
+    // Taken from lookup3, by Bob Jenkins.
+    a -= c;
+    a ^= ((c << 4) | (c >> 28));
+    c += b;
+    b -= a;
+    b ^= ((a << 6) | (a >> 26));
+    a += c;
+    c -= b;
+    c ^= ((b << 8) | (b >> 24));
+    b += a;
+    a -= c;
+    a ^= ((c << 16) | (c >> 16));
+    c += b;
+    b -= a;
+    b ^= ((a << 19) | (a >> 13));
+    a += c;
+    c -= b;
+    c ^= ((b << 4) | (b >> 28));
+    b += a;
+}
+
+static void inline HashFinal(uint32_t& a, uint32_t& b, uint32_t& c)
+{
+    // Taken from lookup3, by Bob Jenkins.
+    c ^= b;
+    c -= ((b << 14) | (b >> 18));
+    a ^= c;
+    a -= ((c << 11) | (c >> 21));
+    b ^= a;
+    b -= ((a << 25) | (a >> 7));
+    c ^= b;
+    c -= ((b << 16) | (b >> 16));
+    a ^= c;
+    a -= ((c << 4) | (c >> 28));
+    b ^= a;
+    b -= ((a << 14) | (a >> 18));
+    c ^= b;
+    c -= ((b << 24) | (b >> 8));
+}
+
+uint64_t uint256::GetHash(const uint256& salt) const
+{
+    uint32_t a, b, c;
+    const uint32_t *pn = (const uint32_t*)data;
+    const uint32_t *salt_pn = (const uint32_t*)salt.data;
+    a = b = c = 0xdeadbeef + WIDTH;
+
+    a += pn[0] ^ salt_pn[0];
+    b += pn[1] ^ salt_pn[1];
+    c += pn[2] ^ salt_pn[2];
+    HashMix(a, b, c);
+    a += pn[3] ^ salt_pn[3];
+    b += pn[4] ^ salt_pn[4];
+    c += pn[5] ^ salt_pn[5];
+    HashMix(a, b, c);
+    a += pn[6] ^ salt_pn[6];
+    b += pn[7] ^ salt_pn[7];
+    HashFinal(a, b, c);
+
+    return ((((uint64_t)b) << 32) | c);
+}
--- a/crypto/uint256.h
+++ b/crypto/uint256.h
@ -0,0 +1,164 @@
+// Copyright (c) 2009-2010 Satoshi Nakamoto
+// Copyright (c) 2009-2014 The Bitcoin Core developers
+// Distributed under the MIT software license, see the accompanying
+// file COPYING or http://www.opensource.org/licenses/mit-license.php.
+
+#ifndef BITCOIN_UINT256_H
+#define BITCOIN_UINT256_H
+
+#include <assert.h>
+#include <cstring>
+#include <stdexcept>
+#include <stdint.h>
+#include <string>
+#include <vector>
+
+#ifdef _MSC_VER
+# define _ALIGN(x) __declspec(align(x))
+#else
+# define _ALIGN(x) __attribute__ ((aligned(x)))
+#endif
+
+/** Template base class for fixed-sized opaque blobs. */
+template<unsigned int BITS>
+class base_blob
+{
+protected:
+    enum { WIDTH=BITS/8 };
+    uint8_t _ALIGN(4) data[WIDTH];
+public:
+    base_blob()
+    {
+        memset(data, 0, sizeof(data));
+    }
+
+    explicit base_blob(const std::vector<unsigned char>& vch);
+
+    bool IsNull() const
+    {
+        for (int i = 0; i < WIDTH; i++)
+            if (data[i] != 0)
+                return false;
+        return true;
+    }
+
+    void SetNull()
+    {
+        memset(data, 0, sizeof(data));
+    }
+
+    friend inline bool operator==(const base_blob& a, const base_blob& b) { return memcmp(a.data, b.data, sizeof(a.data)) == 0; }
+    friend inline bool operator!=(const base_blob& a, const base_blob& b) { return memcmp(a.data, b.data, sizeof(a.data)) != 0; }
+    friend inline bool operator<(const base_blob& a, const base_blob& b) { return memcmp(a.data, b.data, sizeof(a.data)) < 0; }
+
+    std::string GetHex() const;
+    void SetHex(const char* psz);
+    void SetHex(const std::string& str);
+    std::string ToString() const;
+
+    unsigned char* begin()
+    {
+        return &data[0];
+    }
+
+    unsigned char* end()
+    {
+        return &data[WIDTH];
+    }
+
+    const unsigned char* begin() const
+    {
+        return &data[0];
+    }
+
+    const unsigned char* end() const
+    {
+        return &data[WIDTH];
+    }
+
+    unsigned int size() const
+    {
+        return sizeof(data);
+    }
+
+    unsigned int GetSerializeSize(int nType, int nVersion) const
+    {
+        return sizeof(data);
+    }
+
+    template<typename Stream>
+    void Serialize(Stream& s, int nType, int nVersion) const
+    {
+        s.write((char*)data, sizeof(data));
+    }
+
+    template<typename Stream>
+    void Unserialize(Stream& s, int nType, int nVersion)
+    {
+        s.read((char*)data, sizeof(data));
+    }
+};
+
+/** 160-bit opaque blob.
+ * @note This type is called uint160 for historical reasons only. It is an opaque
+ * blob of 160 bits and has no integer operations.
+ */
+class uint160 : public base_blob<160> {
+public:
+    uint160() {}
+    uint160(const base_blob<160>& b) : base_blob<160>(b) {}
+    explicit uint160(const std::vector<unsigned char>& vch) : base_blob<160>(vch) {}
+};
+
+/** 256-bit opaque blob.
+ * @note This type is called uint256 for historical reasons only. It is an
+ * opaque blob of 256 bits and has no integer operations. Use arith_uint256 if
+ * those are required.
+ */
+class uint256 : public base_blob<256> {
+public:
+    uint256() {}
+    uint256(const base_blob<256>& b) : base_blob<256>(b) {}
+    explicit uint256(const std::vector<unsigned char>& vch) : base_blob<256>(vch) {}
+
+    /** A cheap hash function that just returns 64 bits from the result, it can be
+     * used when the contents are considered uniformly random. It is not appropriate
+     * when the value can easily be influenced from outside as e.g. a network adversary could
+     * provide values to trigger worst-case behavior.
+     * @note The result of this function is not stable between little and big endian.
+     */
+    uint64_t GetCheapHash() const
+    {
+        uint64_t result;
+        memcpy((void*)&result, (void*)data, 8);
+        return result;
+    }
+
+    /** A more secure, salted hash function.
+     * @note This hash is not stable between little and big endian.
+     */
+    uint64_t GetHash(const uint256& salt) const;
+};
+
+/* uint256 from const char *.
+ * This is a separate function because the constructor uint256(const char*) can result
+ * in dangerously catching uint256(0).
+ */
+inline uint256 uint256S(const char *str)
+{
+    uint256 rv;
+    rv.SetHex(str);
+    return rv;
+}
+/* uint256 from std::string.
+ * This is a separate function because the constructor uint256(const std::string &str) can result
+ * in dangerously catching uint256(0) via std::string(const char*).
+ */
+inline uint256 uint256S(const std::string& str)
+{
+    uint256 rv;
+    rv.SetHex(str);
+    return rv;
+}
+
+#endif // BITCOIN_UINT256_H
--- a/crypto/utilstrencodings.cpp
+++ b/crypto/utilstrencodings.cpp
@ -0,0 +1,499 @@
+// Copyright (c) 2009-2010 Satoshi Nakamoto
+// Copyright (c) 2009-2014 The Bitcoin Core developers
+// Distributed under the MIT software license, see the accompanying
+// file COPYING or http://www.opensource.org/licenses/mit-license.php.
+
+#include "utilstrencodings.h"
+
+#include "tinyformat.h"
+
+#include <cstdlib>
+#include <cstring>
+#include <errno.h>
+#include <limits>
+
+using namespace std;
+
+string SanitizeString(const string& str)
+{
+    /**
+     * safeChars chosen to allow simple messages/URLs/email addresses, but avoid anything
+     * even possibly remotely dangerous like & or >
+     */
+    static string safeChars("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ01234567890 .,;_/:?@()");
+    string strResult;
+    for (std::string::size_type i = 0; i < str.size(); i++)
+    {
+        if (safeChars.find(str[i]) != std::string::npos)
+            strResult.push_back(str[i]);
+    }
+    return strResult;
+}
+
+const signed char p_util_hexdigit[256] =
+{ -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
+  -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
+  -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
+  0,1,2,3,4,5,6,7,8,9,-1,-1,-1,-1,-1,-1,
+  -1,0xa,0xb,0xc,0xd,0xe,0xf,-1,-1,-1,-1,-1,-1,-1,-1,-1,
+  -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
+  -1,0xa,0xb,0xc,0xd,0xe,0xf,-1,-1,-1,-1,-1,-1,-1,-1,-1,
+  -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
+  -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
+  -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
+  -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
+  -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
+  -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
+  -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
+  -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
+  -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1, };
+
+signed char HexDigit(char c)
+{
+    return p_util_hexdigit[(unsigned char)c];
+}
+
+bool IsHex(const string& str)
+{
+    for(std::string::const_iterator it(str.begin()); it != str.end(); ++it)
+    {
+        if (HexDigit(*it) < 0)
+            return false;
+    }
+    return (str.size() > 0) && (str.size()%2 == 0);
+}
+
+vector<unsigned char> ParseHex(const char* psz)
+{
+    // convert hex dump to vector: whitespace before each byte pair is skipped; parsing stops at the first non-hex character
+    vector<unsigned char> vch;
+    while (true)
+    {
+        while (isspace(static_cast<unsigned char>(*psz))) // cast: isspace is undefined for negative char values
+            psz++;
+        signed char c = HexDigit(*psz++);
+        if (c == (signed char)-1)
+            break;
+        unsigned char n = (c << 4);
+        c = HexDigit(*psz++);
+        if (c == (signed char)-1)
+            break;
+        n |= c;
+        vch.push_back(n);
+    }
+    return vch;
+}
+
+vector<unsigned char> ParseHex(const string& str)
+{
+    return ParseHex(str.c_str());
+}
+
+string EncodeBase64(const unsigned char* pch, size_t len)
+{
+    static const char *pbase64 = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
+
+    string strRet="";
+    strRet.reserve((len+2)/3*4);
+
+    int mode=0, left=0;
+    const unsigned char *pchEnd = pch+len;
+
+    while (pch<pchEnd)
+    {
+        int enc = *(pch++);
+        switch (mode)
+        {
+            case 0: // we have no bits
+                strRet += pbase64[enc >> 2];
+                left = (enc & 3) << 4;
+                mode = 1;
+                break;
+
+            case 1: // we have two bits
+                strRet += pbase64[left | (enc >> 4)];
+                left = (enc & 15) << 2;
+                mode = 2;
+                break;
+
+            case 2: // we have four bits
+                strRet += pbase64[left | (enc >> 6)];
+                strRet += pbase64[enc & 63];
+                mode = 0;
+                break;
+        }
+    }
+
+    if (mode)
+    {
+        strRet += pbase64[left];
+        strRet += '=';
+        if (mode == 1)
+            strRet += '=';
+    }
+
+    return strRet;
+}
+
+string EncodeBase64(const string& str)
+{
+    return EncodeBase64((const unsigned char*)str.c_str(), str.size());
+}
+
+vector<unsigned char> DecodeBase64(const char* p, bool* pfInvalid)
+{
+    static const int decode64_table[256] =
+    {
+        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+        -1, -1, -1, 62, -1, -1, -1, 63, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, -1, -1,
+        -1, -1, -1, -1, -1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14,
+        15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1, -1, 26, 27, 28,
+        29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48,
+        49, 50, 51, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
+    };
+
+    if (pfInvalid)
+        *pfInvalid = false;
+
+    vector<unsigned char> vchRet;
+    vchRet.reserve(strlen(p)*3/4);
+
+    int mode = 0;
+    int left = 0;
+
+    while (1)
+    {
+         int dec = decode64_table[(unsigned char)*p];
+         if (dec == -1) break;
+         p++;
+         switch (mode)
+         {
+             case 0: // we have no bits and get 6
+                 left = dec;
+                 mode = 1;
+                 break;
+
+              case 1: // we have 6 bits and keep 4
+                  vchRet.push_back((left<<2) | (dec>>4));
+                  left = dec & 15;
+                  mode = 2;
+                  break;
+
+             case 2: // we have 4 bits and get 6, we keep 2
+                 vchRet.push_back((left<<4) | (dec>>2));
+                 left = dec & 3;
+                 mode = 3;
+                 break;
+
+             case 3: // we have 2 bits and get 6
+                 vchRet.push_back((left<<6) | dec);
+                 mode = 0;
+                 break;
+         }
+    }
+
+    if (pfInvalid)
+        switch (mode)
+        {
+            case 0: // 4n base64 characters processed: ok
+                break;
+
+            case 1: // 4n+1 base64 character processed: impossible
+                *pfInvalid = true;
+                break;
+
+            case 2: // 4n+2 base64 characters processed: require '=='
+                if (left || p[0] != '=' || p[1] != '=' || decode64_table[(unsigned char)p[2]] != -1)
+                    *pfInvalid = true;
+                break;
+
+            case 3: // 4n+3 base64 characters processed: require '='
+                if (left || p[0] != '=' || decode64_table[(unsigned char)p[1]] != -1)
+                    *pfInvalid = true;
+                break;
+        }
+
+    return vchRet;
+}
+
+string DecodeBase64(const string& str)
+{
+    vector<unsigned char> vchRet = DecodeBase64(str.c_str());
+    return (vchRet.size() == 0) ? string() : string((const char*)&vchRet[0], vchRet.size());
+}
+
+string EncodeBase32(const unsigned char* pch, size_t len)
+{
+    static const char *pbase32 = "abcdefghijklmnopqrstuvwxyz234567";
+
+    string strRet="";
+    strRet.reserve((len+4)/5*8);
+
+    int mode=0, left=0;
+    const unsigned char *pchEnd = pch+len;
+
+    while (pch<pchEnd)
+    {
+        int enc = *(pch++);
+        switch (mode)
+        {
+            case 0: // we have no bits
+                strRet += pbase32[enc >> 3];
+                left = (enc & 7) << 2;
+                mode = 1;
+                break;
+
+            case 1: // we have three bits
+                strRet += pbase32[left | (enc >> 6)];
+                strRet += pbase32[(enc >> 1) & 31];
+                left = (enc & 1) << 4;
+                mode = 2;
+                break;
+
+            case 2: // we have one bit
+                strRet += pbase32[left | (enc >> 4)];
+                left = (enc & 15) << 1;
+                mode = 3;
+                break;
+
+            case 3: // we have four bits
+                strRet += pbase32[left | (enc >> 7)];
+                strRet += pbase32[(enc >> 2) & 31];
+                left = (enc & 3) << 3;
+                mode = 4;
+                break;
+
+            case 4: // we have two bits
+                strRet += pbase32[left | (enc >> 5)];
+                strRet += pbase32[enc & 31];
+                mode = 0;
+        }
+    }
+
+    static const int nPadding[5] = {0, 6, 4, 3, 1};
+    if (mode)
+    {
+        strRet += pbase32[left];
+        for (int n=0; n<nPadding[mode]; n++)
+             strRet += '=';
+    }
+
+    return strRet;
+}
+
+string EncodeBase32(const string& str)
+{
+    return EncodeBase32((const unsigned char*)str.c_str(), str.size());
+}
+
+vector<unsigned char> DecodeBase32(const char* p, bool* pfInvalid)
+{
+    static const int decode32_table[256] =
+    {
+        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 26, 27, 28, 29, 30, 31, -1, -1, -1, -1,
+        -1, -1, -1, -1, -1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14,
+        15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1, -1,  0,  1,  2,
+         3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22,
+        23, 24, 25, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
+    };
+
+    if (pfInvalid)
+        *pfInvalid = false;
+
+    vector<unsigned char> vchRet;
+    vchRet.reserve((strlen(p))*5/8);
+
+    int mode = 0;
+    int left = 0;
+
+    while (1)
+    {
+         int dec = decode32_table[(unsigned char)*p];
+         if (dec == -1) break;
+         p++;
+         switch (mode)
+         {
+             case 0: // we have no bits and get 5
+                 left = dec;
+                 mode = 1;
+                 break;
+
+              case 1: // we have 5 bits and keep 2
+                  vchRet.push_back((left<<3) | (dec>>2));
+                  left = dec & 3;
+                  mode = 2;
+                  break;
+
+             case 2: // we have 2 bits and keep 7
+                 left = left << 5 | dec;
+                 mode = 3;
+                 break;
+
+             case 3: // we have 7 bits and keep 4
+                 vchRet.push_back((left<<1) | (dec>>4));
+                 left = dec & 15;
+                 mode = 4;
+                 break;
+
+             case 4: // we have 4 bits, and keep 1
+                 vchRet.push_back((left<<4) | (dec>>1));
+                 left = dec & 1;
+                 mode = 5;
+                 break;
+
+             case 5: // we have 1 bit, and keep 6
+                 left = left << 5 | dec;
+                 mode = 6;
+                 break;
+
+             case 6: // we have 6 bits, and keep 3
+                 vchRet.push_back((left<<2) | (dec>>3));
+                 left = dec & 7;
+                 mode = 7;
+                 break;
+
+             case 7: // we have 3 bits, and keep 0
+                 vchRet.push_back((left<<5) | dec);
+                 mode = 0;
+                 break;
+         }
+    }
+
+    if (pfInvalid)
+        switch (mode)
+        {
+            case 0: // 8n base32 characters processed: ok
+                break;
+
+            case 1: // 8n+1 base32 characters processed: impossible
+            case 3: //   +3
+            case 6: //   +6
+                *pfInvalid = true;
+                break;
+
+            case 2: // 8n+2 base32 characters processed: require '======'
+                if (left || p[0] != '=' || p[1] != '=' || p[2] != '=' || p[3] != '=' || p[4] != '=' || p[5] != '=' || decode32_table[(unsigned char)p[6]] != -1)
+                    *pfInvalid = true;
+                break;
+
+            case 4: // 8n+4 base32 characters processed: require '===='
+                if (left || p[0] != '=' || p[1] != '=' || p[2] != '=' || p[3] != '=' || decode32_table[(unsigned char)p[4]] != -1)
+                    *pfInvalid = true;
+                break;
+
+            case 5: // 8n+5 base32 characters processed: require '==='
+                if (left || p[0] != '=' || p[1] != '=' || p[2] != '=' || decode32_table[(unsigned char)p[3]] != -1)
+                    *pfInvalid = true;
+                break;
+
+            case 7: // 8n+7 base32 characters processed: require '='
+                if (left || p[0] != '=' || decode32_table[(unsigned char)p[1]] != -1)
+                    *pfInvalid = true;
+                break;
+        }
+
+    return vchRet;
+}
+
+string DecodeBase32(const string& str)
+{
+    vector<unsigned char> vchRet = DecodeBase32(str.c_str());
+    return (vchRet.size() == 0) ? string() : string((const char*)&vchRet[0], vchRet.size());
+}
+
+bool ParseInt32(const std::string& str, int32_t *out)
+{
+    char *endp = NULL;
+    errno = 0; // strtol will not set errno if valid
+    long int n = strtol(str.c_str(), &endp, 10);
+    if(out) *out = (int)n;
+    // Note that strtol returns a *long int*, so even if strtol doesn't report an over/underflow
+    // we still have to check that the returned value is within the range of an *int32_t*. On 64-bit
+    // platforms the size of these types may be different.
+    return endp && *endp == 0 && !errno &&
+        n >= std::numeric_limits<int32_t>::min() &&
+        n <= std::numeric_limits<int32_t>::max();
+}
+
+std::string FormatParagraph(const std::string in, size_t width, size_t indent)
+{
+    std::stringstream out;
+    size_t col = 0;
+    size_t ptr = 0;
+    while(ptr < in.size())
+    {
+        // Find beginning of next word
+        ptr = in.find_first_not_of(' ', ptr);
+        if (ptr == std::string::npos)
+            break;
+        // Find end of next word
+        size_t endword = in.find_first_of(' ', ptr);
+        if (endword == std::string::npos)
+            endword = in.size();
+        // Add newline and indentation if this wraps over the allowed width
+        if (col > 0)
+        {
+            if ((col + endword - ptr) > width)
+            {
+                out << '\n';
+                for(size_t i=0; i<indent; ++i)
+                    out << ' ';
+                col = 0;
+            } else
+                out << ' ';
+        }
+        // Append word
+        out << in.substr(ptr, endword - ptr);
+        col += endword - ptr + 1;
+        ptr = endword;
+    }
+    return out.str();
+}
+
+std::string i64tostr(int64_t n)
+{
+    return strprintf("%d", n);
+}
+
+std::string itostr(int n)
+{
+    return strprintf("%d", n);
+}
+
+int64_t atoi64(const char* psz)
+{
+#ifdef _MSC_VER
+    return _atoi64(psz);
+#else
+    return strtoll(psz, NULL, 10);
+#endif
+}
+
+int64_t atoi64(const std::string& str)
+{
+#ifdef _MSC_VER
+    return _atoi64(str.c_str());
+#else
+    return strtoll(str.c_str(), NULL, 10);
+#endif
+}
+
+int atoi(const std::string& str)
+{
+    return atoi(str.c_str());
+}
--- a/crypto/utilstrencodings.h
+++ b/crypto/utilstrencodings.h
@ -0,0 +1,98 @@
+// Copyright (c) 2009-2010 Satoshi Nakamoto
+// Copyright (c) 2009-2014 The Bitcoin Core developers
+// Distributed under the MIT software license, see the accompanying
+// file COPYING or http://www.opensource.org/licenses/mit-license.php.
+
+/**
+ * Utilities for converting data from/to strings.
+ */
+#ifndef BITCOIN_UTILSTRENCODINGS_H
+#define BITCOIN_UTILSTRENCODINGS_H
+
+#include <stdint.h>
+#include <string>
+#include <vector>
+
+// Raw byte-range helpers for an lvalue `a`:
+//   BEGIN/UBEGIN - address of `a` viewed as char* / unsigned char*
+//   END/UEND     - address one element past `a` (i.e. &a + 1), same pointer types
+//   ARRAYLEN     - element count of a true array (not valid for pointers)
+#define BEGIN(a)            ((char*)&(a))
+#define END(a)              ((char*)&((&(a))[1]))
+#define UBEGIN(a)           ((unsigned char*)&(a))
+#define UEND(a)             ((unsigned char*)&((&(a))[1]))
+#define ARRAYLEN(array)     (sizeof(array)/sizeof((array)[0]))
+
+/** This is needed because the foreach macro can't get over the comma in pair<t1, t2> */
+#define PAIRTYPE(t1, t2)    std::pair<t1, t2>
+
+// String sanitising and hex/base64/base32 codecs.
+// NOTE(review): the definitions of these functions are outside this excerpt; the
+// pfInvalid out-parameters presumably report malformed input - confirm in the .cpp.
+std::string SanitizeString(const std::string& str);
+std::vector<unsigned char> ParseHex(const char* psz);
+std::vector<unsigned char> ParseHex(const std::string& str);
+signed char HexDigit(char c);
+bool IsHex(const std::string& str);
+std::vector<unsigned char> DecodeBase64(const char* p, bool* pfInvalid = NULL);
+std::string DecodeBase64(const std::string& str);
+std::string EncodeBase64(const unsigned char* pch, size_t len);
+std::string EncodeBase64(const std::string& str);
+std::vector<unsigned char> DecodeBase32(const char* p, bool* pfInvalid = NULL);
+std::string DecodeBase32(const std::string& str);
+std::string EncodeBase32(const unsigned char* pch, size_t len);
+std::string EncodeBase32(const std::string& str);
+
+// Integer <-> string helpers. The to-string functions format with "%d"; the
+// atoi-style parsers wrap strtoll/_atoi64/atoi and give no error feedback
+// (use ParseInt32 below when strict validation is needed).
+std::string i64tostr(int64_t n);
+std::string itostr(int n);
+int64_t atoi64(const char* psz);
+int64_t atoi64(const std::string& str);
+int atoi(const std::string& str);
+
+/**
+ * Convert string to signed 32-bit integer with strict parse error feedback.
+ * @returns true if the entire string could be parsed as valid integer,
+ *   false if not the entire string could be parsed or when overflow or underflow occurred.
+ */
+bool ParseInt32(const std::string& str, int32_t *out);
+
+/**
+ * Hex-encode the bytes in [itbegin, itend) with lowercase digits.
+ * When fSpaces is true, a single space separates consecutive bytes.
+ */
+template<typename T>
+std::string HexStr(const T itbegin, const T itend, bool fSpaces=false)
+{
+    static const char digits[] = "0123456789abcdef";
+    std::string encoded;
+    // Worst case is two digits plus one separator per byte
+    encoded.reserve((itend-itbegin)*3);
+    T cursor = itbegin;
+    while (cursor < itend)
+    {
+        const unsigned char byte = (unsigned char)(*cursor);
+        if (fSpaces && cursor != itbegin)
+            encoded += ' ';
+        encoded += digits[byte >> 4];
+        encoded += digits[byte & 15];
+        ++cursor;
+    }
+    return encoded;
+}
+
+/** Container convenience form: hex-encodes everything from vch.begin() to vch.end(). */
+template<typename T>
+inline std::string HexStr(const T& vch, bool fSpaces=false)
+{
+    std::string encoded = HexStr(vch.begin(), vch.end(), fSpaces);
+    return encoded;
+}
+
+/**
+ * Format a paragraph of text to a fixed width, adding spaces for
+ * indentation to any added line.
+ *
+ * Words are split on spaces only; runs of spaces collapse to one separator.
+ * The first line is not indented, and the indentation itself is not counted
+ * against `width`.
+ */
+std::string FormatParagraph(const std::string in, size_t width=79, size_t indent=0);
+
+/**
+ * Comparison intended to resist timing attacks: the loop always walks the
+ * whole of `a` and folds every difference into one accumulator rather than
+ * returning at the first mismatch, so the work is proportional to the
+ * length of the first argument.
+ */
+template <typename T>
+bool TimingResistantEqual(const T& a, const T& b)
+{
+    const size_t lenA = a.size();
+    const size_t lenB = b.size();
+    // An empty `b` cannot be indexed below; it only matches an empty `a`
+    if (lenB == 0)
+        return lenA == 0;
+    // Seed with the length difference so unequal sizes never compare equal
+    size_t diff = lenA ^ lenB;
+    for (size_t i = 0; i != lenA; ++i)
+        diff |= a[i] ^ b[i % lenB];
+    return diff == 0;
+}
+
+#endif // BITCOIN_UTILSTRENCODINGS_H
--- a/crypto/verus_clhash.cpp
+++ b/crypto/verus_clhash.cpp
@ -0,0 +1,355 @@
+/*
+ * This uses variations of the clhash algorithm for Verus Coin, licensed
+ * with the Apache-2.0 open source license.
+ * 
+ * Copyright (c) 2018 Michael Toutonghi
+ * Distributed under the Apache 2.0 software license, available in the original form for clhash
+ * here: https://github.com/lemire/clhash/commit/934da700a2a54d8202929a826e2763831bd43cf7#diff-9879d6db96fd29134fc802214163b95a
+ * 
+ * Original CLHash code and any portions herein, (C) 2017, 2018 Daniel Lemire and Owen Kaser
+ * Faster 64-bit universal hashing
+ * using carry-less multiplications, Journal of Cryptographic Engineering (to appear)
+ *
+ * Best used on recent x64 processors (Haswell or better).
+ * 
+ * This implements an intermediate step in the last part of a Verus block hash. The intent of this step
+ * is to more effectively equalize FPGAs over GPUs and CPUs.
+ *
+ **/
+
+
+#include "verus_hash.h"
+
+#include <boost/thread.hpp>
+
+#include <assert.h>
+#include <string.h>
+#include <x86intrin.h>
+
+// Windows shim: maps posix_memalign onto _aligned_malloc, yielding 0 on success and errno otherwise.
+// NOTE(review): this guard tests __WIN32 while the rest of this file tests _WIN32, and the same
+// macro is defined again further down - confirm which spelling the target toolchains define.
+#ifdef __WIN32
+#define posix_memalign(p, a, s) (((*(p)) = _aligned_malloc((s), (a))), *(p) ?0 :errno)
+#endif
+
+// Per-thread storage slots: verusclhasher_key holds the hashing key buffer and
+// verusclhasher_descr holds the verusclhash_descr describing it.
+thread_local thread_specific_ptr verusclhasher_key;
+thread_local thread_specific_ptr verusclhasher_descr;
+
+#ifdef _WIN32
+// Windows-only destructor. Per the original author's note, mingw/gcc can hand this
+// destructor a garbage `this` pointer, so it never touches `this`; instead it
+// releases both known thread-local slots directly (a quick hack rather than a registry).
+thread_specific_ptr::~thread_specific_ptr() {
+    if (verusclhasher_key.ptr != NULL)
+        verusclhasher_key.reset();
+    if (verusclhasher_descr.ptr != NULL)
+        verusclhasher_descr.reset();
+}
+#endif
+
+// Cached CPU capability flag. The initial 0x80 means "not detected yet":
+// IsCPUVerusOptimized() runs detection while bit 0x80 is set, then stores 0 or 1.
+int __cpuverusoptimized = 0x80;
+
+// Carry-less product of `length` and `keylength` with no modular reduction.
+// Both values are packed into one vector (keylength high, length low) and
+// selector 0x10 multiplies the low qword of the first operand by the high
+// qword of the second.
+static inline __m128i lazyLengthHash(uint64_t keylength, uint64_t length) {
+    const __m128i packed = _mm_set_epi64x(keylength, length);
+    return _mm_clmulepi64_si128(packed, packed, 0x10);
+}
+
+// Modular reduction of a 128-bit carry-less product down to 64 bits, using the
+// polynomial terms (4,3,1,0) and a 16-entry byte table for the bits that
+// overflow the first fold. Only the LOW 64 bits of the result are meaningful;
+// see precompReduction64 for the extracted value.
+static inline __m128i precompReduction64_si128( __m128i A) {
+
+    //const __m128i C = _mm_set_epi64x(1U,(1U<<4)+(1U<<3)+(1U<<1)+(1U<<0)); // C is the irreducible poly. (64,4,3,1,0)
+    const __m128i poly = _mm_cvtsi64_si128((1U<<4)+(1U<<3)+(1U<<1)+(1U<<0));
+    // fold the high qword of A through the polynomial
+    const __m128i folded = _mm_clmulepi64_si128( A, poly, 0x01);
+    // table lookup driven by the high half of the folded product
+    const __m128i table = _mm_setr_epi8(0, 27, 54, 45, 108, 119, 90, 65, (char)216, (char)195, (char)238, (char)245, (char)180, (char)175, (char)130, (char)153);
+    const __m128i overflowFix = _mm_shuffle_epi8(table, _mm_srli_si128(folded,8));
+    const __m128i combined = _mm_xor_si128(folded,A);
+    return _mm_xor_si128(overflowFix,combined);/// WARNING: HIGH 64 BITS CONTAIN GARBAGE
+}
+
+// Scalar wrapper: reduces A and returns the meaningful low 64 bits.
+static inline uint64_t precompReduction64( __m128i A) {
+    const __m128i reduced = precompReduction64_si128(A);
+    return _mm_cvtsi128_si64(reduced);
+}
+
+// verus intermediate hash extra
+//
+// Runs 32 data-dependent mixing rounds over a 64-byte input and a mutable key buffer and
+// returns the 128-bit accumulator (no modular reduction is applied here).
+//
+//   randomsource - key buffer, read AND written: two entries are mutated/swapped per round
+//   buf          - the input, addressed as four 128-bit lanes
+//   keyMask      - key mask in bytes; shifted right by 4 below to index __m128i entries
+//
+// Each round takes the low 64 bits of the accumulator as `selector`:
+//   bits 0-1  pick the starting lane in buf
+//   bits 2-4  pick one of the eight mixing variants in the switch
+//   bits 5+   (masked) pick key entry `prand`; bits 32+ (masked) pick key entry `prandex`
+// The expression `pbuf - (((selector & 1) << 1) - 1)` addresses the neighbouring lane:
+// pbuf + 1 when selector is even, pbuf - 1 when it is odd.
+// NOTE(review): AES2/MIX2 are macros defined outside this excerpt; they appear to use the
+// local names `rc` and `tmp` implicitly - confirm before renaming either.
+static __m128i __verusclmulwithoutreduction64alignedrepeat(__m128i *randomsource, const __m128i buf[4], uint64_t keyMask)
+{
+    __m128i const *pbuf;
+
+    // divide key mask by 16 from bytes to __m128i
+    keyMask >>= 4;
+
+    // the random buffer must have at least 32 16 byte dwords after the keymask to work with this
+    // algorithm. we take the value from the last element inside the keyMask + 2, as that will never
+    // be used to xor into the accumulator before it is hashed with other values first
+    __m128i acc = _mm_load_si128(randomsource + (keyMask + 2));
+
+    for (int64_t i = 0; i < 32; i++)
+    {
+        const uint64_t selector = _mm_cvtsi128_si64(acc);
+
+        // get two random locations in the key, which will be mutated and swapped
+        __m128i *prand = randomsource + ((selector >> 5) & keyMask);
+        __m128i *prandex = randomsource + ((selector >> 32) & keyMask);
+
+        // select random start and order of pbuf processing
+        pbuf = buf + (selector & 3);
+
+        switch (selector & 0x1c)
+        {
+            case 0:
+            {
+                // two carry-less multiply steps: prandex with the neighbour lane, then prand with pbuf
+                const __m128i temp1 = _mm_load_si128(prandex);
+                const __m128i temp2 = _mm_load_si128(pbuf - (((selector & 1) << 1) - 1));
+                const __m128i add1 = _mm_xor_si128(temp1, temp2);
+                const __m128i clprod1 = _mm_clmulepi64_si128(add1, add1, 0x10);
+                acc = _mm_xor_si128(clprod1, acc);
+
+                const __m128i tempa1 = _mm_mulhrs_epi16(acc, temp1);
+                const __m128i tempa2 = _mm_xor_si128(tempa1, temp1);
+
+                const __m128i temp12 = _mm_load_si128(prand);
+                _mm_store_si128(prand, tempa2);
+
+                const __m128i temp22 = _mm_load_si128(pbuf);
+                const __m128i add12 = _mm_xor_si128(temp12, temp22);
+                const __m128i clprod12 = _mm_clmulepi64_si128(add12, add12, 0x10);
+                acc = _mm_xor_si128(clprod12, acc);
+
+                const __m128i tempb1 = _mm_mulhrs_epi16(acc, temp12);
+                const __m128i tempb2 = _mm_xor_si128(tempb1, temp12);
+                _mm_store_si128(prandex, tempb2);
+                break;
+            }
+            case 4:
+            {
+                // multiply step on prand/pbuf plus a product of pbuf alone, then a plain xor step
+                const __m128i temp1 = _mm_load_si128(prand);
+                const __m128i temp2 = _mm_load_si128(pbuf);
+                const __m128i add1 = _mm_xor_si128(temp1, temp2);
+                const __m128i clprod1 = _mm_clmulepi64_si128(add1, add1, 0x10);
+                acc = _mm_xor_si128(clprod1, acc);
+                const __m128i clprod2 = _mm_clmulepi64_si128(temp2, temp2, 0x10);
+                acc = _mm_xor_si128(clprod2, acc);
+
+                const __m128i tempa1 = _mm_mulhrs_epi16(acc, temp1);
+                const __m128i tempa2 = _mm_xor_si128(tempa1, temp1);
+
+                const __m128i temp12 = _mm_load_si128(prandex);
+                _mm_store_si128(prandex, tempa2);
+
+                const __m128i temp22 = _mm_load_si128(pbuf - (((selector & 1) << 1) - 1));
+                const __m128i add12 = _mm_xor_si128(temp12, temp22);
+                acc = _mm_xor_si128(add12, acc);
+
+                const __m128i tempb1 = _mm_mulhrs_epi16(acc, temp12);
+                const __m128i tempb2 = _mm_xor_si128(tempb1, temp12);
+                _mm_store_si128(prand, tempb2);
+                break;
+            }
+            case 8:
+            {
+                // plain xor step first, then the multiply steps on the neighbour lane
+                const __m128i temp1 = _mm_load_si128(prandex);
+                const __m128i temp2 = _mm_load_si128(pbuf);
+                const __m128i add1 = _mm_xor_si128(temp1, temp2);
+                acc = _mm_xor_si128(add1, acc);
+
+                const __m128i tempa1 = _mm_mulhrs_epi16(acc, temp1);
+                const __m128i tempa2 = _mm_xor_si128(tempa1, temp1);
+
+                const __m128i temp12 = _mm_load_si128(prand);
+                _mm_store_si128(prand, tempa2);
+
+                const __m128i temp22 = _mm_load_si128(pbuf - (((selector & 1) << 1) - 1));
+                const __m128i add12 = _mm_xor_si128(temp12, temp22);
+                const __m128i clprod12 = _mm_clmulepi64_si128(add12, add12, 0x10);
+                acc = _mm_xor_si128(clprod12, acc);
+                const __m128i clprod22 = _mm_clmulepi64_si128(temp22, temp22, 0x10);
+                acc = _mm_xor_si128(clprod22, acc);
+
+                const __m128i tempb1 = _mm_mulhrs_epi16(acc, temp12);
+                const __m128i tempb2 = _mm_xor_si128(tempb1, temp12);
+                _mm_store_si128(prandex, tempb2);
+                break;
+            }
+            case 0xc:
+            {
+                // xor step followed by a signed 64-by-32-bit remainder folded into the accumulator
+                const __m128i temp1 = _mm_load_si128(prand);
+                const __m128i temp2 = _mm_load_si128(pbuf - (((selector & 1) << 1) - 1));
+                const __m128i add1 = _mm_xor_si128(temp1, temp2);
+
+                // cannot be zero here
+                // (this case is only reached when (selector & 0x1c) == 0xc, so bits 2 and 3 are set)
+                const int32_t divisor = (uint32_t)selector;
+
+                acc = _mm_xor_si128(add1, acc);
+
+                const int64_t dividend = _mm_cvtsi128_si64(acc);
+                const __m128i modulo = _mm_cvtsi32_si128(dividend % divisor);
+                acc = _mm_xor_si128(modulo, acc);
+
+                const __m128i tempa1 = _mm_mulhrs_epi16(acc, temp1);
+                const __m128i tempa2 = _mm_xor_si128(tempa1, temp1);
+
+                // the low bit of the dividend decides between a second mixing step and a plain swap
+                if (dividend & 1)
+                {
+                    const __m128i temp12 = _mm_load_si128(prandex);
+                    _mm_store_si128(prandex, tempa2);
+
+                    const __m128i temp22 = _mm_load_si128(pbuf);
+                    const __m128i add12 = _mm_xor_si128(temp12, temp22);
+                    const __m128i clprod12 = _mm_clmulepi64_si128(add12, add12, 0x10);
+                    acc = _mm_xor_si128(clprod12, acc);
+                    const __m128i clprod22 = _mm_clmulepi64_si128(temp22, temp22, 0x10);
+                    acc = _mm_xor_si128(clprod22, acc);
+
+                    const __m128i tempb1 = _mm_mulhrs_epi16(acc, temp12);
+                    const __m128i tempb2 = _mm_xor_si128(tempb1, temp12);
+                    _mm_store_si128(prand, tempb2);
+                }
+                else
+                {
+                    const __m128i tempb3 = _mm_load_si128(prandex);
+                    _mm_store_si128(prandex, tempa2);
+                    _mm_store_si128(prand, tempb3);
+                }
+                break;
+            }
+            case 0x10:
+            {
+                // a few AES operations
+                // NOTE(review): rc and tmp look unused here but are presumably consumed by AES2/MIX2
+                const __m128i *rc = prand;
+                __m128i tmp;
+
+                __m128i temp1 = _mm_load_si128(pbuf - (((selector & 1) << 1) - 1));
+                __m128i temp2 = _mm_load_si128(pbuf);
+
+                AES2(temp1, temp2, 0);
+                MIX2(temp1, temp2);
+
+                AES2(temp1, temp2, 4);
+                MIX2(temp1, temp2);
+
+                AES2(temp1, temp2, 8);
+                MIX2(temp1, temp2);
+
+                acc = _mm_xor_si128(temp2, _mm_xor_si128(temp1, acc));
+
+                const __m128i tempa1 = _mm_load_si128(prand);
+                const __m128i tempa2 = _mm_mulhrs_epi16(acc, tempa1);
+                const __m128i tempa3 = _mm_xor_si128(tempa1, tempa2);
+
+                // swap: mutated prand value goes to prandex, old prandex value goes to prand
+                const __m128i tempa4 = _mm_load_si128(prandex);
+                _mm_store_si128(prandex, tempa3);
+                _mm_store_si128(prand, tempa4);
+                break;
+            }
+            case 0x14:
+            {
+                // we'll just call this one the monkins loop, inspired by Chris
+                const __m128i *buftmp = pbuf - (((selector & 1) << 1) - 1);
+                __m128i tmp; // used by MIX2
+
+                uint64_t rounds = selector >> 61; // loop randomly between 1 and 8 times
+                // rc walks forward from prand, reading one key entry per pass
+                __m128i *rc = prand;
+                uint64_t aesroundoffset = 0;
+                __m128i onekey;
+
+                do
+                {
+                    // NOTE(review): 0x10000000 is an int literal, so this shift is evaluated in int and
+                    // leaves int's range once rounds >= 3. Hash results presumably depend on what the
+                    // reference compilers produce here - confirm before "fixing" the type.
+                    if (selector & (0x10000000 << rounds))
+                    {
+                        onekey = _mm_load_si128(rc++);
+                        const __m128i temp2 = _mm_load_si128(rounds & 1 ? pbuf : buftmp);
+                        const __m128i add1 = _mm_xor_si128(onekey, temp2);
+                        const __m128i clprod1 = _mm_clmulepi64_si128(add1, add1, 0x10);
+                        acc = _mm_xor_si128(clprod1, acc);
+                    }
+                    else
+                    {
+                        onekey = _mm_load_si128(rc++);
+                        __m128i temp2 = _mm_load_si128(rounds & 1 ? buftmp : pbuf);
+                        AES2(onekey, temp2, aesroundoffset);
+                        aesroundoffset += 4;
+                        MIX2(onekey, temp2);
+                        acc = _mm_xor_si128(onekey, acc);
+                        acc = _mm_xor_si128(temp2, acc);
+                    }
+                } while (rounds--);
+
+                const __m128i tempa1 = _mm_load_si128(prand);
+                const __m128i tempa2 = _mm_mulhrs_epi16(acc, tempa1);
+                const __m128i tempa3 = _mm_xor_si128(tempa1, tempa2);
+
+                const __m128i tempa4 = _mm_load_si128(prandex);
+                _mm_store_si128(prandex, tempa3);
+                _mm_store_si128(prand, tempa4);
+                break;
+            }
+            case 0x18:
+            {
+                // single multiply step on the neighbour lane and prand, then swap the key entries
+                const __m128i temp1 = _mm_load_si128(pbuf - (((selector & 1) << 1) - 1));
+                const __m128i temp2 = _mm_load_si128(prand);
+                const __m128i add1 = _mm_xor_si128(temp1, temp2);
+                const __m128i clprod1 = _mm_clmulepi64_si128(add1, add1, 0x10);
+                acc = _mm_xor_si128(clprod1, acc);
+
+                const __m128i tempa1 = _mm_mulhrs_epi16(acc, temp2);
+                const __m128i tempa2 = _mm_xor_si128(tempa1, temp2);
+
+                const __m128i tempb3 = _mm_load_si128(prandex);
+                _mm_store_si128(prandex, tempa2);
+                _mm_store_si128(prand, tempb3);
+                break;
+            }
+            case 0x1c:
+            {
+                // multiply step on pbuf and prandex, then the old prand value is xored straight into acc
+                const __m128i temp1 = _mm_load_si128(pbuf);
+                const __m128i temp2 = _mm_load_si128(prandex);
+                const __m128i add1 = _mm_xor_si128(temp1, temp2);
+                const __m128i clprod1 = _mm_clmulepi64_si128(add1, add1, 0x10);
+                acc = _mm_xor_si128(clprod1, acc);
+
+                const __m128i tempa1 = _mm_mulhrs_epi16(acc, temp2);
+                const __m128i tempa2 = _mm_xor_si128(tempa1, temp2);
+
+                const __m128i tempa3 = _mm_load_si128(prand);
+                _mm_store_si128(prand, tempa2);
+
+                acc = _mm_xor_si128(tempa3, acc);
+
+                const __m128i tempb1 = _mm_mulhrs_epi16(acc, tempa3);
+                const __m128i tempb2 = _mm_xor_si128(tempb1, tempa3);
+                _mm_store_si128(prandex, tempb2);
+                break;
+            }
+        }
+    }
+    return acc;
+}
+
+// Hashes exactly 64 bytes to a 64-bit value: runs the 32-round carry-less mixing
+// routine over `buf` and the key at `random` (which the routine mutates), folds in
+// the fixed length term lazyLengthHash(1024, 64), then reduces the result to 64 bits.
+uint64_t verusclhash(void * random, const unsigned char buf[64], uint64_t keyMask) {
+    __m128i *keyVectors = (__m128i *)random;
+    const __m128i *blockVectors = (const __m128i *)buf;
+    const __m128i mixed = __verusclmulwithoutreduction64alignedrepeat(keyVectors, blockVectors, keyMask);
+    const __m128i withLength = _mm_xor_si128(mixed, lazyLengthHash(1024, 64));
+    return precompReduction64(withLength);
+}
+
+#ifdef __WIN32
+#define posix_memalign(p, a, s) (((*(p)) = _aligned_malloc((s), (a))), *(p) ?0 :errno)
+#endif
+
+// Allocates bufSize bytes aligned to sizeof(__m256i) through posix_memalign
+// (or its Windows shim above). Returns NULL when the allocation call reports failure.
+void *alloc_aligned_buffer(uint64_t bufSize)
+{
+    void *answer = NULL;
+    const int status = posix_memalign(&answer, sizeof(__m256i), bufSize);
+    return status ? NULL : answer;
+}
--- a/crypto/verus_clhash.h
+++ b/crypto/verus_clhash.h
@ -0,0 +1,239 @@
+/*
+ * This uses variations of the clhash algorithm for Verus Coin, licensed
+ * with the Apache-2.0 open source license.
+ * 
+ * Copyright (c) 2018 Michael Toutonghi
+ * Distributed under the Apache 2.0 software license, available in the original form for clhash
+ * here: https://github.com/lemire/clhash/commit/934da700a2a54d8202929a826e2763831bd43cf7#diff-9879d6db96fd29134fc802214163b95a
+ * 
+ * CLHash is a very fast hashing function that uses the
+ * carry-less multiplication and SSE instructions.
+ *
+ * Original CLHash code (C) 2017, 2018 Daniel Lemire and Owen Kaser
+ * Faster 64-bit universal hashing
+ * using carry-less multiplications, Journal of Cryptographic Engineering (to appear)
+ *
+ * Best used on recent x64 processors (Haswell or better).
+ *
+ **/
+
+#ifndef INCLUDE_VERUS_CLHASH_H
+#define INCLUDE_VERUS_CLHASH_H
+
+#ifndef _WIN32
+#include <cpuid.h>
+#else
+#include <intrin.h>
+#endif // !WIN32
+
+#include <stdlib.h>
+#include <stdint.h>
+#include <stddef.h>
+#include <assert.h>
+#include <boost/thread.hpp>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Windows compatibility: emulate posix_memalign with _aligned_malloc (yields 0 on
+// success, errno otherwise) and supply the u_char typedef used by the portable code.
+#ifdef _WIN32
+#define posix_memalign(p, a, s) (((*(p)) = _aligned_malloc((s), (a))), *(p) ?0 :errno)
+typedef unsigned char u_char;
+#endif
+
+enum {
+    // Key size in bytes: an 8 KiB power-of-two part followed by room for the
+    // equivalent of a Haraka key (40 entries of 16 bytes), 8832 bytes in total.
+    // Any excess over a power of 2 will not get mutated, and any excess over
+    // power of 2 + Haraka sized key will not be used.
+    VERUSKEYSIZE = (1024 * 8) + (40 * 16),
+
+    VERUSHHASH_SOLUTION_VERSION = 1
+};
+
+// Per-thread descriptor stored alongside the key buffer in verusclhasher_descr.
+struct verusclhash_descr
+{
+    // NOTE(review): presumably the seed the current key was generated from; not set in this header - confirm with callers
+    uint256 seed;
+    // size of one key copy in bytes, as recorded by the verusclhasher constructor
+    uint32_t keySizeInBytes;
+};
+
+// Minimal owning slot for one heap pointer. The held pointer is released with
+// std::free, so anything stored here must be compatible with free().
+struct thread_specific_ptr {
+    void *ptr;
+    thread_specific_ptr() : ptr(NULL) {}
+
+    // Installs newptr; the previously held pointer is freed unless it is NULL
+    // or the very same pointer.
+    void reset(void *newptr = NULL)
+    {
+        void *const previous = ptr;
+        ptr = newptr;
+        if (previous != NULL && previous != newptr)
+        {
+            std::free(previous);
+        }
+    }
+
+    void *get() { return ptr; }
+
+#ifdef _WIN32 // horrible MingW and gcc thread local storage bug workaround
+    ~thread_specific_ptr();
+#else
+    // releases whatever is still held
+    ~thread_specific_ptr() { reset(); }
+#endif
+};
+
+// Per-thread slots for the key buffer and its verusclhash_descr.
+extern thread_local thread_specific_ptr verusclhasher_key;
+extern thread_local thread_specific_ptr verusclhasher_descr;
+
+// Cached CPU-feature detection result; bit 0x80 set means detection has not run yet.
+extern int __cpuverusoptimized;
+
+// Reports whether the CPU offers AVX, AES and PCLMUL together. The answer is
+// cached in __cpuverusoptimized: CPUID is only queried while the 0x80
+// "not detected yet" bit is still set.
+inline bool IsCPUVerusOptimized()
+{
+    if ((__cpuverusoptimized & 0x80) == 0)
+    {
+        return __cpuverusoptimized;
+    }
+
+#ifdef _WIN32
+    #define bit_AVX     (1 << 28)
+    #define bit_AES     (1 << 25)
+    #define bit_PCLMUL  (1 << 1)
+    // https://insufficientlycomplicated.wordpress.com/2011/11/07/detecting-intel-advanced-vector-extensions-avx-in-visual-studio/
+    // bool cpuAVXSuport = cpuInfo[2] & (1 << 28) || false;
+
+    int cpuInfo[4];
+    __cpuid(cpuInfo, 1);
+    // cpuInfo[2] is ECX of CPUID leaf 1
+    __cpuverusoptimized = ((cpuInfo[2] & (bit_AVX | bit_AES | bit_PCLMUL)) == (bit_AVX | bit_AES | bit_PCLMUL));
+#else
+    unsigned int eax,ebx,ecx,edx;
+
+    // a failed CPUID query counts as "not optimized"
+    __cpuverusoptimized = __get_cpuid(1,&eax,&ebx,&ecx,&edx) &&
+                          ((ecx & (bit_AVX | bit_AES | bit_PCLMUL)) == (bit_AVX | bit_AES | bit_PCLMUL));
+#endif //WIN32
+    return __cpuverusoptimized;
+}
+
+// Overrides the cached detection result; storing 0 or 1 also clears the 0x80
+// "not detected yet" bit, so IsCPUVerusOptimized() will not re-detect.
+inline void ForceCPUVerusOptimized(bool trueorfalse)
+{
+    __cpuverusoptimized = trueorfalse ? 1 : 0;
+}
+
+// 64-byte hash entry points sharing one signature so verusclhasher can pick either at runtime:
+// verusclhash is the intrinsic-based version; verusclhash_port is presumably the portable
+// fallback (its definition is outside this excerpt).
+uint64_t verusclhash(void * random, const unsigned char buf[64], uint64_t keyMask);
+uint64_t verusclhash_port(void * random, const unsigned char buf[64], uint64_t keyMask);
+
+// Returns bufSize bytes aligned to sizeof(__m256i), or NULL on failure.
+void *alloc_aligned_buffer(uint64_t bufSize);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#ifdef __cplusplus
+
+#include <vector>
+#include <string>
+
+// special high speed hasher for VerusHash 2.0
+//
+// Each thread owns one key buffer (verusclhasher_key) of 2 * keySizeInBytes bytes:
+// the first half is the working key that hashing mutates, the second half is the
+// refresh copy it is restored from (see gethashkey / gethasherrefresh). A small
+// descriptor (verusclhasher_descr) records the size the buffer was created with.
+struct verusclhasher {
+    uint64_t keySizeInBytes;
+    uint64_t keyMask;
+    uint64_t (*verusclhashfunction)(void * random, const unsigned char buf[64], uint64_t keyMask);
+
+    // Returns (largest power of two <= keysize) - 1, or 0 when keysize is below 2.
+    inline uint64_t keymask(uint64_t keysize)
+    {
+        int i = 0;
+        while (keysize >>= 1)
+        {
+            i++;
+        }
+        return i ? (((uint64_t)1) << i) - 1 : 0;
+    }
+
+    // align on 256 bit boundary at end
+    // On allocation failure both keyMask and keySizeInBytes end up 0.
+    verusclhasher(uint64_t keysize=VERUSKEYSIZE) : keySizeInBytes((keysize >> 5) << 5)
+    {
+        if (IsCPUVerusOptimized())
+        {
+            verusclhashfunction = &verusclhash;
+        }
+        else
+        {
+            verusclhashfunction = &verusclhash_port;
+        }
+
+        // if we changed, change it
+        // (a key without a descriptor is treated as changed rather than dereferencing NULL)
+        verusclhash_descr *existing = (verusclhash_descr *)verusclhasher_descr.get();
+        if (verusclhasher_key.get() && (!existing || keySizeInBytes != existing->keySizeInBytes))
+        {
+            verusclhasher_key.reset();
+            verusclhasher_descr.reset();
+        }
+        // get buffer space for mutating and refresh keys
+        void *key = verusclhasher_key.get();
+        if (!key)
+        {
+            verusclhasher_key.reset(alloc_aligned_buffer(keySizeInBytes << 1));
+            key = verusclhasher_key.get();
+            if (key)
+            {
+                // The slot releases its pointer with std::free, so the descriptor must come
+                // from the malloc family rather than operator new. calloc zero-fills it.
+                // NOTE(review): assumes an all-zero verusclhash_descr (including uint256 seed)
+                // equals a default-constructed one and that the type is trivially destructible.
+                verusclhash_descr *pdesc = (verusclhash_descr *)calloc(1, sizeof(verusclhash_descr));
+                verusclhasher_descr.reset(pdesc);
+                if (pdesc)
+                {
+                    pdesc->keySizeInBytes = keySizeInBytes;
+                }
+                else
+                {
+                    verusclhasher_key.reset();
+                    key = NULL;
+                }
+            }
+        }
+        if (key)
+        {
+            keyMask = keymask(keySizeInBytes);
+        }
+        else
+        {
+            keyMask = 0;
+            keySizeInBytes = 0;
+        }
+#ifdef VERUSHASHDEBUG
+        printf("New hasher, keyMask: %lx, newKeySize: %lx\n", keyMask, keySizeInBytes);
+#endif
+    }
+
+    // this prepares a key for hashing and mutation by copying it from the original key for this block
+    // WARNING!! this does not check for NULL ptr, so make sure the buffer is allocated
+    inline void *gethashkey()
+    {
+        unsigned char *ret = (unsigned char *)verusclhasher_key.get();
+        verusclhash_descr *pdesc = (verusclhash_descr *)verusclhasher_descr.get();
+        // only the first keyMask + 1 bytes are ever mutated, so only those are restored
+        memcpy(ret, ret + pdesc->keySizeInBytes, keyMask + 1);
+#ifdef VERUSHASHDEBUG
+        // in debug mode, ensure that what should be the same, is
+        assert(memcmp(ret + (keyMask + 1), ret + (pdesc->keySizeInBytes + keyMask + 1), pdesc->keySizeInBytes - (keyMask + 1)) == 0);
+#endif
+        return ret;
+    }
+
+    // Start of the refresh (second) copy of the key inside the per-thread buffer.
+    inline void *gethasherrefresh()
+    {
+        verusclhash_descr *pdesc = (verusclhash_descr *)verusclhasher_descr.get();
+        return (unsigned char *)verusclhasher_key.get() + pdesc->keySizeInBytes;
+    }
+
+    // This thread's descriptor, or NULL when none has been allocated.
+    inline verusclhash_descr *gethasherdescription()
+    {
+        return (verusclhash_descr *)verusclhasher_descr.get();
+    }
+
+    // Number of bytes gethashkey() restores from the refresh copy.
+    inline uint64_t keyrefreshsize()
+    {
+        return keyMask + 1;
+    }
+
+    // Hash 64 bytes with this thread's key buffer.
+    inline uint64_t operator()(const unsigned char buf[64]) const {
+        return (*verusclhashfunction)(verusclhasher_key.get(), buf, keyMask);
+    }
+
+    // Hash 64 bytes with a caller-supplied key buffer.
+    inline uint64_t operator()(const unsigned char buf[64], void *key) const {
+        return (*verusclhashfunction)(key, buf, keyMask);
+    }
+};
+
+#endif // #ifdef __cplusplus
+
+#endif // INCLUDE_VERUS_CLHASH_H
--- a/crypto/verus_clhash_portable.cpp
+++ b/crypto/verus_clhash_portable.cpp
@ -0,0 +1,591 @@
+/*
+ * This uses variations of the clhash algorithm for Verus Coin, licensed
+ * with the Apache-2.0 open source license.
+ * 
+ * Copyright (c) 2018 Michael Toutonghi
+ * Distributed under the Apache 2.0 software license, available in the original form for clhash
+ * here: https://github.com/lemire/clhash/commit/934da700a2a54d8202929a826e2763831bd43cf7#diff-9879d6db96fd29134fc802214163b95a
+ * 
+ * Original CLHash code and any portions herein, (C) 2017, 2018 Daniel Lemire and Owen Kaser
+ * Faster 64-bit universal hashing
+ * using carry-less multiplications, Journal of Cryptographic Engineering (to appear)
+ *
+ * Best used on recent x64 processors (Haswell or better).
+ * 
+ * This implements an intermediate step in the last part of a Verus block hash. The intent of this step
+ * is to more effectively equalize FPGAs over GPUs and CPUs.
+ *
+ **/
+
+
+#include "verus_hash.h"
+
+#include <assert.h>
+#include <string.h>
+
+#ifdef __APPLE__
+#include <sys/types.h>
+#endif// APPLE
+
+#ifdef _WIN32
+#pragma warning (disable : 4146)
+#include <intrin.h>
+#else
+#include <x86intrin.h>
+#endif //WIN32
+
+// Portable 64x64 -> 128 bit carry-less multiplication with a 4-bit window.
+// Writes the low 64 bits of the product to r[0] and the high 64 bits to r[1].
+void clmul64(uint64_t a, uint64_t b, uint64_t* r)
+{
+    const unsigned int window = 4;
+    const uint64_t tableSize = (uint64_t)1 << window;
+    const uint64_t windowMask = tableSize - 1;
+
+    // table[k] = carry-less k * b, truncated to 64 bits
+    uint64_t table[16];
+    table[0] = 0;
+    table[1] = b;
+    for (unsigned int k = 2; k < tableSize; k += 2)
+    {
+        table[k] = table[k >> 1] << 1;   // even entries: shift the half-index entry
+        table[k + 1] = table[k] ^ b;     // odd entries: add b
+    }
+
+    // Accumulate one table entry per 4-bit window of a; the first window
+    // can only reach the low word
+    uint64_t lo = table[a & windowMask];
+    uint64_t hi = 0;
+    for (unsigned int shift = window; shift < 64; shift += window)
+    {
+        const uint64_t part = table[(a >> shift) & windowMask];
+        lo ^= part << shift;
+        hi ^= part >> (64 - shift);
+    }
+
+    // Repair: the table entries lost the top bits of b when they were
+    // shifted, so add those contributions back into the high word
+    uint64_t mask = 0xEEEEEEEEEEEEEEEE; // 16 times 1110
+    for (unsigned int bit = 1; bit < window; bit++)
+    {
+        const uint64_t shifted = (a & mask) >> bit;
+        mask &= mask << 1;
+        if ((b >> (64 - bit)) & 1)
+        {
+            hi ^= shifted;
+        }
+    }
+
+    r[0] = lo;
+    r[1] = hi;
+}
+
+// Portable stand-in for _mm_clmulepi64_si128: bit 0 of imm selects the 64-bit half of a,
+// bit 4 selects the half of b, and the two halves are multiplied carry-lessly with clmul64.
+// The block comment below is a disabled self-test against the real intrinsic.
+u128 _mm_clmulepi64_si128_emu(const __m128i &a, const __m128i &b, int imm)
+{
+    uint64_t result[2];
+    clmul64(*((uint64_t*)&a + (imm & 1)), *((uint64_t*)&b + ((imm & 0x10) >> 4)), result);
+
+    /*
+    // TEST
+    const __m128i tmp1 = _mm_load_si128(&a);
+    const __m128i tmp2 = _mm_load_si128(&b);
+    imm = imm & 0x11;
+    const __m128i testresult = (imm == 0x10) ? _mm_clmulepi64_si128(tmp1, tmp2, 0x10) : ((imm == 0x01) ? _mm_clmulepi64_si128(tmp1, tmp2, 0x01) : ((imm == 0x00) ? _mm_clmulepi64_si128(tmp1, tmp2, 0x00) : _mm_clmulepi64_si128(tmp1, tmp2, 0x11)));
+    if (!memcmp(&testresult, &result, 16))
+    {
+        printf("_mm_clmulepi64_si128_emu: Portable version passed!\n");
+    }
+    else
+    {
+        printf("_mm_clmulepi64_si128_emu: Portable version failed! a: %lxh %lxl, b: %lxh %lxl, imm: %x, emu: %lxh %lxl, intrin: %lxh %lxl\n", 
+               *((uint64_t *)&a + 1), *(uint64_t *)&a,
+               *((uint64_t *)&b + 1), *(uint64_t *)&b,
+               imm,
+               *((uint64_t *)result + 1), *(uint64_t *)result,
+               *((uint64_t *)&testresult + 1), *(uint64_t *)&testresult);
+        return testresult;
+    }
+    */
+
+    // NOTE(review): reads a uint64_t[2] through an __m128i pointer - relies on the array being suitably aligned
+    return *(__m128i *)result;
+}
+
+// Portable stand-in for _mm_mulhrs_epi16: for each of the eight signed 16-bit lanes,
+// computes (a * b + 0x4000) >> 15 in 32-bit arithmetic and keeps the low 16 bits.
+// The block comment below is a disabled self-test against the real intrinsic.
+u128 _mm_mulhrs_epi16_emu(__m128i _a, __m128i _b)
+{
+    int16_t result[8];
+    int16_t *a = (int16_t*)&_a, *b = (int16_t*)&_b;
+    for (int i = 0; i < 8; i ++)
+    {
+        result[i] = (int16_t)((((int32_t)(a[i]) * (int32_t)(b[i])) + 0x4000) >> 15);
+    }
+
+    /*
+    const __m128i testresult = _mm_mulhrs_epi16(_a, _b);
+    if (!memcmp(&testresult, &result, 16))
+    {
+        printf("_mm_mulhrs_epi16_emu: Portable version passed!\n");
+    }
+    else
+    {
+        printf("_mm_mulhrs_epi16_emu: Portable version failed! a: %lxh %lxl, b: %lxh %lxl, emu: %lxh %lxl, intrin: %lxh %lxl\n", 
+               *((uint64_t *)&a + 1), *(uint64_t *)&a,
+               *((uint64_t *)&b + 1), *(uint64_t *)&b,
+               *((uint64_t *)result + 1), *(uint64_t *)result,
+               *((uint64_t *)&testresult + 1), *(uint64_t *)&testresult);
+    }
+    */
+
+    return *(__m128i *)result;
+}
+
+// Portable stand-in for _mm_set_epi64x: builds a 128-bit value with `lo` in
+// the first 64-bit half and `hi` in the second.
+inline u128 _mm_set_epi64x_emu(uint64_t hi, uint64_t lo)
+{
+    __m128i packed;
+    uint64_t *halves = (uint64_t *)&packed;
+    halves[1] = hi;
+    halves[0] = lo;
+    return packed;
+}
+
+// Portable stand-in for _mm_cvtsi64_si128: `lo` goes into the first 64-bit
+// half, the second half is zeroed.
+inline u128 _mm_cvtsi64_si128_emu(uint64_t lo)
+{
+    __m128i widened;
+    uint64_t *halves = (uint64_t *)&widened;
+    halves[1] = 0;
+    halves[0] = lo;
+    return widened;
+}
+
+// Portable stand-in for _mm_cvtsi128_si64: reads the first 64 bits of `a` as a signed integer.
+inline int64_t _mm_cvtsi128_si64_emu(__m128i &a)
+{
+    const int64_t *halves = (int64_t *)&a;
+    return halves[0];
+}
+
+// Portable stand-in for _mm_cvtsi128_si32: reads the first 32 bits of `a` as a signed integer.
+inline int32_t _mm_cvtsi128_si32_emu(__m128i &a)
+{
+    const int32_t *lanes = (int32_t *)&a;
+    return lanes[0];
+}
+
+// Portable stand-in for _mm_cvtsi32_si128: `lo` goes into the first 32 bits and the
+// remaining 96 bits are zeroed. The block comment below is a disabled self-test.
+inline u128 _mm_cvtsi32_si128_emu(uint32_t lo)
+{
+    __m128i result;
+    ((uint32_t *)&result)[0] = lo;
+    ((uint32_t *)&result)[1] = 0;
+    ((uint64_t *)&result)[1] = 0;
+
+    /*
+    const __m128i testresult = _mm_cvtsi32_si128(lo);
+    if (!memcmp(&testresult, &result, 16))
+    {
+        printf("_mm_cvtsi32_si128_emu: Portable version passed!\n");
+    }
+    else
+    {
+        printf("_mm_cvtsi32_si128_emu: Portable version failed!\n");
+    }
+    */
+
+    return result;
+}
+
+// Portable stand-in for _mm_setr_epi8: stores c0..c15 into bytes 0..15 of the result,
+// in that order. The block comment below is a disabled self-test.
+u128 _mm_setr_epi8_emu(u_char c0, u_char c1, u_char c2, u_char c3, u_char c4, u_char c5, u_char c6, u_char c7, u_char c8, u_char c9, u_char c10, u_char c11, u_char c12, u_char c13, u_char c14, u_char c15)
+{
+    __m128i result;
+    ((uint8_t *)&result)[0] = c0;
+    ((uint8_t *)&result)[1] = c1;
+    ((uint8_t *)&result)[2] = c2;
+    ((uint8_t *)&result)[3] = c3;
+    ((uint8_t *)&result)[4] = c4;
+    ((uint8_t *)&result)[5] = c5;
+    ((uint8_t *)&result)[6] = c6;
+    ((uint8_t *)&result)[7] = c7;
+    ((uint8_t *)&result)[8] = c8;
+    ((uint8_t *)&result)[9] = c9;
+    ((uint8_t *)&result)[10] = c10;
+    ((uint8_t *)&result)[11] = c11;
+    ((uint8_t *)&result)[12] = c12;
+    ((uint8_t *)&result)[13] = c13;
+    ((uint8_t *)&result)[14] = c14;
+    ((uint8_t *)&result)[15] = c15;
+
+    /*
+    const __m128i testresult = _mm_setr_epi8(c0,c1,c2,c3,c4,c5,c6,c7,c8,c9,c10,c11,c12,c13,c14,c15);
+    if (!memcmp(&testresult, &result, 16))
+    {
+        printf("_mm_setr_epi8_emu: Portable version passed!\n");
+    }
+    else
+    {
+        printf("_mm_setr_epi8_emu: Portable version failed!\n");
+    }
+    */
+
+    return result;
+}
+
+// Portable stand-in for _mm_srli_si128: shifts a right by imm8 bytes, filling
+// the vacated high bytes with zero. Only the low 8 bits of imm8 are used, and
+// any shift above 15 yields an all-zero result.
+inline __m128i _mm_srli_si128_emu(__m128i a, int imm8)
+{
+    // Assemble the bytes directly inside a __m128i. Building them in a plain
+    // unsigned char array and reading that back through a __m128i pointer
+    // performs a 16 byte vector load from storage that is not guaranteed to
+    // be 16 byte aligned.
+    __m128i result;
+    unsigned char *out = (unsigned char *)&result;
+    const unsigned char *in = (const unsigned char *)&a;
+
+    uint8_t shift = imm8 & 0xff;
+    if (shift > 15) shift = 16;
+
+    int i;
+    for (i = 0; i < (16 - shift); i++)
+    {
+        out[i] = in[shift + i];
+    }
+    for ( ; i < 16; i++)
+    {
+        out[i] = 0;
+    }
+
+    /*
+    const __m128i tmp1 = _mm_load_si128(&a);
+    __m128i testresult = _mm_srli_si128(tmp1, imm8);
+    if (!memcmp(&testresult, &result, 16))
+    {
+        printf("_mm_srli_si128_emu: Portable version passed!\n");
+    }
+    else
+    {
+        printf("_mm_srli_si128_emu: Portable version failed! val: %lx%lx imm: %x emu: %lx%lx, intrin: %lx%lx\n",
+               *((uint64_t *)&a + 1), *(uint64_t *)&a,
+               imm8,
+               *((uint64_t *)&result + 1), *(uint64_t *)&result,
+               *((uint64_t *)&testresult + 1), *(uint64_t *)&testresult);
+    }
+    */
+
+    return result;
+}
+
+// Portable stand-in for _mm_xor_si128: bitwise XOR of two 128 bit vectors.
+inline __m128i _mm_xor_si128_emu(__m128i a, __m128i b)
+{
+#ifdef _WIN32
+    // XOR the two 64 bit lanes separately and write them straight into a
+    // __m128i. Going through a uint64_t[2] and reinterpreting it as __m128i
+    // would read a 16 byte vector from storage that is only guaranteed to be
+    // 8 byte aligned.
+    __m128i result;
+    ((uint64_t *)&result)[0] = *(uint64_t *)&a ^ *(uint64_t *)&b;
+    ((uint64_t *)&result)[1] = *((uint64_t *)&a + 1) ^ *((uint64_t *)&b + 1);
+    return result;
+#else
+    // other toolchains are expected to support ^ directly on the vector type
+    return a ^ b;
+#endif
+}
+
+// Portable stand-in for _mm_load_si128: reads one 128 bit vector from p.
+inline __m128i _mm_load_si128_emu(const void *p)
+{
+    const __m128i *src = (const __m128i *)p;
+    return *src;
+}
+
+// Portable stand-in for _mm_store_si128: writes the 128 bit vector val to p.
+inline void _mm_store_si128_emu(void *p, __m128i val)
+{
+    __m128i *dst = (__m128i *)p;
+    *dst = val;
+}
+
+// Portable stand-in for _mm_shuffle_epi8: for every byte position i, the
+// control byte b[i] either zeroes the output byte (high bit set) or selects
+// byte (b[i] & 0xf) of a.
+__m128i _mm_shuffle_epi8_emu(__m128i a, __m128i b)
+{
+    __m128i result;
+    const uint8_t *table = (const uint8_t *)&a;
+    const uint8_t *control = (const uint8_t *)&b;
+    uint8_t *out = (uint8_t *)&result;
+
+    for (int i = 0; i < 16; i++)
+    {
+        const uint8_t sel = control[i];
+        out[i] = (sel & 0x80) ? 0 : table[sel & 0xf];
+    }
+
+    /*
+    const __m128i tmp1 = _mm_load_si128(&a);
+    const __m128i tmp2 = _mm_load_si128(&b);
+    __m128i testresult = _mm_shuffle_epi8(tmp1, tmp2);
+    if (!memcmp(&testresult, &result, 16))
+    {
+        printf("_mm_shuffle_epi8_emu: Portable version passed!\n");
+    }
+    else
+    {
+        printf("_mm_shuffle_epi8_emu: Portable version failed!\n");
+    }
+    */
+
+    return result;
+}
+
+// portable
+// Packs keylength (high lane) and length (low lane) into one vector and
+// returns the carry-less product of that vector with itself, using
+// selector 0x10.
+static inline __m128i lazyLengthHash_port(uint64_t keylength, uint64_t length) {
+    const __m128i lengths = _mm_set_epi64x_emu(keylength, length);
+    return _mm_clmulepi64_si128_emu(lengths, lengths, 0x10);
+}
+
+// modulo reduction to 64-bit value. The high 64 bits contain garbage, see precompReduction64
+static inline __m128i precompReduction64_si128_port( __m128i A) {
+
+    // low terms of the irreducible polynomial (64,4,3,1,0); the x^64 term is not stored
+    //const __m128i C = _mm_set_epi64x(1U,(1U<<4)+(1U<<3)+(1U<<1)+(1U<<0)); // C is the irreducible poly. (64,4,3,1,0)
+    const __m128i poly = _mm_cvtsi64_si128_emu((1U<<4)+(1U<<3)+(1U<<1)+(1U<<0));
+
+    // 16 entry byte lookup table, indexed through the shuffle below
+    const __m128i table = _mm_setr_epi8_emu(0, 27, 54, 45, 108, 119, 90, 65, (char)216, (char)195, (char)238, (char)245, (char)180, (char)175, (char)130, (char)153);
+
+    __m128i folded = _mm_clmulepi64_si128_emu( A, poly, 0x01);
+    __m128i lookup = _mm_shuffle_epi8_emu(table, _mm_srli_si128_emu(folded,8));
+    __m128i mixed = _mm_xor_si128_emu(folded,A);
+
+    const __m128i reduced = _mm_xor_si128_emu(lookup,mixed);
+    return reduced;/// WARNING: HIGH 64 BITS SHOULD BE ASSUMED TO CONTAIN GARBAGE
+}
+
+// Reduces A with precompReduction64_si128_port and returns only the low
+// 64 bits of the reduced vector.
+static inline uint64_t precompReduction64_port( __m128i A) {
+    __m128i reduced = precompReduction64_si128_port(A);
+    const int64_t low = _mm_cvtsi128_si64_emu(reduced);
+    return low;
+}
+
+// verus intermediate hash extra
+//
+// Portable (emulated intrinsics) mixing loop. Starting from the key entry at
+// index (keyMask >> 4) + 2, it runs 32 rounds. Each round takes the low
+// 64 bits of the accumulator as a selector, which picks two key entries
+// (prand and prandex), a starting entry in buf, and one of eight mixing
+// variants. Every variant folds data into the accumulator and stores new
+// values to both selected key entries, so randomsource is modified by the call.
+//
+// randomsource: key buffer, read and written in 16 byte entries
+// buf:          four 16 byte entries of input
+// keyMask:      key mask in bytes; shifted right by 4 to mask entry indexes
+// returns the unreduced 128 bit accumulator
+static __m128i __verusclmulwithoutreduction64alignedrepeat_port(__m128i *randomsource, const __m128i buf[4], uint64_t keyMask)
+{
+    __m128i const *pbuf;
+
+    // divide key mask by 16 from bytes to __m128i
+    keyMask >>= 4;
+
+    // the random buffer must have at least 32 16 byte dwords after the keymask to work with this
+    // algorithm. we take the value from the last element inside the keyMask + 2, as that will never
+    // be used to xor into the accumulator before it is hashed with other values first
+    __m128i acc = _mm_load_si128_emu(randomsource + (keyMask + 2));
+
+    for (int64_t i = 0; i < 32; i++)
+    {
+        const uint64_t selector = _mm_cvtsi128_si64_emu(acc);
+
+        // get two random locations in the key, which will be mutated and swapped
+        __m128i *prand = randomsource + ((selector >> 5) & keyMask);
+        __m128i *prandex = randomsource + ((selector >> 32) & keyMask);
+
+        // select random start and order of pbuf processing
+        // note: pbuf - (((selector & 1) << 1) - 1), used below, is the neighbouring
+        // entry: pbuf + 1 when selector is even, pbuf - 1 when it is odd
+        pbuf = buf + (selector & 3);
+
+        switch (selector & 0x1c)
+        {
+            case 0:
+            {
+                const __m128i temp1 = _mm_load_si128_emu(prandex);
+                const __m128i temp2 = _mm_load_si128_emu(pbuf - (((selector & 1) << 1) - 1));
+                const __m128i add1 = _mm_xor_si128_emu(temp1, temp2);
+                const __m128i clprod1 = _mm_clmulepi64_si128_emu(add1, add1, 0x10);
+                acc = _mm_xor_si128_emu(clprod1, acc);
+
+                const __m128i tempa1 = _mm_mulhrs_epi16_emu(acc, temp1);
+                const __m128i tempa2 = _mm_xor_si128_emu(tempa1, temp1);
+
+                const __m128i temp12 = _mm_load_si128_emu(prand);
+                _mm_store_si128_emu(prand, tempa2);
+
+                const __m128i temp22 = _mm_load_si128_emu(pbuf);
+                const __m128i add12 = _mm_xor_si128_emu(temp12, temp22);
+                const __m128i clprod12 = _mm_clmulepi64_si128_emu(add12, add12, 0x10);
+                acc = _mm_xor_si128_emu(clprod12, acc);
+
+                const __m128i tempb1 = _mm_mulhrs_epi16_emu(acc, temp12);
+                const __m128i tempb2 = _mm_xor_si128_emu(tempb1, temp12);
+                _mm_store_si128_emu(prandex, tempb2);
+                break;
+            }
+            case 4:
+            {
+                const __m128i temp1 = _mm_load_si128_emu(prand);
+                const __m128i temp2 = _mm_load_si128_emu(pbuf);
+                const __m128i add1 = _mm_xor_si128_emu(temp1, temp2);
+                const __m128i clprod1 = _mm_clmulepi64_si128_emu(add1, add1, 0x10);
+                acc = _mm_xor_si128_emu(clprod1, acc);
+                const __m128i clprod2 = _mm_clmulepi64_si128_emu(temp2, temp2, 0x10);
+                acc = _mm_xor_si128_emu(clprod2, acc);
+
+                const __m128i tempa1 = _mm_mulhrs_epi16_emu(acc, temp1);
+                const __m128i tempa2 = _mm_xor_si128_emu(tempa1, temp1);
+
+                const __m128i temp12 = _mm_load_si128_emu(prandex);
+                _mm_store_si128_emu(prandex, tempa2);
+
+                const __m128i temp22 = _mm_load_si128_emu(pbuf - (((selector & 1) << 1) - 1));
+                const __m128i add12 = _mm_xor_si128_emu(temp12, temp22);
+                acc = _mm_xor_si128_emu(add12, acc);
+
+                const __m128i tempb1 = _mm_mulhrs_epi16_emu(acc, temp12);
+                const __m128i tempb2 = _mm_xor_si128_emu(tempb1, temp12);
+                _mm_store_si128_emu(prand, tempb2);
+                break;
+            }
+            case 8:
+            {
+                const __m128i temp1 = _mm_load_si128_emu(prandex);
+                const __m128i temp2 = _mm_load_si128_emu(pbuf);
+                const __m128i add1 = _mm_xor_si128_emu(temp1, temp2);
+                acc = _mm_xor_si128_emu(add1, acc);
+
+                const __m128i tempa1 = _mm_mulhrs_epi16_emu(acc, temp1);
+                const __m128i tempa2 = _mm_xor_si128_emu(tempa1, temp1);
+
+                const __m128i temp12 = _mm_load_si128_emu(prand);
+                _mm_store_si128_emu(prand, tempa2);
+
+                const __m128i temp22 = _mm_load_si128_emu(pbuf - (((selector & 1) << 1) - 1));
+                const __m128i add12 = _mm_xor_si128_emu(temp12, temp22);
+                const __m128i clprod12 = _mm_clmulepi64_si128_emu(add12, add12, 0x10);
+                acc = _mm_xor_si128_emu(clprod12, acc);
+                const __m128i clprod22 = _mm_clmulepi64_si128_emu(temp22, temp22, 0x10);
+                acc = _mm_xor_si128_emu(clprod22, acc);
+
+                const __m128i tempb1 = _mm_mulhrs_epi16_emu(acc, temp12);
+                const __m128i tempb2 = _mm_xor_si128_emu(tempb1, temp12);
+                _mm_store_si128_emu(prandex, tempb2);
+                break;
+            }
+            case 0xc:
+            {
+                const __m128i temp1 = _mm_load_si128_emu(prand);
+                const __m128i temp2 = _mm_load_si128_emu(pbuf - (((selector & 1) << 1) - 1));
+                const __m128i add1 = _mm_xor_si128_emu(temp1, temp2);
+
+                // cannot be zero here
+                // (this case is only reached when (selector & 0x1c) == 0xc, so low bits are set)
+                const int32_t divisor = (uint32_t)selector;
+
+                // use the emulated xor like every other step of the portable path
+                acc = _mm_xor_si128_emu(add1, acc);
+
+                const int64_t dividend = _mm_cvtsi128_si64_emu(acc);
+                const __m128i modulo = _mm_cvtsi32_si128_emu(dividend % divisor);
+                acc = _mm_xor_si128_emu(modulo, acc);
+
+                const __m128i tempa1 = _mm_mulhrs_epi16_emu(acc, temp1);
+                const __m128i tempa2 = _mm_xor_si128_emu(tempa1, temp1);
+
+                if (dividend & 1)
+                {
+                    const __m128i temp12 = _mm_load_si128_emu(prandex);
+                    _mm_store_si128_emu(prandex, tempa2);
+
+                    const __m128i temp22 = _mm_load_si128_emu(pbuf);
+                    const __m128i add12 = _mm_xor_si128_emu(temp12, temp22);
+                    const __m128i clprod12 = _mm_clmulepi64_si128_emu(add12, add12, 0x10);
+                    acc = _mm_xor_si128_emu(clprod12, acc);
+                    const __m128i clprod22 = _mm_clmulepi64_si128_emu(temp22, temp22, 0x10);
+                    acc = _mm_xor_si128_emu(clprod22, acc);
+
+                    const __m128i tempb1 = _mm_mulhrs_epi16_emu(acc, temp12);
+                    const __m128i tempb2 = _mm_xor_si128_emu(tempb1, temp12);
+                    _mm_store_si128_emu(prand, tempb2);
+                }
+                else
+                {
+                    const __m128i tempb3 = _mm_load_si128_emu(prandex);
+                    _mm_store_si128_emu(prandex, tempa2);
+                    _mm_store_si128_emu(prand, tempb3);
+                }
+                break;
+            }
+            case 0x10:
+            {
+                // a few AES operations
+                // rc and tmp are not referenced directly below; presumably the
+                // AES2_EMU / MIX2_EMU macros pick them up by name (tmp is noted as
+                // "used by MIX2" in case 0x14) - confirm against the macro definitions
+                const __m128i *rc = prand;
+                __m128i tmp;
+
+                __m128i temp1 = _mm_load_si128_emu(pbuf - (((selector & 1) << 1) - 1));
+                __m128i temp2 = _mm_load_si128_emu(pbuf);
+
+                AES2_EMU(temp1, temp2, 0);
+                MIX2_EMU(temp1, temp2);
+
+                AES2_EMU(temp1, temp2, 4);
+                MIX2_EMU(temp1, temp2);
+
+                AES2_EMU(temp1, temp2, 8);
+                MIX2_EMU(temp1, temp2);
+
+                acc = _mm_xor_si128_emu(temp1, acc);
+                acc = _mm_xor_si128_emu(temp2, acc);
+
+                const __m128i tempa1 = _mm_load_si128_emu(prand);
+                const __m128i tempa2 = _mm_mulhrs_epi16_emu(acc, tempa1);
+                const __m128i tempa3 = _mm_xor_si128_emu(tempa1, tempa2);
+
+                const __m128i tempa4 = _mm_load_si128_emu(prandex);
+                _mm_store_si128_emu(prandex, tempa3);
+                _mm_store_si128_emu(prand, tempa4);
+                break;
+            }
+            case 0x14:
+            {
+                // we'll just call this one the monkins loop, inspired by Chris
+                const __m128i *buftmp = pbuf - (((selector & 1) << 1) - 1);
+                __m128i tmp; // used by MIX2
+
+                uint64_t rounds = selector >> 61; // loop randomly between 1 and 8 times
+                __m128i *rc = prand;
+                uint64_t aesround = 0;
+                __m128i onekey;
+
+                // each pass consumes the next consecutive key entry after prand (rc++)
+                do
+                {
+                    if (selector & (0x10000000 << rounds))
+                    {
+                        onekey = _mm_load_si128_emu(rc++);
+                        const __m128i temp2 = _mm_load_si128_emu(rounds & 1 ? pbuf : buftmp);
+                        const __m128i add1 = _mm_xor_si128_emu(onekey, temp2);
+                        const __m128i clprod1 = _mm_clmulepi64_si128_emu(add1, add1, 0x10);
+                        acc = _mm_xor_si128_emu(clprod1, acc);
+                    }
+                    else
+                    {
+                        onekey = _mm_load_si128_emu(rc++);
+                        __m128i temp2 = _mm_load_si128_emu(rounds & 1 ? buftmp : pbuf);
+                        const uint64_t roundidx = aesround++ << 2;
+                        AES2_EMU(onekey, temp2, roundidx);
+                        MIX2_EMU(onekey, temp2);
+                        acc = _mm_xor_si128_emu(onekey, acc);
+                        acc = _mm_xor_si128_emu(temp2, acc);
+                    }
+                } while (rounds--);
+
+                const __m128i tempa1 = _mm_load_si128_emu(prand);
+                const __m128i tempa2 = _mm_mulhrs_epi16_emu(acc, tempa1);
+                const __m128i tempa3 = _mm_xor_si128_emu(tempa1, tempa2);
+
+                const __m128i tempa4 = _mm_load_si128_emu(prandex);
+                _mm_store_si128_emu(prandex, tempa3);
+                _mm_store_si128_emu(prand, tempa4);
+                break;
+            }
+            case 0x18:
+            {
+                const __m128i temp1 = _mm_load_si128_emu(pbuf - (((selector & 1) << 1) - 1));
+                const __m128i temp2 = _mm_load_si128_emu(prand);
+                const __m128i add1 = _mm_xor_si128_emu(temp1, temp2);
+                const __m128i clprod1 = _mm_clmulepi64_si128_emu(add1, add1, 0x10);
+                acc = _mm_xor_si128_emu(clprod1, acc);
+
+                const __m128i tempa1 = _mm_mulhrs_epi16_emu(acc, temp2);
+                const __m128i tempa2 = _mm_xor_si128_emu(tempa1, temp2);
+
+                const __m128i tempb3 = _mm_load_si128_emu(prandex);
+                _mm_store_si128_emu(prandex, tempa2);
+                _mm_store_si128_emu(prand, tempb3);
+                break;
+            }
+            case 0x1c:
+            {
+                const __m128i temp1 = _mm_load_si128_emu(pbuf);
+                const __m128i temp2 = _mm_load_si128_emu(prandex);
+                const __m128i add1 = _mm_xor_si128_emu(temp1, temp2);
+                const __m128i clprod1 = _mm_clmulepi64_si128_emu(add1, add1, 0x10);
+                acc = _mm_xor_si128_emu(clprod1, acc);
+
+                const __m128i tempa1 = _mm_mulhrs_epi16_emu(acc, temp2);
+                const __m128i tempa2 = _mm_xor_si128_emu(tempa1, temp2);
+
+                const __m128i tempa3 = _mm_load_si128_emu(prand);
+                _mm_store_si128_emu(prand, tempa2);
+
+                acc = _mm_xor_si128_emu(tempa3, acc);
+
+                const __m128i tempb1 = _mm_mulhrs_epi16_emu(acc, tempa3);
+                const __m128i tempb2 = _mm_xor_si128_emu(tempb1, tempa3);
+                _mm_store_si128_emu(prandex, tempb2);
+                break;
+            }
+        }
+    }
+    return acc;
+}
+
+// Portable 64 byte hash: runs the carry-less multiplication mixing loop over
+// the 64 byte buffer and the key, xors in the length hash for (1024, 64), and
+// reduces the result to a 64 bit hash value. The key behind `random` is
+// handed to the mixing loop as a writable buffer.
+uint64_t verusclhash_port(void * random, const unsigned char buf[64], uint64_t keyMask) {
+    __m128i * const keyEntries = (__m128i *)random;
+    const __m128i * const input = (const __m128i *) buf;
+
+    const __m128i mixed = __verusclmulwithoutreduction64alignedrepeat_port(keyEntries, input, keyMask);
+    const __m128i withLength = _mm_xor_si128_emu(mixed, lazyLengthHash_port(1024, 64));
+    return precompReduction64_port(withLength);
+}
--- a/crypto/verus_hash.cpp
+++ b/crypto/verus_hash.cpp
@ -14,11 +14,12 @@ bit output.

 void (*CVerusHash::haraka512Function)(unsigned char *out, const unsigned char *in);

-void CVerusHash::Hash(void *result, const void *data, size_t len)
+void CVerusHash::Hash(void *result, const void *data, size_t _len)
 {
    unsigned char buf[128];
    unsigned char *bufPtr = buf;
-    int pos = 0, nextOffset = 64;
+    int nextOffset = 64;
+    uint32_t pos = 0, len = _len;
    unsigned char *bufPtr2 = bufPtr + nextOffset;
    unsigned char *ptr = (unsigned char *)data;

@ -58,14 +59,15 @@ void CVerusHash::init()
    }
 }

-CVerusHash &CVerusHash::Write(const unsigned char *data, size_t len)
+CVerusHash &CVerusHash::Write(const unsigned char *data, size_t _len)
 {
    unsigned char *tmp;
+    uint32_t pos, len = _len;

    // digest up to 32 bytes at a time
-    for ( int pos = 0; pos < len; )
+    for ( pos = 0; pos < len; )
    {
-        int room = 32 - curPos;
+        uint32_t room = 32 - curPos;

        if (len - pos >= room)
        {
@ -94,6 +96,8 @@ void verus_hash(void *result, const void *data, size_t len)
 }

 void (*CVerusHashV2::haraka512Function)(unsigned char *out, const unsigned char *in);
+void (*CVerusHashV2::haraka512KeyedFunction)(unsigned char *out, const unsigned char *in, const u128 *rc);
+void (*CVerusHashV2::haraka256Function)(unsigned char *out, const unsigned char *in);

 void CVerusHashV2::init()
 {
@ -101,12 +105,16 @@ void CVerusHashV2::init()
    {
        load_constants();
        haraka512Function = &haraka512;
+        haraka512KeyedFunction = &haraka512_keyed;
+        haraka256Function = &haraka256;
    }
    else
    {
-        // load and tweak the haraka constants
+        // load the haraka constants
        load_constants_port();
        haraka512Function = &haraka512_port;
+        haraka512KeyedFunction = &haraka512_port_keyed;
+        haraka256Function = &haraka256_port;
    }
 }

@ -147,7 +155,7 @@ CVerusHashV2 &CVerusHashV2::Write(const unsigned char *data, size_t len)
    unsigned char *tmp;

    // digest up to 32 bytes at a time
-    for ( int pos = 0; pos < len; )
+    for (int pos = 0; pos < len; )
    {
        int room = 32 - curPos;

--- a/crypto/verus_hash.h
+++ b/crypto/verus_hash.h
@ -1,4 +1,4 @@
-// (C) 2018 The Verus Developers
+// (C) 2018 Michael Toutonghi
 // Distributed under the MIT software license, see the accompanying
 // file COPYING or http://www.opensource.org/licenses/mit-license.php.

@ -8,10 +8,14 @@ This provides the PoW hash function for Verus, enabling CPU mining.
 #ifndef VERUS_HASH_H_
 #define VERUS_HASH_H_

+// verbose output when defined
+//#define VERUSHASHDEBUG 1
+
 #include <cstring>
 #include <vector>

-#include <cpuid.h>
+#include "uint256.h"
+#include "verus_clhash.h"

 extern "C" 
 {
@ -40,7 +44,7 @@ class CVerusHash
            return *this;
        }

-        inline int64_t *ExtraI64Ptr() { return (int64_t *)(curBuf + 32); }
+        int64_t *ExtraI64Ptr() { return (int64_t *)(curBuf + 32); }
        void ClearExtra()
        {
            if (curPos)
@ -73,30 +77,58 @@ class CVerusHashV2
    public:
        static void Hash(void *result, const void *data, size_t len);
        static void (*haraka512Function)(unsigned char *out, const unsigned char *in);
+        static void (*haraka512KeyedFunction)(unsigned char *out, const unsigned char *in, const u128 *rc);
+        static void (*haraka256Function)(unsigned char *out, const unsigned char *in);

        static void init();

-        CVerusHashV2() {}
+        verusclhasher vclh;
+
+        // Default-constructs the CL hasher, then checks that verusclhasher_key
+        // holds a key buffer; without one an error is printed and an assertion fires.
+        CVerusHashV2() : vclh() {
+            // key space is present - nothing more to do
+            if (verusclhasher_key.get())
+            {
+                return;
+            }
+
+            // we must have allocated key space, or can't run
+            printf("ERROR: failed to allocate hash buffer - terminating\n");
+            assert(false);
+        }

        CVerusHashV2 &Write(const unsigned char *data, size_t len);

-        CVerusHashV2 &Reset()
+        inline CVerusHashV2 &Reset()
        {
            curBuf = buf1;
            result = buf2;
            curPos = 0;
            std::fill(buf1, buf1 + sizeof(buf1), 0);
+            return *this;
        }

-        int64_t *ExtraI64Ptr() { return (int64_t *)(curBuf + 32); }
-        void ClearExtra()
+        inline int64_t *ExtraI64Ptr() { return (int64_t *)(curBuf + 32); }
+        inline void ClearExtra()
        {
            if (curPos)
            {
                std::fill(curBuf + 32 + curPos, curBuf + 64, 0);
            }
        }
-        void ExtraHash(unsigned char hash[32]) { (*haraka512Function)(hash, curBuf); }
+
+        // Fills the unused tail of the second half of curBuf (from offset
+        // 32 + curPos up to 64) with repeated copies of the object at _data,
+        // truncating the final copy when it does not fit.
+        template <typename T>
+        inline void FillExtra(const T *_data)
+        {
+            unsigned char *src = (unsigned char *)_data;
+            unsigned int offset = curPos;
+            unsigned int remaining = 32 - offset;
+            do
+            {
+                // copy a whole T, or whatever space is left if that is smaller
+                unsigned int chunk = remaining;
+                if (chunk > sizeof(T))
+                {
+                    chunk = sizeof(T);
+                }
+                std::memcpy(curBuf + 32 + offset, src, chunk);
+                offset += chunk;
+                remaining -= chunk;
+            } while (remaining > 0);
+        }
+        // writes to hash the result of the selected haraka512 implementation run over curBuf
+        inline void ExtraHash(unsigned char hash[32]) { (*haraka512Function)(hash, curBuf); }
+        // same as ExtraHash, but calls the keyed haraka512 implementation and passes key through to it
+        inline void ExtraHashKeyed(unsigned char hash[32], u128 *key) { (*haraka512KeyedFunction)(hash, curBuf, key); }

        void Finalize(unsigned char hash[32])
        {
@ -109,9 +141,101 @@ class CVerusHashV2
                std::memcpy(hash, curBuf, 32);
        }

+        // chains Haraka256 from 32 bytes to fill the key
+        //
+        // The buffer behind verusclhasher_key is used as two regions of
+        // keySizeInBytes each: the first is the working key that is returned,
+        // the second holds the key as generated from the seed. The generated
+        // region is only rebuilt when seedBytes32 differs from the seed stored
+        // in the descriptor; the working region is refreshed from it on every call.
+        //
+        // seedBytes32: 32 byte seed, compared and stored as a uint256
+        // returns a pointer to the start of the working key
+        static u128 *GenNewCLKey(unsigned char *seedBytes32)
+        {
+            unsigned char *key = (unsigned char *)verusclhasher_key.get();
+            verusclhash_descr *pdesc = (verusclhash_descr *)verusclhasher_descr.get();
+            // skip keygen if it is the current key
+            if (pdesc->seed != *((uint256 *)seedBytes32))
+            {
+                // generate a new key by chain hashing with Haraka256 from the last curbuf
+                // number of whole 32 byte blocks, and the leftover bytes after them
+                int n256blks = pdesc->keySizeInBytes >> 5;
+                int nbytesExtra = pdesc->keySizeInBytes & 0x1f;
+                // output goes to the second region, directly after the working key
+                unsigned char *pkey = key + pdesc->keySizeInBytes;
+                unsigned char *psrc = seedBytes32;
+                for (int i = 0; i < n256blks; i++)
+                {
+                    // each block is the hash of the previous one (the seed for the first)
+                    (*haraka256Function)(pkey, psrc);
+                    psrc = pkey;
+                    pkey += 32;
+                }
+                if (nbytesExtra)
+                {
+                    // key size is not a multiple of 32: hash once more and keep only the needed bytes
+                    unsigned char buf[32];
+                    (*haraka256Function)(buf, psrc);
+                    memcpy(pkey, buf, nbytesExtra);
+                }
+                pdesc->seed = *((uint256 *)seedBytes32);
+            }
+            // restore the working key from the generated copy
+            memcpy(key, key + pdesc->keySizeInBytes, pdesc->keySizeInBytes);
+            return (u128 *)key;
+        }
+
+        // Maps an intermediate hash value to an offset counted in 16 byte
+        // entries by masking it with the key mask divided by 16.
+        inline uint64_t IntermediateTo128Offset(uint64_t intermediate)
+        {
+            // the mask is where we wrap
+            return intermediate & (vclh.keyMask >> 4);
+        }
+
+        // Produces the final 32 byte hash into `hash`. In order: pads the
+        // unused tail of curBuf with its own leading bytes, derives the CL key
+        // from the first 32 bytes of curBuf, runs the CL hash over curBuf with
+        // that key, overwrites the tail with the 64 bit intermediate result,
+        // and finally runs keyed haraka512 over curBuf using the key at an
+        // offset selected by the intermediate value.
+        void Finalize2b(unsigned char hash[32])
+        {
+            // fill buffer to the end with the beginning of it to prevent any foreknowledge of
+            // bits that may contain zero
+            FillExtra((u128 *)curBuf);
+
+#ifdef VERUSHASHDEBUG
+            uint256 *bhalf1 = (uint256 *)curBuf;
+            uint256 *bhalf2 = bhalf1 + 1;
+            printf("Curbuf: %s%s\n", bhalf1->GetHex().c_str(), bhalf2->GetHex().c_str());
+#endif
+
+            // gen new key with what is last in buffer
+            u128 *key = GenNewCLKey(curBuf);
+
+            // run verusclhash on the buffer
+            uint64_t intermediate = vclh(curBuf, key);
+
+            // fill buffer to the end with the result
+            FillExtra(&intermediate);
+
+#ifdef VERUSHASHDEBUG
+            printf("intermediate %lx\n", intermediate);
+            printf("Curbuf: %s%s\n", bhalf1->GetHex().c_str(), bhalf2->GetHex().c_str());
+            bhalf1 = (uint256 *)key;
+            bhalf2 = bhalf1 + ((vclh.keyMask + 1) >> 5);
+            printf("   Key: %s%s\n", bhalf1->GetHex().c_str(), bhalf2->GetHex().c_str());
+#endif
+
+            // get the final hash with a mutated dynamic key for each hash result
+            (*haraka512KeyedFunction)(hash, curBuf, key + IntermediateTo128Offset(intermediate));
+
+            // disabled self-test comparing against the portable implementation
+            /*
+            // TEST BEGIN
+            // test against the portable version
+            uint256 testHash1 = *(uint256 *)hash, testHash2;
+            FillExtra((u128 *)curBuf);
+            u128 *hashKey = ((u128 *)vclh.gethashkey());
+            uint64_t temp = verusclhash_port(key, curBuf, vclh.keyMask);
+            FillExtra(&temp);
+            haraka512_keyed((unsigned char *)&testHash2, curBuf, hashKey + IntermediateTo128Offset(intermediate));
+            if (testHash1 != testHash2)
+            {
+                printf("Portable version failed! intermediate1: %lx, intermediate2: %lx\n", intermediate, temp);
+            }
+            // END TEST
+            */
+        }
+
+        // returns the buffer currently being filled (the curBuf member)
+        inline unsigned char *CurBuffer() { return curBuf; }
+
    private:
        // only buf1, the first source, needs to be zero initialized
-        unsigned char buf1[64] = {0}, buf2[64];
+        alignas(32) unsigned char buf1[64] = {0}, buf2[64];
        unsigned char *curBuf = buf1, *result = buf2;
        size_t curPos = 0;
 };
@ -119,15 +243,4 @@ class CVerusHashV2
 extern void verus_hash(void *result, const void *data, size_t len);
 extern void verus_hash_v2(void *result, const void *data, size_t len);

-inline bool IsCPUVerusOptimized()
-{
-    unsigned int eax,ebx,ecx,edx;
-
-    if (!__get_cpuid(1,&eax,&ebx,&ecx,&edx))
-    {
-        return false;
-    }
-    return ((ecx & (bit_AVX | bit_AES)) == (bit_AVX | bit_AES));
-};
-
 #endif
--- a/verushash.cc
+++ b/verushash.cc
@ -10,13 +10,23 @@
 using namespace v8;

 CVerusHash* vh;
+CVerusHashV2* vh2;
 bool initialized = false;

-void verusInit(const v8::FunctionCallbackInfo<Value>& args) {
+void initialize()
+{
+    if (!initialized)
+    {
+        CVerusHash::init();
+        CVerusHashV2::init();
+    }
    vh = new CVerusHash();
-    vh->init();
+    vh2 = new CVerusHashV2();
    initialized = true;
+}

+void verusInit(const v8::FunctionCallbackInfo<Value>& args) {
+    initialize();
    args.GetReturnValue().Set(args.This());
 }

@ -95,14 +105,140 @@ void verusHash(const v8::FunctionCallbackInfo<Value>& args) {
    char *result = new char[32];
    
    if (initialized == false) {
-        CVerusHash::init();
-        initialized = true;
+        initialize();
    }
    verus_hash(result, buff, node::Buffer::Length(buffer));
    
    args.GetReturnValue().Set(Nan::NewBuffer(result, 32).ToLocalChecked());
 }

+// update2(buffer): appends the bytes of a Node Buffer to the shared V2 hasher.
+// Throws a TypeError and does nothing else when init() has not been called,
+// when no argument is given, or when the argument is not a Buffer.
+// Returns `this` on success.
+void verusUpdateV2(const v8::FunctionCallbackInfo<Value>& args) {
+    Isolate* isolate = Isolate::GetCurrent();
+    HandleScope scope(isolate);
+    if (initialized == false){
+        isolate->ThrowException(
+            Exception::TypeError(String::NewFromUtf8(isolate, "call init() first!"))
+        );
+        // vh2 is only allocated by initialize(); stop here instead of dereferencing it
+        return;
+    }
+    if (args.Length() < 1) {
+        isolate->ThrowException(
+            Exception::TypeError(String::NewFromUtf8(isolate, "Wrong number of arguments"))
+        );
+        return;
+    }
+    Local<Object> buffer = args[0]->ToObject();
+    if(!node::Buffer::HasInstance(buffer)) {
+        isolate->ThrowException(
+            Exception::TypeError(String::NewFromUtf8(isolate, "Invalid buffer objects."))
+        );
+        return;
+    }
+
+    const char *buff = node::Buffer::Data(buffer);
+    vh2->Write((const unsigned char *)buff, node::Buffer::Length(buffer));
+
+    args.GetReturnValue().Set(args.This());
+}
+
+// digest2(): finalizes the shared V2 hasher with Finalize() and returns the
+// 32 byte result as a new Buffer. Throws a TypeError and returns nothing when
+// init() has not been called.
+void verusDigestV2(const v8::FunctionCallbackInfo<Value>& args) {
+    Isolate* isolate = Isolate::GetCurrent();
+    HandleScope scope(isolate);
+    if (initialized == false){
+        isolate->ThrowException(
+            Exception::TypeError(String::NewFromUtf8(isolate, "call init() first!"))
+        );
+        // vh2 is only allocated by initialize(); stop here instead of dereferencing it
+        return;
+    }
+    // Hash into a local buffer and let Nan copy it. Nan::NewBuffer(char *, size)
+    // takes ownership of the pointer and releases it with free(), which does
+    // not match memory obtained from new[].
+    char result[32];
+    vh2->Finalize((unsigned char *)result);
+    args.GetReturnValue().Set(Nan::CopyBuffer(result, 32).ToLocalChecked());
+}
+
+// digest2b(): finalizes the shared V2 hasher with Finalize2b() and returns the
+// 32 byte result as a new Buffer. Throws a TypeError and returns nothing when
+// init() has not been called.
+void verusDigestV2b(const v8::FunctionCallbackInfo<Value>& args) {
+    Isolate* isolate = Isolate::GetCurrent();
+    HandleScope scope(isolate);
+    if (initialized == false){
+        isolate->ThrowException(
+            Exception::TypeError(String::NewFromUtf8(isolate, "call init() first!"))
+        );
+        // vh2 is only allocated by initialize(); stop here instead of dereferencing it
+        return;
+    }
+    // Hash into a local buffer and let Nan copy it. Nan::NewBuffer(char *, size)
+    // takes ownership of the pointer and releases it with free(), which does
+    // not match memory obtained from new[].
+    char result[32];
+    vh2->Finalize2b((unsigned char *)result);
+    args.GetReturnValue().Set(Nan::CopyBuffer(result, 32).ToLocalChecked());
+}
+
+// reset2(): resets the shared V2 hasher and returns `this`. Throws a TypeError
+// and does nothing else when init() has not been called.
+void verusResetV2(const v8::FunctionCallbackInfo<Value>& args) {
+    Isolate* isolate = Isolate::GetCurrent();
+    HandleScope scope(isolate);
+    if (initialized == false){
+        isolate->ThrowException(
+            Exception::TypeError(String::NewFromUtf8(isolate, "call init() first!"))
+        );
+        // vh2 is only allocated by initialize(); stop here instead of dereferencing it
+        return;
+    }
+    vh2->Reset();
+    args.GetReturnValue().Set(args.This());
+}
+
+// hash2(buffer): one-shot V2 hash of a Node Buffer, finalized with Finalize().
+// Initializes the module on first use. Returns the 32 byte result as a new
+// Buffer; throws a TypeError when no argument is given or it is not a Buffer.
+// Note: this resets and reuses the shared vh2 instance, so any data fed
+// through update2() beforehand is discarded.
+void verusHashV2(const v8::FunctionCallbackInfo<Value>& args) {
+    Isolate* isolate = Isolate::GetCurrent();
+    HandleScope scope(isolate);
+    if (args.Length() < 1) {
+        isolate->ThrowException(
+            Exception::TypeError(String::NewFromUtf8(isolate, "Wrong number of arguments"))
+        );
+        return;
+    }
+    Local<Object> buffer = args[0]->ToObject();
+    if(!node::Buffer::HasInstance(buffer)) {
+        isolate->ThrowException(
+            Exception::TypeError(String::NewFromUtf8(isolate, "Invalid buffer objects."))
+        );
+        return;
+    }
+
+    const char *buff = node::Buffer::Data(buffer);
+
+    if (initialized == false) {
+        initialize();
+    }
+
+    // Hash into a local buffer and let Nan copy it. Nan::NewBuffer(char *, size)
+    // takes ownership of the pointer and releases it with free(), which does
+    // not match memory obtained from new[].
+    char result[32];
+
+    vh2->Reset();
+    vh2->Write((const unsigned char *)buff, node::Buffer::Length(buffer));
+    vh2->Finalize((unsigned char *)result);
+    args.GetReturnValue().Set(Nan::CopyBuffer(result, 32).ToLocalChecked());
+}
+
+// hash2b(buffer): one-shot V2 hash of a Node Buffer, finalized with Finalize2b().
+// Initializes the module on first use. Returns the 32 byte result as a new
+// Buffer; throws a TypeError when no argument is given or it is not a Buffer.
+// Note: this resets and reuses the shared vh2 instance, so any data fed
+// through update2() beforehand is discarded.
+void verusHashV2b(const v8::FunctionCallbackInfo<Value>& args) {
+    Isolate* isolate = Isolate::GetCurrent();
+    HandleScope scope(isolate);
+    if (args.Length() < 1) {
+        isolate->ThrowException(
+            Exception::TypeError(String::NewFromUtf8(isolate, "Wrong number of arguments"))
+        );
+        return;
+    }
+    Local<Object> buffer = args[0]->ToObject();
+    if(!node::Buffer::HasInstance(buffer)) {
+        isolate->ThrowException(
+            Exception::TypeError(String::NewFromUtf8(isolate, "Invalid buffer objects."))
+        );
+        return;
+    }
+
+    const char *buff = node::Buffer::Data(buffer);
+
+    if (initialized == false) {
+        initialize();
+    }
+
+    // Hash into a local buffer and let Nan copy it. Nan::NewBuffer(char *, size)
+    // takes ownership of the pointer and releases it with free(), which does
+    // not match memory obtained from new[].
+    char result[32];
+
+    vh2->Reset();
+    vh2->Write((const unsigned char *)buff, node::Buffer::Length(buffer));
+    vh2->Finalize2b((unsigned char *)result);
+    args.GetReturnValue().Set(Nan::CopyBuffer(result, 32).ToLocalChecked());
+}

 void Init(Handle<Object> exports) {
  NODE_SET_METHOD(exports, "init", verusInit);
@ -110,6 +246,12 @@ void Init(Handle<Object> exports) {
  NODE_SET_METHOD(exports, "digest", verusDigest);
  NODE_SET_METHOD(exports, "reset", verusReset);
  NODE_SET_METHOD(exports, "hash", verusHash);
+  NODE_SET_METHOD(exports, "update2", verusUpdateV2);
+  NODE_SET_METHOD(exports, "digest2", verusDigestV2);
+  NODE_SET_METHOD(exports, "digest2b", verusDigestV2b);
+  NODE_SET_METHOD(exports, "reset2", verusResetV2);
+  NODE_SET_METHOD(exports, "hash2", verusHashV2);
+  NODE_SET_METHOD(exports, "hash2b", verusHashV2b);
 }

 NODE_MODULE(verushash, Init)