diff --git a/crypto/verus_clhash.cpp b/crypto/verus_clhash.cpp
index 57182a7..92b590e 100644
--- a/crypto/verus_clhash.cpp
+++ b/crypto/verus_clhash.cpp
@@ -17,23 +17,34 @@
  *
 **/
-
 #include "verus_hash.h"
 #include <assert.h>
 #include <string.h>
+
+#ifdef _WIN32
+#pragma warning (disable : 4146)
+#include <intrin.h>
+#endif
+int __cpuverusoptimized = 0x80;
+
+#if defined(__arm__) || defined(__aarch64__)
+#include "crypto/SSE2NEON.h"
+#else
 #include <x86intrin.h>
+#endif
 
-#ifdef __WIN32
+#ifdef _WIN32
 #define posix_memalign(p, a, s) (((*(p)) = _aligned_malloc((s), (a))), *(p) ?0 :errno)
 #endif
 
 thread_local thread_specific_ptr verusclhasher_key;
 thread_local thread_specific_ptr verusclhasher_descr;
 
-#ifdef _WIN32
-// attempt to workaround horrible mingw/gcc destructor bug on Windows, which passes garbage in the this pointer
-// we use the opportunity of control here to clean up all of our tls variables. we could keep a list, but this is a quick hack
+#if defined(__APPLE__) || defined(_WIN32)
+// attempt to work around a horrible mingw/gcc destructor bug on Windows and Mac, which passes garbage in the this pointer.
+// we use the opportunity of control here to clean up all of our TLS variables. we could keep a list, but this is a safe,
+// functional hack
 thread_specific_ptr::~thread_specific_ptr() {
     if (verusclhasher_key.ptr)
     {
@@ -44,20 +55,107 @@ thread_specific_ptr::~thread_specific_ptr() {
         verusclhasher_descr.reset();
     }
 }
-#endif
+#endif // defined(__APPLE__) || defined(_WIN32)
+#if defined(__arm__) || defined(__aarch64__) // intrinsics not defined in SSE2NEON.h
+
+static inline __attribute__((always_inline)) __m128i _mm_set_epi64x(uint64_t hi, uint64_t lo)
+{
+    __m128i result;
+    ((uint64_t *)&result)[0] = lo;
+    ((uint64_t *)&result)[1] = hi;
+    return result;
+}
+
+static inline __attribute__((always_inline)) __m128i _mm_mulhrs_epi16(__m128i _a, __m128i _b)
+{
+    int16_t result[8];
+    int16_t *a = (int16_t*)&_a, *b = (int16_t*)&_b;
+    for (int i = 0; i < 8; i++)
+    {
+        result[i] = (int16_t)((((int32_t)(a[i]) * (int32_t)(b[i])) + 0x4000) >> 15);
+    }
+
+    return *(__m128i *)result;
+}
 
-int __cpuverusoptimized = 0x80;
+__m128i _mm_cvtsi64_si128(uint64_t lo)
+{
+    __m128i result;
+    ((uint64_t *)&result)[0] = lo;
+    ((uint64_t *)&result)[1] = 0;
+    return result;
+}
+
+static inline __attribute__((always_inline)) uint8x16_t _mm_aesenc_si128(uint8x16_t a, uint8x16_t RoundKey)
+{
+    return vaesmcq_u8(vaeseq_u8(a, (uint8x16_t){})) ^ RoundKey;
+}
+
+// NB: imm is ignored; this emulation always multiplies the high qword of a by the
+// low qword of b, which covers the single 0x01 call and the symmetric (a == b) calls in this file
+static inline __attribute__((always_inline)) __m128i _mm_clmulepi64_si128(const __m128i a, const __m128i &b, int imm)
+{
+    return (__m128i)vmull_p64(vgetq_lane_u64(a, 1), vgetq_lane_u64(b, 0));
+}
+
+__m128i _mm_setr_epi8(u_char c0, u_char c1, u_char c2, u_char c3, u_char c4, u_char c5, u_char c6, u_char c7, u_char c8, u_char c9, u_char c10, u_char c11, u_char c12, u_char c13, u_char c14, u_char c15)
+{
+    __m128i result;
+    ((uint8_t *)&result)[0] = c0;
+    ((uint8_t *)&result)[1] = c1;
+    ((uint8_t *)&result)[2] = c2;
+    ((uint8_t *)&result)[3] = c3;
+    ((uint8_t *)&result)[4] = c4;
+    ((uint8_t *)&result)[5] = c5;
+    ((uint8_t *)&result)[6] = c6;
+    ((uint8_t *)&result)[7] = c7;
+    ((uint8_t *)&result)[8] = c8;
+    ((uint8_t *)&result)[9] = c9;
+    ((uint8_t *)&result)[10] = c10;
+    ((uint8_t *)&result)[11] = c11;
+    ((uint8_t *)&result)[12] = c12;
+    ((uint8_t *)&result)[13] = c13;
+    ((uint8_t *)&result)[14] = c14;
+    ((uint8_t *)&result)[15] = c15;
+    return result;
+}
+
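// --- editorial aside, not part of the patch ---
// A minimal sketch of why the scalar _mm_mulhrs_epi16 fallback above matches the
// SSSE3 semantics: each 16-bit lane computes (a*b + 0x4000) >> 15 in 32-bit
// arithmetic, i.e. a Q15 fixed-point multiply with round-to-nearest. The helper
// name and test values below are hypothetical, for illustration only.

#include <stdint.h>
#include <assert.h>

static int16_t mulhrs16(int16_t a, int16_t b)
{
    // widen, multiply, round at bit 14, keep bits 15..30
    return (int16_t)((((int32_t)a * (int32_t)b) + 0x4000) >> 15);
}

int main()
{
    assert(mulhrs16(0x4000, 0x4000) == 0x2000); // 0.5 * 0.5 == 0.25 in Q15
    assert(mulhrs16(0x7fff, 0x7fff) == 0x7ffe); // ~1 * ~1 rounds down to 0x7ffe
    return 0;
}
// --- end aside ---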
+__m128i _mm_shuffle_epi8(__m128i a, __m128i b)
+{
+    __m128i result;
+    for (int i = 0; i < 16; i++)
+    {
+        if (((uint8_t *)&b)[i] & 0x80)
+        {
+            ((uint8_t *)&result)[i] = 0;
+        }
+        else
+        {
+            ((uint8_t *)&result)[i] = ((uint8_t *)&a)[((uint8_t *)&b)[i] & 0xf];
+        }
+    }
+    return result;
+}
+
+int64_t _mm_cvtsi128_si64(__m128i a)
+{
+    return ((int64_t *)&a)[0];
+}
+
+__m128i _mm_loadl_epi64(__m128i *a)
+{
+    __m128i b = {0};
+    ((uint64_t*)&b)[0] = ((uint64_t*)a)[0];
+    return b;
+}
+#endif // defined(__arm__) || defined(__aarch64__)
 
-// multiply the length and the some key, no modulo
-static inline __m128i lazyLengthHash(uint64_t keylength, uint64_t length) {
+// multiply the length by some key, no modulo
+static inline __attribute__((always_inline)) __m128i lazyLengthHash(uint64_t keylength, uint64_t length) {
+    const __m128i lengthvector = _mm_set_epi64x(keylength, length);
     const __m128i clprod1 = _mm_clmulepi64_si128( lengthvector, lengthvector, 0x10);
     return clprod1;
 }
 
 // modulo reduction to 64-bit value. The high 64 bits contain garbage, see precompReduction64
-static inline __m128i precompReduction64_si128( __m128i A) {
-
+static inline __attribute__((always_inline)) __m128i precompReduction64_si128( __m128i A) {
     //const __m128i C = _mm_set_epi64x(1U,(1U<<4)+(1U<<3)+(1U<<1)+(1U<<0)); // C is the irreducible poly. (64,4,3,1,0)
     const __m128i C = _mm_cvtsi64_si128((1U<<4)+(1U<<3)+(1U<<1)+(1U<<0));
     __m128i Q2 = _mm_clmulepi64_si128( A, C, 0x01);
@@ -68,12 +166,52 @@ static inline __m128i precompReduction64_si128( __m128i A) {
     return final; /// WARNING: HIGH 64 BITS CONTAIN GARBAGE
 }
 
-static inline uint64_t precompReduction64( __m128i A) {
+static inline __attribute__((always_inline)) uint64_t precompReduction64( __m128i A) {
     return _mm_cvtsi128_si64(precompReduction64_si128(A));
 }
 
+// restore every mutated key slot recorded in pMoveScratch from the pristine key
+// copy that sits keySizeInBytes above it (ofs is in __m128i units)
+static inline __attribute__((always_inline)) void fixupkey(__m128i **pMoveScratch, verusclhash_descr *pdesc) {
+    uint32_t ofs = pdesc->keySizeInBytes >> 4;
+    for (__m128i *pfixup = *pMoveScratch; pfixup; pfixup = *++pMoveScratch)
+    {
+        const __m128i fixup = _mm_load_si128((__m128i *)(pfixup + ofs));
+        _mm_store_si128((__m128i *)pfixup, fixup);
+    }
+}
+
+static inline __attribute__((always_inline)) void haraka512_keyed_local(unsigned char *out, const unsigned char *in, const u128 *rc) {
+    u128 s[4], tmp;
+
+    s[0] = LOAD(in);
+    s[1] = LOAD(in + 16);
+    s[2] = LOAD(in + 32);
+    s[3] = LOAD(in + 48);
+
+    AES4(s[0], s[1], s[2], s[3], 0);
+    MIX4(s[0], s[1], s[2], s[3]);
+
+    AES4(s[0], s[1], s[2], s[3], 8);
+    MIX4(s[0], s[1], s[2], s[3]);
+
+    AES4(s[0], s[1], s[2], s[3], 16);
+    MIX4(s[0], s[1], s[2], s[3]);
+
+    AES4(s[0], s[1], s[2], s[3], 24);
+    MIX4(s[0], s[1], s[2], s[3]);
+
+    AES4(s[0], s[1], s[2], s[3], 32);
+    MIX4(s[0], s[1], s[2], s[3]);
+
+    s[0] = _mm_xor_si128(s[0], LOAD(in));
+    s[1] = _mm_xor_si128(s[1], LOAD(in + 16));
+    s[2] = _mm_xor_si128(s[2], LOAD(in + 32));
+    s[3] = _mm_xor_si128(s[3], LOAD(in + 48));
+
+    TRUNCSTORE(out, s[0], s[1], s[2], s[3]);
+}
+
 // verus intermediate hash extra
-static __m128i __verusclmulwithoutreduction64alignedrepeat(__m128i *randomsource, const __m128i buf[4], uint64_t keyMask)
+__m128i __verusclmulwithoutreduction64alignedrepeat(__m128i *randomsource, const __m128i buf[4], uint64_t keyMask, __m128i **pMoveScratch)
 {
     __m128i const *pbuf;
 
@@ -93,6 +231,9 @@ static __m128i __verusclmulwithoutreduction64alignedrepeat(__m128i *randomsource
     __m128i *prand = randomsource + ((selector >> 5) & keyMask);
     __m128i *prandex = randomsource + ((selector >> 32) & keyMask);
 
+    *(pMoveScratch++) = prand;
+    *(pMoveScratch++) = prandex;
+
     // select random start and order of pbuf processing
     pbuf = buf + (selector & 3);
 
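// --- editorial aside, not part of the patch ---
// Usage model implied by the new pMoveScratch parameter and fixupkey(): the hash
// records every key slot it mutates (prand and prandex, two per round) into a
// caller-supplied, NULL-terminated array of __m128i*, and fixupkey() then restores
// each recorded slot from the pristine key copy kept keySizeInBytes above the
// working copy. A hedged sketch of that calling sequence; the wrapper name, the
// back-to-back key layout, and the scratch array size are illustrative
// assumptions, not taken from this patch:

#include <cstring>

static void hash_and_refresh_key_sketch(void *keyWorkingAndPristine, // working key + pristine copy, back to back
                                        const unsigned char buf[64],
                                        uint64_t keyMask,
                                        verusclhash_descr *pdesc)
{
    // 32 rounds record two mutated slots each; one extra slot holds the NULL terminator
    __m128i *moveScratch[65];
    std::memset(moveScratch, 0, sizeof(moveScratch));

    (void)verusclhash_sv2_1(keyWorkingAndPristine, buf, keyMask, moveScratch);

    // undo the in-place key mutations so the next hash starts from a clean key
    fixupkey(moveScratch, pdesc);
}
// --- end aside ---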
@@ -329,20 +470,306 @@ static __m128i __verusclmulwithoutreduction64alignedrepeat(__m128i *randomsource
 
 // hashes 64 bytes only by doing a carryless multiplication and reduction of the repeated 64 byte sequence 16 times,
 // returning a 64 bit hash value
-uint64_t verusclhash(void * random, const unsigned char buf[64], uint64_t keyMask) {
-    __m128i acc = __verusclmulwithoutreduction64alignedrepeat((__m128i *)random, (const __m128i *)buf, keyMask);
+uint64_t verusclhash(void * random, const unsigned char buf[64], uint64_t keyMask, __m128i **pMoveScratch) {
+    __m128i acc = __verusclmulwithoutreduction64alignedrepeat((__m128i *)random, (const __m128i *)buf, keyMask, pMoveScratch);
     acc = _mm_xor_si128(acc, lazyLengthHash(1024, 64));
     return precompReduction64(acc);
 }
 
-#ifdef __WIN32
-#define posix_memalign(p, a, s) (((*(p)) = _aligned_malloc((s), (a))), *(p) ?0 :errno)
-#endif
+// hashes 64 bytes only by doing a carryless multiplication and reduction of the repeated 64 byte sequence 16 times,
+// returning a 64 bit hash value
+uint64_t verusclhash_sv2_1(void * random, const unsigned char buf[64], uint64_t keyMask, __m128i **pMoveScratch) {
+    __m128i acc = __verusclmulwithoutreduction64alignedrepeat_sv2_1((__m128i *)random, (const __m128i *)buf, keyMask, pMoveScratch);
+    acc = _mm_xor_si128(acc, lazyLengthHash(1024, 64));
+    return precompReduction64(acc);
+}
+
+__m128i __verusclmulwithoutreduction64alignedrepeat_sv2_1(__m128i *randomsource, const __m128i buf[4], uint64_t keyMask, __m128i **pMoveScratch)
+{
+    const __m128i pbuf_copy[4] = {_mm_xor_si128(buf[0], buf[2]), _mm_xor_si128(buf[1], buf[3]), buf[2], buf[3]};
+    const __m128i *pbuf;
+
+    // divide key mask by 16 from bytes to __m128i
+    keyMask >>= 4;
+
+    // the random buffer must have at least 32 16-byte (128-bit) entries after the keymask to work with this
+    // algorithm. we take the value from the last element inside the keyMask + 2, as that will never
+    // be used to xor into the accumulator before it is hashed with other values first
+    __m128i acc = _mm_load_si128(randomsource + (keyMask + 2));
+
+    for (int64_t i = 0; i < 32; i++)
+    {
+        const uint64_t selector = _mm_cvtsi128_si64(acc);
+
+        // get two random locations in the key, which will be mutated and swapped
+        __m128i *prand = randomsource + ((selector >> 5) & keyMask);
+        __m128i *prandex = randomsource + ((selector >> 32) & keyMask);
+
+        *(pMoveScratch++) = prand;
+        *(pMoveScratch++) = prandex;
+
+        // select random start and order of pbuf processing
+        pbuf = pbuf_copy + (selector & 3);
+
+        switch (selector & 0x1c)
+        {
+            case 0:
+            {
+                const __m128i temp1 = _mm_load_si128(prandex);
+                const __m128i temp2 = _mm_load_si128(pbuf - (((selector & 1) << 1) - 1));
+                const __m128i add1 = _mm_xor_si128(temp1, temp2);
+                const __m128i clprod1 = _mm_clmulepi64_si128(add1, add1, 0x10);
+                acc = _mm_xor_si128(clprod1, acc);
+
+                const __m128i tempa1 = _mm_mulhrs_epi16(acc, temp1);
+                const __m128i tempa2 = _mm_xor_si128(tempa1, temp1);
+
+                const __m128i temp12 = _mm_load_si128(prand);
+                _mm_store_si128(prand, tempa2);
+
+                const __m128i temp22 = _mm_load_si128(pbuf);
+                const __m128i add12 = _mm_xor_si128(temp12, temp22);
+                const __m128i clprod12 = _mm_clmulepi64_si128(add12, add12, 0x10);
+                acc = _mm_xor_si128(clprod12, acc);
+
+                const __m128i tempb1 = _mm_mulhrs_epi16(acc, temp12);
+                const __m128i tempb2 = _mm_xor_si128(tempb1, temp12);
+                _mm_store_si128(prandex, tempb2);
+                break;
+            }
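// --- editorial aside, not part of the patch ---
// Every case in this switch picks the "other" 16-byte lane of the buffer with the
// branch-free expression pbuf - (((selector & 1) << 1) - 1): the inner term is -1
// when the low selector bit is 0 and +1 when it is 1, so the load steps to pbuf+1
// or pbuf-1 respectively. Because pbuf = pbuf_copy + (selector & 3), a set low bit
// implies an index of 1 or 3, so the -1 step also never leaves pbuf_copy[0..3].
// A standalone restatement with a hypothetical helper name:
//
//     static inline int other_lane_step_sketch(uint64_t selector)
//     {
//         return -((int)((selector & 1) << 1) - 1); // 0 -> +1, 1 -> -1
//     }
// --- end aside ---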
+            case 4:
+            {
+                const __m128i temp1 = _mm_load_si128(prand);
+                const __m128i temp2 = _mm_load_si128(pbuf);
+                const __m128i add1 = _mm_xor_si128(temp1, temp2);
+                const __m128i clprod1 = _mm_clmulepi64_si128(add1, add1, 0x10);
+                acc = _mm_xor_si128(clprod1, acc);
+                const __m128i clprod2 = _mm_clmulepi64_si128(temp2, temp2, 0x10);
+                acc = _mm_xor_si128(clprod2, acc);
+
+                const __m128i tempa1 = _mm_mulhrs_epi16(acc, temp1);
+                const __m128i tempa2 = _mm_xor_si128(tempa1, temp1);
+
+                const __m128i temp12 = _mm_load_si128(prandex);
+                _mm_store_si128(prandex, tempa2);
+
+                const __m128i temp22 = _mm_load_si128(pbuf - (((selector & 1) << 1) - 1));
+                const __m128i add12 = _mm_xor_si128(temp12, temp22);
+                acc = _mm_xor_si128(add12, acc);
+
+                const __m128i tempb1 = _mm_mulhrs_epi16(acc, temp12);
+                const __m128i tempb2 = _mm_xor_si128(tempb1, temp12);
+                _mm_store_si128(prand, tempb2);
+                break;
+            }
+            case 8:
+            {
+                const __m128i temp1 = _mm_load_si128(prandex);
+                const __m128i temp2 = _mm_load_si128(pbuf);
+                const __m128i add1 = _mm_xor_si128(temp1, temp2);
+                acc = _mm_xor_si128(add1, acc);
+
+                const __m128i tempa1 = _mm_mulhrs_epi16(acc, temp1);
+                const __m128i tempa2 = _mm_xor_si128(tempa1, temp1);
+
+                const __m128i temp12 = _mm_load_si128(prand);
+                _mm_store_si128(prand, tempa2);
+
+                const __m128i temp22 = _mm_load_si128(pbuf - (((selector & 1) << 1) - 1));
+                const __m128i add12 = _mm_xor_si128(temp12, temp22);
+                const __m128i clprod12 = _mm_clmulepi64_si128(add12, add12, 0x10);
+                acc = _mm_xor_si128(clprod12, acc);
+                const __m128i clprod22 = _mm_clmulepi64_si128(temp22, temp22, 0x10);
+                acc = _mm_xor_si128(clprod22, acc);
+
+                const __m128i tempb1 = _mm_mulhrs_epi16(acc, temp12);
+                const __m128i tempb2 = _mm_xor_si128(tempb1, temp12);
+                _mm_store_si128(prandex, tempb2);
+                break;
+            }
+            case 0xc:
+            {
+                const __m128i temp1 = _mm_load_si128(prand);
+                const __m128i temp2 = _mm_load_si128(pbuf - (((selector & 1) << 1) - 1));
+                const __m128i add1 = _mm_xor_si128(temp1, temp2);
+
+                // cannot be zero here
+                const int32_t divisor = (uint32_t)selector;
+
+                acc = _mm_xor_si128(add1, acc);
+
+                const int64_t dividend = _mm_cvtsi128_si64(acc);
+                const __m128i modulo = _mm_cvtsi32_si128(dividend % divisor);
+                acc = _mm_xor_si128(modulo, acc);
+
+                const __m128i tempa1 = _mm_mulhrs_epi16(acc, temp1);
+                const __m128i tempa2 = _mm_xor_si128(tempa1, temp1);
+
+                if (dividend & 1)
+                {
+                    const __m128i temp12 = _mm_load_si128(prandex);
+                    _mm_store_si128(prandex, tempa2);
+
+                    const __m128i temp22 = _mm_load_si128(pbuf);
+                    const __m128i add12 = _mm_xor_si128(temp12, temp22);
+                    const __m128i clprod12 = _mm_clmulepi64_si128(add12, add12, 0x10);
+                    acc = _mm_xor_si128(clprod12, acc);
+                    const __m128i clprod22 = _mm_clmulepi64_si128(temp22, temp22, 0x10);
+                    acc = _mm_xor_si128(clprod22, acc);
+
+                    const __m128i tempb1 = _mm_mulhrs_epi16(acc, temp12);
+                    const __m128i tempb2 = _mm_xor_si128(tempb1, temp12);
+                    _mm_store_si128(prand, tempb2);
+                }
+                else
+                {
+                    const __m128i tempb3 = _mm_load_si128(prandex);
+                    _mm_store_si128(prandex, tempa2);
+                    _mm_store_si128(prand, tempb3);
+                }
+                break;
+            }
+            case 0x10:
+            {
+                // a few AES operations
+                const __m128i *rc = prand;
+                __m128i tmp;
+
+                __m128i temp1 = _mm_load_si128(pbuf - (((selector & 1) << 1) - 1));
+                __m128i temp2 = _mm_load_si128(pbuf);
+
+                AES2(temp1, temp2, 0);
+                MIX2(temp1, temp2);
+
+                AES2(temp1, temp2, 4);
+                MIX2(temp1, temp2);
+
+                AES2(temp1, temp2, 8);
+                MIX2(temp1, temp2);
+
+                acc = _mm_xor_si128(temp2, _mm_xor_si128(temp1, acc));
+
+                const __m128i tempa1 = _mm_load_si128(prand);
+                const __m128i tempa2 = _mm_mulhrs_epi16(acc, tempa1);
+                const __m128i tempa3 = _mm_xor_si128(tempa1, tempa2);
+
+                const __m128i tempa4 = _mm_load_si128(prandex);
+                _mm_store_si128(prandex, tempa3);
+                _mm_store_si128(prand, tempa4);
+                break;
+            }
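// --- editorial aside, not part of the patch ---
// On the "cannot be zero here" comment in case 0xc above: that path only runs when
// (selector & 0x1c) == 0xc, so bits 2 and 3 of the selector are set and the
// truncated divisor (uint32_t)selector is necessarily nonzero. It may still be
// negative once reinterpreted as int32_t; C++11's % truncates toward zero, so
// |dividend % divisor| < |divisor| always fits the 32-bit lane written by
// _mm_cvtsi32_si128. Restated standalone with a hypothetical helper name:
//
//     static inline __m128i modulo_fold_sketch(__m128i acc, uint64_t selector)
//     {
//         const int32_t divisor = (uint32_t)selector;      // nonzero: bits 2,3 set
//         const int64_t dividend = _mm_cvtsi128_si64(acc); // low 64 bits of acc
//         return _mm_xor_si128(_mm_cvtsi32_si128((int32_t)(dividend % divisor)), acc);
//     }
// --- end aside ---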
+            case 0x14:
+            {
+                // we'll just call this one the monkins loop, inspired by Chris - modified to cast to uint64_t on shift for more variability in the loop
+                const __m128i *buftmp = pbuf - (((selector & 1) << 1) - 1);
+                __m128i tmp; // used by MIX2
+
+                uint64_t rounds = selector >> 61; // loop randomly between 1 and 8 times
+                __m128i *rc = prand;
+                uint64_t aesroundoffset = 0;
+                __m128i onekey;
+
+                do
+                {
+                    if (selector & (((uint64_t)0x10000000) << rounds))
+                    {
+                        onekey = _mm_load_si128(rc++);
+                        const __m128i temp2 = _mm_load_si128(rounds & 1 ? pbuf : buftmp);
+                        const __m128i add1 = _mm_xor_si128(onekey, temp2);
+                        const __m128i clprod1 = _mm_clmulepi64_si128(add1, add1, 0x10);
+                        acc = _mm_xor_si128(clprod1, acc);
+                    }
+                    else
+                    {
+                        onekey = _mm_load_si128(rc++);
+                        __m128i temp2 = _mm_load_si128(rounds & 1 ? buftmp : pbuf);
+                        AES2(onekey, temp2, aesroundoffset);
+                        aesroundoffset += 4;
+                        MIX2(onekey, temp2);
+                        acc = _mm_xor_si128(onekey, acc);
+                        acc = _mm_xor_si128(temp2, acc);
+                    }
+                } while (rounds--);
+
+                const __m128i tempa1 = _mm_load_si128(prand);
+                const __m128i tempa2 = _mm_mulhrs_epi16(acc, tempa1);
+                const __m128i tempa3 = _mm_xor_si128(tempa1, tempa2);
+
+                const __m128i tempa4 = _mm_load_si128(prandex);
+                _mm_store_si128(prandex, tempa3);
+                _mm_store_si128(prand, tempa4);
+                break;
+            }
+            case 0x18:
+            {
+                const __m128i *buftmp = pbuf - (((selector & 1) << 1) - 1);
+                __m128i tmp; // used by MIX2
+
+                uint64_t rounds = selector >> 61; // loop randomly between 1 and 8 times
+                __m128i *rc = prand;
+                uint64_t aesroundoffset = 0;
+                __m128i onekey;
+
+                do
+                {
+                    if (selector & (((uint64_t)0x10000000) << rounds))
+                    {
+                        onekey = _mm_load_si128(rc++);
+                        const __m128i temp2 = _mm_load_si128(rounds & 1 ? pbuf : buftmp);
+                        const __m128i add1 = _mm_xor_si128(onekey, temp2);
+                        // cannot be zero here, may be negative
+                        const int32_t divisor = (uint32_t)selector;
+                        const int64_t dividend = _mm_cvtsi128_si64(add1);
+                        const __m128i modulo = _mm_cvtsi32_si128(dividend % divisor);
+                        acc = _mm_xor_si128(modulo, acc);
+                    }
+                    else
+                    {
+                        onekey = _mm_load_si128(rc++);
+                        __m128i temp2 = _mm_load_si128(rounds & 1 ? buftmp : pbuf);
+                        const __m128i add1 = _mm_xor_si128(onekey, temp2);
+                        const __m128i clprod1 = _mm_clmulepi64_si128(add1, add1, 0x10);
+                        const __m128i clprod2 = _mm_mulhrs_epi16(acc, clprod1);
+                        acc = _mm_xor_si128(clprod2, acc);
+                    }
+                } while (rounds--);
+
+                const __m128i tempa3 = _mm_load_si128(prandex);
+                const __m128i tempa4 = _mm_xor_si128(tempa3, acc);
+                _mm_store_si128(prandex, tempa4);
+                _mm_store_si128(prand, onekey);
+                break;
+            }
+            case 0x1c:
+            {
+                const __m128i temp1 = _mm_load_si128(pbuf);
+                const __m128i temp2 = _mm_load_si128(prandex);
+                const __m128i add1 = _mm_xor_si128(temp1, temp2);
+                const __m128i clprod1 = _mm_clmulepi64_si128(add1, add1, 0x10);
+                acc = _mm_xor_si128(clprod1, acc);
+
+                const __m128i tempa1 = _mm_mulhrs_epi16(acc, temp2);
+                const __m128i tempa2 = _mm_xor_si128(tempa1, temp2);
+
+                const __m128i tempa3 = _mm_load_si128(prand);
+                _mm_store_si128(prand, tempa2);
+
+                acc = _mm_xor_si128(tempa3, acc);
+
+                const __m128i tempb1 = _mm_mulhrs_epi16(acc, tempa3);
+                const __m128i tempb2 = _mm_xor_si128(tempb1, tempa3);
+                _mm_store_si128(prandex, tempb2);
+                break;
+            }
+        }
+    }
+    return acc;
+}
 
 void *alloc_aligned_buffer(uint64_t bufSize)
 {
     void *answer = NULL;
-    if (posix_memalign(&answer, sizeof(__m256i), bufSize))
+    if (posix_memalign(&answer, sizeof(__m128i)*2, bufSize))
     {
         return NULL;
     }
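// --- editorial aside, not part of the patch ---
// The alloc_aligned_buffer change swaps sizeof(__m256i) for sizeof(__m128i)*2:
// both are 32 bytes, so the alignment is unchanged, but the expression no longer
// names an AVX type, which matters for the ARM/SSE2NEON build where __m256i does
// not exist. Under the _WIN32 posix_memalign macro above, the same call maps to
// _aligned_malloc, whose buffers must be released with _aligned_free. A hedged
// usage sketch; the double-size key buffer is an illustrative assumption:
//
//     void *key = alloc_aligned_buffer(keySizeInBytes * 2); // working + pristine copy
//     if (key) {
//         // ... hash with the key, fixupkey() between uses ...
//     #ifdef _WIN32
//         _aligned_free(key);
//     #else
//         free(key);
//     #endif
//     }
// --- end aside ---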