AstroBWT algorithm (DERO) support

To test: - Download https://github.com/deroproject/derosuite/releases/tag/AstroBWT - Run the daemon with `--testnet` on the command line. In config.json: - "coin":"dero" - "url":"127.0.0.1:30306" - "daemon":true
4 years ago · 14ef99ca67
29 changed files with 2316 additions and 11 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -9,6 +9,7 @@ option(WITH_CN_PICO         "Enable CryptoNight-Pico algorithm" ON)
 option(WITH_CN_GPU          "Enable CryptoNight-GPU algorithm" ON)
 option(WITH_RANDOMX         "Enable RandomX algorithms family" ON)
 option(WITH_ARGON2          "Enable Argon2 algorithms family" ON)
+option(WITH_ASTROBWT        "Enable AstroBWT algorithms family" ON)
 option(WITH_HTTP            "Enable HTTP protocol support (client/server)" ON)
 option(WITH_DEBUG_LOG       "Enable debug log output" OFF)
 option(WITH_TLS             "Enable OpenSSL support" ON)
@ -176,6 +177,7 @@ find_package(UV REQUIRED)
 include(cmake/flags.cmake)
 include(cmake/randomx.cmake)
 include(cmake/argon2.cmake)
+include(cmake/astrobwt.cmake)
 include(cmake/OpenSSL.cmake)
 include(cmake/asm.cmake)
 include(cmake/cn-gpu.cmake)
--- a/cmake/astrobwt.cmake
+++ b/cmake/astrobwt.cmake
@ -0,0 +1,36 @@
+# AstroBWT algorithm family: when WITH_ASTROBWT is set, define
+# XMRIG_ALGO_ASTROBWT and add the implementation files to the crypto lists;
+# otherwise make sure the definition is absent.
+if (WITH_ASTROBWT)
+    add_definitions(/DXMRIG_ALGO_ASTROBWT)
+
+    list(APPEND HEADERS_CRYPTO
+        src/crypto/astrobwt/AstroBWT.h
+        src/crypto/astrobwt/sha3.h
+    )
+
+    list(APPEND SOURCES_CRYPTO
+        src/crypto/astrobwt/AstroBWT.cpp
+        src/crypto/astrobwt/sha3.cpp
+    )
+
+    # Salsa20 backend selection: ARM builds compile the C reference
+    # implementation (salsa20_ref), all other targets compile Salsa20.cpp.
+    if (XMRIG_ARM)
+        list(APPEND HEADERS_CRYPTO
+            src/crypto/astrobwt/salsa20_ref/ecrypt-config.h
+            src/crypto/astrobwt/salsa20_ref/ecrypt-machine.h
+            src/crypto/astrobwt/salsa20_ref/ecrypt-portable.h
+            src/crypto/astrobwt/salsa20_ref/ecrypt-sync.h
+        )
+
+        list(APPEND SOURCES_CRYPTO
+            src/crypto/astrobwt/salsa20_ref/salsa20.c
+        )
+    else()
+        list(APPEND HEADERS_CRYPTO
+            src/crypto/astrobwt/Salsa20.hpp
+        )
+
+        list(APPEND SOURCES_CRYPTO
+            src/crypto/astrobwt/Salsa20.cpp
+        )
+    endif()
+else()
+    remove_definitions(/DXMRIG_ALGO_ASTROBWT)
+endif()
--- a/src/backend/cpu/CpuConfig.cpp
+++ b/src/backend/cpu/CpuConfig.cpp
@ -165,6 +165,7 @@ void xmrig::CpuConfig::generate()
    count += xmrig::generate<Algorithm::CN_PICO>(m_threads, m_limit);
    count += xmrig::generate<Algorithm::RANDOM_X>(m_threads, m_limit);
    count += xmrig::generate<Algorithm::ARGON2>(m_threads, m_limit);
+    count += xmrig::generate<Algorithm::ASTROBWT>(m_threads, m_limit);

    m_shouldSave = count > 0;
 }
--- a/src/backend/cpu/CpuConfig_gen.h
+++ b/src/backend/cpu/CpuConfig_gen.h
@ -143,6 +143,14 @@ size_t inline generate<Algorithm::ARGON2>(Threads<CpuThreads> &threads, uint32_t
 #endif


+#ifdef XMRIG_ALGO_ASTROBWT
+// Specialization for the AstroBWT family: forwards to the generic generate()
+// helper with the profile name "astrobwt" and Algorithm::ASTROBWT_DERO, and
+// returns that helper's result unchanged.
+template<>
+size_t inline generate<Algorithm::ASTROBWT>(Threads<CpuThreads> &threads, uint32_t limit)
+{
+    return generate("astrobwt", threads, Algorithm::ASTROBWT_DERO, limit);
+}
+#endif
+
 } /* namespace xmrig */


--- a/src/backend/cpu/CpuWorker.cpp
+++ b/src/backend/cpu/CpuWorker.cpp
@ -44,6 +44,11 @@
 #endif


+#ifdef XMRIG_ALGO_ASTROBWT
+#   include "crypto/astrobwt/AstroBWT.h"
+#endif
+
+
 namespace xmrig {

 static constexpr uint32_t kReserveCount = 32768;
@ -180,6 +185,12 @@ bool xmrig::CpuWorker<N>::selfTest()
    }
 #   endif

+#   ifdef XMRIG_ALGO_ASTROBWT
+    if (m_algorithm.family() == Algorithm::ASTROBWT) {
+        return verify(Algorithm::ASTROBWT_DERO, astrobwt_dero_test_out);
+    }
+#   endif
+
    return false;
 }

--- a/src/backend/cpu/platform/AdvancedCpuInfo.cpp
+++ b/src/backend/cpu/platform/AdvancedCpuInfo.cpp
@ -172,6 +172,17 @@ xmrig::CpuThreads xmrig::AdvancedCpuInfo::threads(const Algorithm &algorithm, ui
    size_t cache = 0;
    size_t count = 0;

+#   ifdef XMRIG_ALGO_ASTROBWT
+    if (algorithm == Algorithm::ASTROBWT_DERO) {
+        CpuThreads t;
+        count = threads();
+        for (size_t i = 0; i < count; ++i) {
+            t.add(i, 0);
+        }
+        return t;
+    }
+#   endif
+
    if (m_L3) {
        cache = m_L2_exclusive ? (m_L2 + m_L3) : m_L3;
    }
--- a/src/backend/cpu/platform/BasicCpuInfo.cpp
+++ b/src/backend/cpu/platform/BasicCpuInfo.cpp
@ -258,5 +258,15 @@ xmrig::CpuThreads xmrig::BasicCpuInfo::threads(const Algorithm &algorithm, uint3
    }
 #   endif

+#   ifdef XMRIG_ALGO_ASTROBWT
+    if (algorithm.family() == Algorithm::ASTROBWT) {
+        CpuThreads threads;
+        for (size_t i = 0; i < count; ++i) {
+            threads.add(i, 0);
+        }
+        return threads;
+    }
+#   endif
+
    return CpuThreads(std::max<size_t>(count / 2, 1), 1);
 }
--- a/src/backend/cpu/platform/HwlocCpuInfo.cpp
+++ b/src/backend/cpu/platform/HwlocCpuInfo.cpp
@ -216,6 +216,12 @@ bool xmrig::HwlocCpuInfo::membind(hwloc_const_bitmap_t nodeset)

 xmrig::CpuThreads xmrig::HwlocCpuInfo::threads(const Algorithm &algorithm, uint32_t limit) const
 {
+#   ifdef XMRIG_ALGO_ASTROBWT
+    if (algorithm == Algorithm::ASTROBWT_DERO) {
+        return BasicCpuInfo::threads(algorithm, limit);
+    }
+#   endif
+
    if (L2() == 0 && L3() == 0) {
        return BasicCpuInfo::threads(algorithm, limit);
    }
--- a/src/backend/opencl/cl/cn/algorithm.cl
+++ b/src/backend/opencl/cl/cn/algorithm.cl
@ -24,6 +24,7 @@
 #define ALGO_RX_SFX         23
 #define ALGO_AR2_CHUKWA     24
 #define ALGO_AR2_WRKZ       25
+#define ALGO_ASTROBWT_DERO  26

 #define FAMILY_UNKNOWN      0
 #define FAMILY_CN           1
@ -32,3 +33,4 @@
 #define FAMILY_CN_PICO      4
 #define FAMILY_RANDOM_X     5
 #define FAMILY_ARGON2       6
+#define FAMILY_ASTROBWT     7
--- a/src/base/net/http/HttpClient.cpp
+++ b/src/base/net/http/HttpClient.cpp
@ -140,6 +140,7 @@ void xmrig::HttpClient::handshake()

    if (!body.empty()) {
        headers.insert({ "Content-Length", std::to_string(body.size()) });
+        headers.insert({ "Content-Type", "application/json" });
    }

    std::stringstream ss;
--- a/src/base/net/stratum/DaemonClient.cpp
+++ b/src/base/net/stratum/DaemonClient.cpp
@ -59,12 +59,14 @@ static const char *kHash                    = "hash";
 static const char *kHeight                  = "height";
 static const char *kJsonRPC                 = "/json_rpc";

+static const size_t BlobReserveSize         = 8;
+
 }


 xmrig::DaemonClient::DaemonClient(int id, IClientListener *listener) :
    BaseClient(id, listener),
-    m_monero(true)
+    m_apiVersion(API_MONERO)
 {
    m_httpListener  = std::make_shared<HttpListener>(this);
    m_timer         = new Timer(this);
@ -106,14 +108,21 @@ int64_t xmrig::DaemonClient::submit(const JobResult &result)
 #   ifdef XMRIG_PROXY_PROJECT
    memcpy(m_blocktemplate.data() + 78, result.nonce, 8);
 #   else
-    Buffer::toHex(reinterpret_cast<const uint8_t *>(&result.nonce), 4, m_blocktemplate.data() + 78);
+    char* data = (m_apiVersion == API_DERO) ? m_blockhashingblob.data() : m_blocktemplate.data();
+    Buffer::toHex(reinterpret_cast<const uint8_t *>(&result.nonce), 4, data + 78);
 #   endif

    using namespace rapidjson;
    Document doc(kObjectType);

    Value params(kArrayType);
-    params.PushBack(m_blocktemplate.toJSON(), doc.GetAllocator());
+    if (m_apiVersion == API_DERO) {
+        params.PushBack(m_blocktemplate.toJSON(), doc.GetAllocator());
+        params.PushBack(m_blockhashingblob.toJSON(), doc.GetAllocator());
+    }
+    else {
+        params.PushBack(m_blocktemplate.toJSON(), doc.GetAllocator());
+    }

    JsonRequest::create(doc, m_sequence, "submitblock", params);

@ -131,6 +140,10 @@ int64_t xmrig::DaemonClient::submit(const JobResult &result)

 void xmrig::DaemonClient::connect()
 {
+    // Select the DERO flavour of the daemon API when either the pool algorithm
+    // or the configured coin identifies DERO.
+    // NOTE(review): this function only ever switches to API_DERO and never
+    // switches back — confirm a client cannot be reused for a non-DERO pool.
+    if ((m_pool.algorithm() == Algorithm::ASTROBWT_DERO) || (m_pool.coin() == Coin::DERO)) {
+        m_apiVersion = API_DERO;
+    }
+
    setState(ConnectingState);
    getBlockTemplate();
 }
@ -172,7 +185,7 @@ void xmrig::DaemonClient::onHttpData(const HttpData &data)
    if (data.method == HTTP_GET) {
        if (data.url == kGetHeight) {
            if (!doc.HasMember(kHash)) {
-                m_monero = false;
+                m_apiVersion = API_CRYPTONOTE_DEFAULT;

                return send(HTTP_GET, kGetInfo);
            }
@ -200,7 +213,21 @@ void xmrig::DaemonClient::onTimer(const Timer *)
        getBlockTemplate();
    }
    else if (m_state == ConnectedState) {
-        send(HTTP_GET, m_monero ? kGetHeight : kGetInfo);
+        if (m_apiVersion == API_DERO) {
+            using namespace rapidjson;
+            Document doc(kObjectType);
+            auto& allocator = doc.GetAllocator();
+
+            doc.AddMember("id", m_sequence, allocator);
+            doc.AddMember("jsonrpc", "2.0", allocator);
+            doc.AddMember("method", "get_info", allocator);
+
+            send(HTTP_POST, kJsonRPC, doc);
+            ++m_sequence;
+        }
+        else {
+            send(HTTP_GET, (m_apiVersion == API_MONERO) ? kGetHeight : kGetInfo);
+        }
    }
 }

@ -216,7 +243,14 @@ bool xmrig::DaemonClient::parseJob(const rapidjson::Value &params, int *code)
    Job job(false, m_pool.algorithm(), String());

    String blocktemplate = Json::getString(params, kBlocktemplateBlob);
-    if (blocktemplate.isNull() || !job.setBlob(Json::getString(params, "blockhashing_blob"))) {
+
+    m_blockhashingblob = Json::getString(params, "blockhashing_blob");
+    if (m_apiVersion == API_DERO) {
+        const uint64_t offset = Json::getUint64(params, "reserved_offset");
+        Buffer::toHex(Buffer::randomBytes(BlobReserveSize).data(), BlobReserveSize, m_blockhashingblob.data() + offset * 2);
+    }
+
+    if (blocktemplate.isNull() || !job.setBlob(m_blockhashingblob)) {
        *code = 4;
        return false;
    }
@ -263,6 +297,13 @@ bool xmrig::DaemonClient::parseResponse(int64_t id, const rapidjson::Value &resu
        return false;
    }

+    if (result.HasMember("top_block_hash")) {
+        if (m_prevHash != Json::getString(result, "top_block_hash")) {
+            getBlockTemplate();
+        }
+        return true;
+    }
+
    int code = -1;
    if (result.HasMember(kBlocktemplateBlob) && parseJob(result, &code)) {
        return true;
@ -286,7 +327,12 @@ int64_t xmrig::DaemonClient::getBlockTemplate()

    Value params(kObjectType);
    params.AddMember("wallet_address", m_user.toJSON(), allocator);
-    params.AddMember("extra_nonce",    Buffer::randomBytes(8).toHex().toJSON(doc), allocator);
+    if (m_apiVersion == API_DERO) {
+        params.AddMember("reserve_size", BlobReserveSize, allocator);
+    }
+    else {
+        params.AddMember("extra_nonce", Buffer::randomBytes(BlobReserveSize).toHex().toJSON(doc), allocator);
+    }

    JsonRequest::create(doc, m_sequence, "getblocktemplate", params);

--- a/src/base/net/stratum/DaemonClient.h
+++ b/src/base/net/stratum/DaemonClient.h
@ -76,9 +76,15 @@ private:
    void send(int method, const char *url, const rapidjson::Document &doc);
    void setState(SocketState state);

-    bool m_monero;
+    enum {
+        API_CRYPTONOTE_DEFAULT,
+        API_MONERO,
+        API_DERO,
+    } m_apiVersion;
+
    std::shared_ptr<IHttpListener> m_httpListener;
    String m_blocktemplate;
+    String m_blockhashingblob;
    String m_prevHash;
    String m_tlsFingerprint;
    String m_tlsVersion;
--- a/src/crypto/astrobwt/AstroBWT.cpp
+++ b/src/crypto/astrobwt/AstroBWT.cpp
@ -0,0 +1,207 @@
+/* XMRig
+ * Copyright 2010      Jeff Garzik              <jgarzik@pobox.com>
+ * Copyright 2012-2014 pooler                   <pooler@litecoinpool.org>
+ * Copyright 2014      Lucas Jones              <https://github.com/lucasjones>
+ * Copyright 2014-2016 Wolf9466                 <https://github.com/OhGodAPet>
+ * Copyright 2016      Jay D Dee                <jayddee246@gmail.com>
+ * Copyright 2017-2019 XMR-Stak                 <https://github.com/fireice-uk>, <https://github.com/psychocrypt>
+ * Copyright 2018      Lee Clagett              <https://github.com/vtnerd>
+ * Copyright 2018-2019 tevador                  <tevador@gmail.com>
+ * Copyright 2000      Transmeta Corporation    <https://github.com/intel/msr-tools>
+ * Copyright 2004-2008 H. Peter Anvin           <https://github.com/intel/msr-tools>
+ * Copyright 2018-2020 SChernykh                <https://github.com/SChernykh>
+ * Copyright 2016-2020 XMRig                    <https://github.com/xmrig>, <support@xmrig.com>
+ *
+ *   This program is free software: you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation, either version 3 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+
+#include "AstroBWT.h"
+#include "sha3.h"
+#include "crypto/cn/CryptoNight.h"
+
+#include <cstring>
+
+// Number of Salsa20 output bytes produced for the first stage.
+constexpr int STAGE1_SIZE = 147253;
+// STAGE1_SIZE plus 1048576 bytes of head-room for the second stage, plus
+// padding; astrobwt_dero() uses it as the unit for placing its buffers inside
+// the scratchpad.
+constexpr int ALLOCATION_SIZE = (STAGE1_SIZE + 1048576) + (128 - (STAGE1_SIZE & 63));
+
+// Width of one counting-sort digit in sort_indices() and the resulting
+// number of buckets per digit.
+constexpr int COUNTING_SORT_BITS = 10;
+constexpr int COUNTING_SORT_SIZE = 1 << COUNTING_SORT_BITS;
+
+// bswap_64(x): 64-bit byte swap. Uses the compiler intrinsic on MSVC and on
+// GCC-compatible compilers, and falls back to <byteswap.h> elsewhere.
+#ifdef _MSC_VER
+
+#include <stdlib.h>
+#define bswap_64(x) _byteswap_uint64(x)
+
+#elif defined __GNUC__
+
+#define bswap_64(x) __builtin_bswap64(x)
+
+#else
+
+#include <byteswap.h>
+
+#endif
+
+// Salsa20_XORKeyStream(key, output, size)
+//
+// Runs Salsa20 with the given key and an all-zero 8-byte IV over
+// output[0 .. size), then zeroes the 16 bytes that follow.
+//
+// key    - 32 bytes (the ARM branch sets up a 256-bit key; the other branch
+//          hands the pointer to ZeroTier::Salsa20).
+// output - must have room for size + 16 bytes because of the trailing memset.
+//
+// Two interchangeable implementations are provided: the C reference code on
+// ARM and ZeroTier::Salsa20 everywhere else.
+// NOTE(review): the ARM branch requests raw keystream bytes; the other branch
+// is presumably equivalent (its XORKeyStream body is not fully visible here).
+#ifdef XMRIG_ARM
+extern "C" {
+#include "salsa20_ref/ecrypt-sync.h"
+}
+
+static void Salsa20_XORKeyStream(const void* key, void* output, size_t size)
+{
+	uint8_t iv[8] = {};
+	ECRYPT_ctx ctx;
+	ECRYPT_keysetup(&ctx, static_cast<const uint8_t*>(key), 256, 64);
+	ECRYPT_ivsetup(&ctx, iv);
+	ECRYPT_keystream_bytes(&ctx, static_cast<uint8_t*>(output), size);
+	// Zero padding after the data (16 bytes past `size`).
+	memset(static_cast<uint8_t*>(output) + size, 0, 16);
+}
+#else
+#include "Salsa20.hpp"
+
+static void Salsa20_XORKeyStream(const void* key, void* output, size_t size)
+{
+	const uint64_t iv = 0;
+	ZeroTier::Salsa20 s(key, &iv);
+	s.XORKeyStream(output, size);
+	// Zero padding after the data (16 bytes past `size`).
+	memset(static_cast<uint8_t*>(output) + size, 0, 16);
+}
+#endif
+
+// Orders the positions 0 .. N-1 of `v` by the bytes that start at each position.
+//
+// v           - input bytes. 8-byte loads are issued at v + i and v + i + 5 for
+//               every i < N, so memory up to v[N + 11] must be readable.
+// indices     - output, N entries. Each entry packs bits 21..63 of the
+//               byte-swapped 8-byte load at v + i together with i in bits 0..20.
+// tmp_indices - scratch space, N entries.
+//
+// Because the position lives in the low 21 bits of an entry, N must be below 2^21.
+// Only the packed key bits and the 8 bytes at offset 5 take part in the
+// comparison; positions that agree on all of them keep whatever order the
+// earlier passes produced.
+void sort_indices(int N, const uint8_t* v, uint64_t* indices, uint64_t* tmp_indices)
+{
+	uint32_t counters[2][COUNTING_SORT_SIZE] = {};
+
+	// Histogram the two 10-bit digits of the top 20 key bits:
+	// counters[0] <- bits 44..53, counters[1] <- bits 54..63.
+	for (int i = 0; i < N; ++i)
+	{
+		const uint64_t k = bswap_64(*reinterpret_cast<const uint64_t*>(v + i));
+		++counters[0][(k >> (64 - COUNTING_SORT_BITS * 2)) & (COUNTING_SORT_SIZE - 1)];
+		++counters[1][k >> (64 - COUNTING_SORT_BITS)];
+	}
+
+	// Convert counts into the index of the last slot of every bucket
+	// (inclusive prefix sum minus one).
+	uint32_t prev[2] = { counters[0][0], counters[1][0] };
+	counters[0][0] = prev[0] - 1;
+	counters[1][0] = prev[1] - 1;
+	for (int i = 1; i < COUNTING_SORT_SIZE; ++i)
+	{
+		const uint32_t cur[2] = { counters[0][i] + prev[0], counters[1][i] + prev[1] };
+		counters[0][i] = cur[0] - 1;
+		counters[1][i] = cur[1] - 1;
+		prev[0] = cur[0];
+		prev[1] = cur[1];
+	}
+
+	// Pass 1: scatter by the lower digit. Walking backwards while filling each
+	// bucket from its last slot keeps equal digits in their original order.
+	for (int i = N - 1; i >= 0; --i)
+	{
+		const uint64_t k = bswap_64(*reinterpret_cast<const uint64_t*>(v + i));
+		tmp_indices[counters[0][(k >> (64 - COUNTING_SORT_BITS * 2)) & (COUNTING_SORT_SIZE - 1)]--] = (k & (static_cast<uint64_t>(-1) << 21)) | i;
+	}
+
+	// Pass 2: scatter by the upper digit; entries are now ordered by their top 20 bits.
+	for (int i = N - 1; i >= 0; --i)
+	{
+		const uint64_t data = tmp_indices[i];
+		indices[counters[1][data >> (64 - COUNTING_SORT_BITS)]--] = data;
+	}
+
+	// Finer comparison: packed key bits first, then the 8 bytes that start
+	// 5 bytes after each position.
+	auto smaller = [v](uint64_t a, uint64_t b)
+	{
+		const uint64_t value_a = a >> 21;
+		const uint64_t value_b = b >> 21;
+
+		if (value_a < value_b)
+			return true;
+
+		if (value_a > value_b)
+			return false;
+
+		const uint64_t data_a = bswap_64(*reinterpret_cast<const uint64_t*>(v + (a % (1 << 21)) + 5));
+		const uint64_t data_b = bswap_64(*reinterpret_cast<const uint64_t*>(v + (b % (1 << 21)) + 5));
+		return (data_a < data_b);
+	};
+
+	// Insertion sort over the pre-ordered array using `smaller`. prev_t always
+	// holds the entry currently stored just before the one being examined.
+	uint64_t prev_t = indices[0];
+	for (int i = 1; i < N; ++i)
+	{
+		uint64_t t = indices[i];
+		if (smaller(t, prev_t))
+		{
+			const uint64_t t2 = prev_t;
+			int j = i - 1;
+			do
+			{
+				indices[j + 1] = prev_t;
+				--j;
+				if (j < 0)
+					break;
+				prev_t = indices[j];
+			} while (smaller(t, prev_t));
+			indices[j + 1] = t;
+			t = t2;
+		}
+		prev_t = t;
+	}
+}
+
+// Computes the 32-byte AstroBWT (DERO) hash of `input_data` into `output_hash`.
+//
+// input_data / input_size - message to hash.
+// scratchpad              - caller-provided work memory, laid out as:
+//                             +64                          stage data (Salsa20 output)
+//                             +64 + ALLOCATION_SIZE        `indices` for sort_indices()
+//                             +64 + ALLOCATION_SIZE * 9    `tmp_indices`, later reused
+//                                                          for the stage result bytes
+// output_hash             - receives 32 bytes.
+//
+// Procedure: hash the input with sha3_HashBuffer (256-bit digest) to get a key,
+// generate STAGE1_SIZE Salsa20 bytes, order the positions with sort_indices()
+// and emit, for every position in that order, the byte that precedes it.
+// The result is hashed into a new key, the same steps are repeated with a
+// key-dependent length, and the second result is hashed into `output_hash`.
+//
+// NOTE(review): position 0 makes both transforms read the byte just before
+// the stage data (scratchpad byte 63), which this function never writes —
+// confirm the caller provides a defined value there.
+void astrobwt_dero(const void* input_data, uint32_t input_size, void* scratchpad, uint8_t* output_hash)
+{
+	uint8_t key[32];
+	uint8_t* scratchpad_ptr = (uint8_t*)(scratchpad) + 64;
+	uint8_t* stage1_output = scratchpad_ptr;
+	uint8_t* stage2_output = scratchpad_ptr;
+	uint64_t* indices = (uint64_t*)(scratchpad_ptr + ALLOCATION_SIZE);
+	uint64_t* tmp_indices = (uint64_t*)(scratchpad_ptr + ALLOCATION_SIZE * 9);
+	uint8_t* stage1_result = (uint8_t*)(tmp_indices);
+	uint8_t* stage2_result = (uint8_t*)(tmp_indices);
+
+	sha3_HashBuffer(256, SHA3_FLAGS_NONE, input_data, input_size, key, sizeof(key));
+
+	Salsa20_XORKeyStream(key, stage1_output, STAGE1_SIZE);
+
+	// STAGE1_SIZE + 1 positions: the zero byte appended by Salsa20_XORKeyStream is included.
+	sort_indices(STAGE1_SIZE + 1, stage1_output, indices, tmp_indices);
+
+	{
+		// tmp[p] is the byte preceding position p; the low 21 bits of an entry are p.
+		const uint8_t* tmp = stage1_output - 1;
+		for (int i = 0; i <= STAGE1_SIZE; ++i)
+			stage1_result[i] = tmp[indices[i] & ((1 << 21) - 1)];
+	}
+
+	sha3_HashBuffer(256, SHA3_FLAGS_NONE, stage1_result, STAGE1_SIZE + 1, key, sizeof(key));
+
+	// The low 20 bits of the first key word extend the second stage. The word is
+	// copied out with memcpy because `key` is a plain byte array: reading it
+	// through a uint32_t* is undefined behaviour and may fault on targets that
+	// enforce alignment.
+	uint32_t key_word0;
+	memcpy(&key_word0, key, sizeof(key_word0));
+	const int stage2_size = STAGE1_SIZE + (key_word0 & 0xfffff);
+	Salsa20_XORKeyStream(key, stage2_output, stage2_size);
+
+	sort_indices(stage2_size + 1, stage2_output, indices, tmp_indices);
+
+	{
+		const uint8_t* tmp = stage2_output - 1;
+		int i = 0;
+		// Main loop handles four entries per iteration; the tail loop finishes the rest.
+		const int n = ((stage2_size + 1) / 4) * 4;
+		for (; i < n; i += 4)
+		{
+			stage2_result[i + 0] = tmp[indices[i + 0] & ((1 << 21) - 1)];
+			stage2_result[i + 1] = tmp[indices[i + 1] & ((1 << 21) - 1)];
+			stage2_result[i + 2] = tmp[indices[i + 2] & ((1 << 21) - 1)];
+			stage2_result[i + 3] = tmp[indices[i + 3] & ((1 << 21) - 1)];
+		}
+		for (; i <= stage2_size; ++i)
+			stage2_result[i] = tmp[indices[i] & ((1 << 21) - 1)];
+	}
+
+	sha3_HashBuffer(256, SHA3_FLAGS_NONE, stage2_result, stage2_size + 1, output_hash, 32);
+}
+
+// AstroBWT/DERO entry point with the common single_hash signature: hashes
+// `size` bytes of `input` into `output`, using the memory block of the first
+// context as scratchpad. The trailing uint64_t argument is ignored.
+template<>
+void xmrig::astrobwt::single_hash<xmrig::Algorithm::ASTROBWT_DERO>(const uint8_t* input, size_t size, uint8_t* output, cryptonight_ctx** ctx, uint64_t)
+{
+	const auto input_size = static_cast<uint32_t>(size);
+	auto* const scratchpad = (*ctx)->memory;
+
+	astrobwt_dero(input, input_size, scratchpad, output);
+}
--- a/src/crypto/astrobwt/AstroBWT.h
+++ b/src/crypto/astrobwt/AstroBWT.h
@ -0,0 +1,45 @@
+/* XMRig
+ * Copyright 2010      Jeff Garzik              <jgarzik@pobox.com>
+ * Copyright 2012-2014 pooler                   <pooler@litecoinpool.org>
+ * Copyright 2014      Lucas Jones              <https://github.com/lucasjones>
+ * Copyright 2014-2016 Wolf9466                 <https://github.com/OhGodAPet>
+ * Copyright 2016      Jay D Dee                <jayddee246@gmail.com>
+ * Copyright 2017-2019 XMR-Stak                 <https://github.com/fireice-uk>, <https://github.com/psychocrypt>
+ * Copyright 2018      Lee Clagett              <https://github.com/vtnerd>
+ * Copyright 2018-2019 tevador                  <tevador@gmail.com>
+ * Copyright 2000      Transmeta Corporation    <https://github.com/intel/msr-tools>
+ * Copyright 2004-2008 H. Peter Anvin           <https://github.com/intel/msr-tools>
+ * Copyright 2018-2020 SChernykh                <https://github.com/SChernykh>
+ * Copyright 2016-2020 XMRig                    <https://github.com/xmrig>, <support@xmrig.com>
+ *
+ *   This program is free software: you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation, either version 3 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+
+#ifndef XMRIG_ASTROBWT_H
+#define XMRIG_ASTROBWT_H
+
+
+#include "crypto/common/Algorithm.h"
+
+struct cryptonight_ctx;
+
+
+namespace xmrig { namespace astrobwt {
+
+
+// Hashes `size` bytes of `input` into `output` with the AstroBWT variant ALGO.
+// `ctx` is an array of hashing contexts. The final uint64_t argument is unnamed;
+// NOTE(review): it presumably exists to match the signature of the other
+// algorithm families — confirm against the hash-function table.
+// Only explicit specializations are provided.
+template<Algorithm::Id ALGO>
+void single_hash(const uint8_t* input, size_t size, uint8_t* output, cryptonight_ctx** ctx, uint64_t);
+
+template<>
+void single_hash<Algorithm::ASTROBWT_DERO>(const uint8_t* input, size_t size, uint8_t* output, cryptonight_ctx** ctx, uint64_t);
+
+
+}} // namespace xmrig::astrobwt
+
+
+#endif /* XMRIG_ASTROBWT_H */
--- a/src/crypto/astrobwt/Salsa20.cpp
+++ b/src/crypto/astrobwt/Salsa20.cpp
@ -0,0 +1,352 @@
+/*
+ * Based on public domain code available at: http://cr.yp.to/snuffle.html
+ *
+ * Modifications and C-native SSE macro based SSE implementation by
+ * Adam Ierymenko <adam.ierymenko@zerotier.com>.
+ *
+ * Additional modifications and code cleanup for AstroBWT by
+ * SChernykh <https://github.com/SChernykh>
+ *
+ * Since the original was public domain, this is too.
+ */
+
+#include "Salsa20.hpp"
+
+// Statically compute and define SSE constants
+//
+// maskLo32 has the low 32 bits of each 64-bit lane set; maskHi32 is the same
+// mask shifted left by 32, i.e. the high 32 bits of each 64-bit lane.
+// The single instance below is built during static initialization.
+// NOTE(review): `_s20sseconsts` and `_S20SSECONSTANTS` are identifiers
+// reserved for the implementation at global scope, and neither mask is used in
+// the portion of this file visible here — confirm before renaming or removing.
+class _s20sseconsts
+{
+public:
+	_s20sseconsts()
+	{
+		maskLo32 = _mm_shuffle_epi32(_mm_cvtsi32_si128(-1), _MM_SHUFFLE(1, 0, 1, 0));
+		maskHi32 = _mm_slli_epi64(maskLo32, 32);
+	}
+	__m128i maskLo32,maskHi32;
+};
+static const _s20sseconsts _S20SSECONSTANTS;
+
+namespace ZeroTier {
+
+// Loads the cipher state from a 32-byte key and an 8-byte IV.
+//
+// The sixteen state words are not stored in the textbook Salsa20 order: the
+// four constant words come first and the remaining words are permuted
+// (presumably to suit the SSE round code in XORKeyStream — confirm).
+void Salsa20::init(const void *key,const void *iv)
+{
+	const uint32_t *const key_words = static_cast<const uint32_t *>(key);
+	const uint32_t *const iv_words = static_cast<const uint32_t *>(iv);
+
+	// Constant words ("expand 32-byte k" in little-endian ASCII).
+	_state.i[0] = 0x61707865;
+	_state.i[1] = 0x3320646e;
+	_state.i[2] = 0x79622d32;
+	_state.i[3] = 0x6b206574;
+
+	// First half of the key.
+	_state.i[13] = key_words[0];
+	_state.i[10] = key_words[1];
+	_state.i[7] = key_words[2];
+	_state.i[4] = key_words[3];
+
+	// Second half of the key.
+	_state.i[15] = key_words[4];
+	_state.i[12] = key_words[5];
+	_state.i[9] = key_words[6];
+	_state.i[6] = key_words[7];
+
+	// IV words.
+	_state.i[14] = iv_words[0];
+	_state.i[11] = iv_words[1];
+
+	// Words that start at zero (presumably the block counter — confirm).
+	_state.i[5] = 0;
+	_state.i[8] = 0;
+}
+
+void Salsa20::XORKeyStream(void *out,unsigned int bytes)
+{
+	uint8_t tmp[64];
+	uint8_t *c = (uint8_t *)out;
+	uint8_t *ctarget = c;
+	unsigned int i;
+
+	if (!bytes)
+		return;
+
+	for (;;) {
+		if (bytes < 64) {
+			for (i = 0;i < bytes;++i)
+				tmp[i] = 0;
+			ctarget = c;
+			c = tmp;
+		}
+
+		__m128i X0 = _mm_loadu_si128((const __m128i *)&(_state.v[0]));
+		__m128i X1 = _mm_loadu_si128((const __m128i *)&(_state.v[1]));
+		__m128i X2 = _mm_loadu_si128((const __m128i *)&(_state.v[2]));
+		__m128i X3 = _mm_loadu_si128((const __m128i *)&(_state.v[3]));
+		__m128i T;
+		__m128i X0s = X0;
+		__m128i X1s = X1;
+		__m128i X2s = X2;
+		__m128i X3s = X3;
+
+		// 2X round -------------------------------------------------------------
+		T = _mm_add_epi32(X0, X3);
+		X1 = _mm_xor_si128(_mm_xor_si128(X1, _mm_slli_epi32(T, 7)), _mm_srli_epi32(T, 25));
+		T = _mm_add_epi32(X1, X0);
+		X2 = _mm_xor_si128(_mm_xor_si128(X2, _mm_slli_epi32(T, 9)), _mm_srli_epi32(T, 23));
+		T = _mm_add_epi32(X2, X1);
+		X3 = _mm_xor_si128(_mm_xor_si128(X3, _mm_slli_epi32(T, 13)), _mm_srli_epi32(T, 19));
+		T = _mm_add_epi32(X3, X2);
+		X0 = _mm_xor_si128(_mm_xor_si128(X0, _mm_slli_epi32(T, 18)), _mm_srli_epi32(T, 14));
+		X1 = _mm_shuffle_epi32(X1, 0x93);
+		X2 = _mm_shuffle_epi32(X2, 0x4E);
+		X3 = _mm_shuffle_epi32(X3, 0x39);
+		T = _mm_add_epi32(X0, X1);
+		X3 = _mm_xor_si128(_mm_xor_si128(X3, _mm_slli_epi32(T, 7)), _mm_srli_epi32(T, 25));
+		T = _mm_add_epi32(X3, X0);
+		X2 = _mm_xor_si128(_mm_xor_si128(X2, _mm_slli_epi32(T, 9)), _mm_srli_epi32(T, 23));
+		T = _mm_add_epi32(X2, X3);
+		X1 = _mm_xor_si128(_mm_xor_si128(X1, _mm_slli_epi32(T, 13)), _mm_srli_epi32(T, 19));
+		T = _mm_add_epi32(X1, X2);
+		X0 = _mm_xor_si128(_mm_xor_si128(X0, _mm_slli_epi32(T, 18)), _mm_srli_epi32(T, 14));
+		X1 = _mm_shuffle_epi32(X1, 0x39);
+		X2 = _mm_shuffle_epi32(X2, 0x4E);
+		X3 = _mm_shuffle_epi32(X3, 0x93);
+
+		// 2X round -------------------------------------------------------------
+		T = _mm_add_epi32(X0, X3);
+		X1 = _mm_xor_si128(_mm_xor_si128(X1, _mm_slli_epi32(T, 7)), _mm_srli_epi32(T, 25));
+		T = _mm_add_epi32(X1, X0);
+		X2 = _mm_xor_si128(_mm_xor_si128(X2, _mm_slli_epi32(T, 9)), _mm_srli_epi32(T, 23));
+		T = _mm_add_epi32(X2, X1);
+		X3 = _mm_xor_si128(_mm_xor_si128(X3, _mm_slli_epi32(T, 13)), _mm_srli_epi32(T, 19));
+		T = _mm_add_epi32(X3, X2);
+		X0 = _mm_xor_si128(_mm_xor_si128(X0, _mm_slli_epi32(T, 18)), _mm_srli_epi32(T, 14));
+		X1 = _mm_shuffle_epi32(X1, 0x93);
+		X2 = _mm_shuffle_epi32(X2, 0x4E);
+		X3 = _mm_shuffle_epi32(X3, 0x39);
+		T = _mm_add_epi32(X0, X1);
+		X3 = _mm_xor_si128(_mm_xor_si128(X3, _mm_slli_epi32(T, 7)), _mm_srli_epi32(T, 25));
+		T = _mm_add_epi32(X3, X0);
+		X2 = _mm_xor_si128(_mm_xor_si128(X2, _mm_slli_epi32(T, 9)), _mm_srli_epi32(T, 23));
+		T = _mm_add_epi32(X2, X3);
+		X1 = _mm_xor_si128(_mm_xor_si128(X1, _mm_slli_epi32(T, 13)), _mm_srli_epi32(T, 19));
+		T = _mm_add_epi32(X1, X2);
+		X0 = _mm_xor_si128(_mm_xor_si128(X0, _mm_slli_epi32(T, 18)), _mm_srli_epi32(T, 14));
+		X1 = _mm_shuffle_epi32(X1, 0x39);
+		X2 = _mm_shuffle_epi32(X2, 0x4E);
+		X3 = _mm_shuffle_epi32(X3, 0x93);
+
+		// 2X round -------------------------------------------------------------
+		T = _mm_add_epi32(X0, X3);
+		X1 = _mm_xor_si128(_mm_xor_si128(X1, _mm_slli_epi32(T, 7)), _mm_srli_epi32(T, 25));
+		T = _mm_add_epi32(X1, X0);
+		X2 = _mm_xor_si128(_mm_xor_si128(X2, _mm_slli_epi32(T, 9)), _mm_srli_epi32(T, 23));
+		T = _mm_add_epi32(X2, X1);
+		X3 = _mm_xor_si128(_mm_xor_si128(X3, _mm_slli_epi32(T, 13)), _mm_srli_epi32(T, 19));
+		T = _mm_add_epi32(X3, X2);
+		X0 = _mm_xor_si128(_mm_xor_si128(X0, _mm_slli_epi32(T, 18)), _mm_srli_epi32(T, 14));
+		X1 = _mm_shuffle_epi32(X1, 0x93);
+		X2 = _mm_shuffle_epi32(X2, 0x4E);
+		X3 = _mm_shuffle_epi32(X3, 0x39);
+		T = _mm_add_epi32(X0, X1);
+		X3 = _mm_xor_si128(_mm_xor_si128(X3, _mm_slli_epi32(T, 7)), _mm_srli_epi32(T, 25));
+		T = _mm_add_epi32(X3, X0);
+		X2 = _mm_xor_si128(_mm_xor_si128(X2, _mm_slli_epi32(T, 9)), _mm_srli_epi32(T, 23));
+		T = _mm_add_epi32(X2, X3);
+		X1 = _mm_xor_si128(_mm_xor_si128(X1, _mm_slli_epi32(T, 13)), _mm_srli_epi32(T, 19));
+		T = _mm_add_epi32(X1, X2);
+		X0 = _mm_xor_si128(_mm_xor_si128(X0, _mm_slli_epi32(T, 18)), _mm_srli_epi32(T, 14));
+		X1 = _mm_shuffle_epi32(X1, 0x39);
+		X2 = _mm_shuffle_epi32(X2, 0x4E);
+		X3 = _mm_shuffle_epi32(X3, 0x93);
+
+		// 2X round -------------------------------------------------------------
+		T = _mm_add_epi32(X0, X3);
+		X1 = _mm_xor_si128(_mm_xor_si128(X1, _mm_slli_epi32(T, 7)), _mm_srli_epi32(T, 25));
+		T = _mm_add_epi32(X1, X0);
+		X2 = _mm_xor_si128(_mm_xor_si128(X2, _mm_slli_epi32(T, 9)), _mm_srli_epi32(T, 23));
+		T = _mm_add_epi32(X2, X1);
+		X3 = _mm_xor_si128(_mm_xor_si128(X3, _mm_slli_epi32(T, 13)), _mm_srli_epi32(T, 19));
+		T = _mm_add_epi32(X3, X2);
+		X0 = _mm_xor_si128(_mm_xor_si128(X0, _mm_slli_epi32(T, 18)), _mm_srli_epi32(T, 14));
+		X1 = _mm_shuffle_epi32(X1, 0x93);
+		X2 = _mm_shuffle_epi32(X2, 0x4E);
+		X3 = _mm_shuffle_epi32(X3, 0x39);
+		T = _mm_add_epi32(X0, X1);
+		X3 = _mm_xor_si128(_mm_xor_si128(X3, _mm_slli_epi32(T, 7)), _mm_srli_epi32(T, 25));
+		T = _mm_add_epi32(X3, X0);
+		X2 = _mm_xor_si128(_mm_xor_si128(X2, _mm_slli_epi32(T, 9)), _mm_srli_epi32(T, 23));
+		T = _mm_add_epi32(X2, X3);
+		X1 = _mm_xor_si128(_mm_xor_si128(X1, _mm_slli_epi32(T, 13)), _mm_srli_epi32(T, 19));
+		T = _mm_add_epi32(X1, X2);
+		X0 = _mm_xor_si128(_mm_xor_si128(X0, _mm_slli_epi32(T, 18)), _mm_srli_epi32(T, 14));
+		X1 = _mm_shuffle_epi32(X1, 0x39);
+		X2 = _mm_shuffle_epi32(X2, 0x4E);
+		X3 = _mm_shuffle_epi32(X3, 0x93);
+
+		// 2X round -------------------------------------------------------------
+		T = _mm_add_epi32(X0, X3);
+		X1 = _mm_xor_si128(_mm_xor_si128(X1, _mm_slli_epi32(T, 7)), _mm_srli_epi32(T, 25));
+		T = _mm_add_epi32(X1, X0);
+		X2 = _mm_xor_si128(_mm_xor_si128(X2, _mm_slli_epi32(T, 9)), _mm_srli_epi32(T, 23));
+		T = _mm_add_epi32(X2, X1);
+		X3 = _mm_xor_si128(_mm_xor_si128(X3, _mm_slli_epi32(T, 13)), _mm_srli_epi32(T, 19));
+		T = _mm_add_epi32(X3, X2);
+		X0 = _mm_xor_si128(_mm_xor_si128(X0, _mm_slli_epi32(T, 18)), _mm_srli_epi32(T, 14));
+		X1 = _mm_shuffle_epi32(X1, 0x93);
+		X2 = _mm_shuffle_epi32(X2, 0x4E);
+		X3 = _mm_shuffle_epi32(X3, 0x39);
+		T = _mm_add_epi32(X0, X1);
+		X3 = _mm_xor_si128(_mm_xor_si128(X3, _mm_slli_epi32(T, 7)), _mm_srli_epi32(T, 25));
+		T = _mm_add_epi32(X3, X0);
+		X2 = _mm_xor_si128(_mm_xor_si128(X2, _mm_slli_epi32(T, 9)), _mm_srli_epi32(T, 23));
+		T = _mm_add_epi32(X2, X3);
+		X1 = _mm_xor_si128(_mm_xor_si128(X1, _mm_slli_epi32(T, 13)), _mm_srli_epi32(T, 19));
+		T = _mm_add_epi32(X1, X2);
+		X0 = _mm_xor_si128(_mm_xor_si128(X0, _mm_slli_epi32(T, 18)), _mm_srli_epi32(T, 14));
+		X1 = _mm_shuffle_epi32(X1, 0x39);
+		X2 = _mm_shuffle_epi32(X2, 0x4E);
+		X3 = _mm_shuffle_epi32(X3, 0x93);
+
+		// 2X round -------------------------------------------------------------
+		T = _mm_add_epi32(X0, X3);
+		X1 = _mm_xor_si128(_mm_xor_si128(X1, _mm_slli_epi32(T, 7)), _mm_srli_epi32(T, 25));
+		T = _mm_add_epi32(X1, X0);
+		X2 = _mm_xor_si128(_mm_xor_si128(X2, _mm_slli_epi32(T, 9)), _mm_srli_epi32(T, 23));
+		T = _mm_add_epi32(X2, X1);
+		X3 = _mm_xor_si128(_mm_xor_si128(X3, _mm_slli_epi32(T, 13)), _mm_srli_epi32(T, 19));
+		T = _mm_add_epi32(X3, X2);
+		X0 = _mm_xor_si128(_mm_xor_si128(X0, _mm_slli_epi32(T, 18)), _mm_srli_epi32(T, 14));
+		X1 = _mm_shuffle_epi32(X1, 0x93);
+		X2 = _mm_shuffle_epi32(X2, 0x4E);
+		X3 = _mm_shuffle_epi32(X3, 0x39);
+		T = _mm_add_epi32(X0, X1);
+		X3 = _mm_xor_si128(_mm_xor_si128(X3, _mm_slli_epi32(T, 7)), _mm_srli_epi32(T, 25));
+		T = _mm_add_epi32(X3, X0);
+		X2 = _mm_xor_si128(_mm_xor_si128(X2, _mm_slli_epi32(T, 9)), _mm_srli_epi32(T, 23));
+		T = _mm_add_epi32(X2, X3);
+		X1 = _mm_xor_si128(_mm_xor_si128(X1, _mm_slli_epi32(T, 13)), _mm_srli_epi32(T, 19));
+		T = _mm_add_epi32(X1, X2);
+		X0 = _mm_xor_si128(_mm_xor_si128(X0, _mm_slli_epi32(T, 18)), _mm_srli_epi32(T, 14));
+		X1 = _mm_shuffle_epi32(X1, 0x39);
+		X2 = _mm_shuffle_epi32(X2, 0x4E);
+		X3 = _mm_shuffle_epi32(X3, 0x93);
+
+		// 2X round -------------------------------------------------------------
+		T = _mm_add_epi32(X0, X3);
+		X1 = _mm_xor_si128(_mm_xor_si128(X1, _mm_slli_epi32(T, 7)), _mm_srli_epi32(T, 25));
+		T = _mm_add_epi32(X1, X0);
+		X2 = _mm_xor_si128(_mm_xor_si128(X2, _mm_slli_epi32(T, 9)), _mm_srli_epi32(T, 23));
+		T = _mm_add_epi32(X2, X1);
+		X3 = _mm_xor_si128(_mm_xor_si128(X3, _mm_slli_epi32(T, 13)), _mm_srli_epi32(T, 19));
+		T = _mm_add_epi32(X3, X2);
+		X0 = _mm_xor_si128(_mm_xor_si128(X0, _mm_slli_epi32(T, 18)), _mm_srli_epi32(T, 14));
+		X1 = _mm_shuffle_epi32(X1, 0x93);
+		X2 = _mm_shuffle_epi32(X2, 0x4E);
+		X3 = _mm_shuffle_epi32(X3, 0x39);
+		T = _mm_add_epi32(X0, X1);
+		X3 = _mm_xor_si128(_mm_xor_si128(X3, _mm_slli_epi32(T, 7)), _mm_srli_epi32(T, 25));
+		T = _mm_add_epi32(X3, X0);
+		X2 = _mm_xor_si128(_mm_xor_si128(X2, _mm_slli_epi32(T, 9)), _mm_srli_epi32(T, 23));
+		T = _mm_add_epi32(X2, X3);
+		X1 = _mm_xor_si128(_mm_xor_si128(X1, _mm_slli_epi32(T, 13)), _mm_srli_epi32(T, 19));
+		T = _mm_add_epi32(X1, X2);
+		X0 = _mm_xor_si128(_mm_xor_si128(X0, _mm_slli_epi32(T, 18)), _mm_srli_epi32(T, 14));
+		X1 = _mm_shuffle_epi32(X1, 0x39);
+		X2 = _mm_shuffle_epi32(X2, 0x4E);
+		X3 = _mm_shuffle_epi32(X3, 0x93);
+
+		// 2X round -------------------------------------------------------------
+		T = _mm_add_epi32(X0, X3);
+		X1 = _mm_xor_si128(_mm_xor_si128(X1, _mm_slli_epi32(T, 7)), _mm_srli_epi32(T, 25));
+		T = _mm_add_epi32(X1, X0);
+		X2 = _mm_xor_si128(_mm_xor_si128(X2, _mm_slli_epi32(T, 9)), _mm_srli_epi32(T, 23));
+		T = _mm_add_epi32(X2, X1);
+		X3 = _mm_xor_si128(_mm_xor_si128(X3, _mm_slli_epi32(T, 13)), _mm_srli_epi32(T, 19));
+		T = _mm_add_epi32(X3, X2);
+		X0 = _mm_xor_si128(_mm_xor_si128(X0, _mm_slli_epi32(T, 18)), _mm_srli_epi32(T, 14));
+		X1 = _mm_shuffle_epi32(X1, 0x93);
+		X2 = _mm_shuffle_epi32(X2, 0x4E);
+		X3 = _mm_shuffle_epi32(X3, 0x39);
+		T = _mm_add_epi32(X0, X1);
+		X3 = _mm_xor_si128(_mm_xor_si128(X3, _mm_slli_epi32(T, 7)), _mm_srli_epi32(T, 25));
+		T = _mm_add_epi32(X3, X0);
+		X2 = _mm_xor_si128(_mm_xor_si128(X2, _mm_slli_epi32(T, 9)), _mm_srli_epi32(T, 23));
+		T = _mm_add_epi32(X2, X3);
+		X1 = _mm_xor_si128(_mm_xor_si128(X1, _mm_slli_epi32(T, 13)), _mm_srli_epi32(T, 19));
+		T = _mm_add_epi32(X1, X2);
+		X0 = _mm_xor_si128(_mm_xor_si128(X0, _mm_slli_epi32(T, 18)), _mm_srli_epi32(T, 14));
+		X1 = _mm_shuffle_epi32(X1, 0x39);
+		X2 = _mm_shuffle_epi32(X2, 0x4E);
+		X3 = _mm_shuffle_epi32(X3, 0x93);
+
+		// 2X round -------------------------------------------------------------
+		T = _mm_add_epi32(X0, X3);
+		X1 = _mm_xor_si128(_mm_xor_si128(X1, _mm_slli_epi32(T, 7)), _mm_srli_epi32(T, 25));
+		T = _mm_add_epi32(X1, X0);
+		X2 = _mm_xor_si128(_mm_xor_si128(X2, _mm_slli_epi32(T, 9)), _mm_srli_epi32(T, 23));
+		T = _mm_add_epi32(X2, X1);
+		X3 = _mm_xor_si128(_mm_xor_si128(X3, _mm_slli_epi32(T, 13)), _mm_srli_epi32(T, 19));
+		T = _mm_add_epi32(X3, X2);
+		X0 = _mm_xor_si128(_mm_xor_si128(X0, _mm_slli_epi32(T, 18)), _mm_srli_epi32(T, 14));
+		X1 = _mm_shuffle_epi32(X1, 0x93);
+		X2 = _mm_shuffle_epi32(X2, 0x4E);
+		X3 = _mm_shuffle_epi32(X3, 0x39);
+		T = _mm_add_epi32(X0, X1);
+		X3 = _mm_xor_si128(_mm_xor_si128(X3, _mm_slli_epi32(T, 7)), _mm_srli_epi32(T, 25));
+		T = _mm_add_epi32(X3, X0);
+		X2 = _mm_xor_si128(_mm_xor_si128(X2, _mm_slli_epi32(T, 9)), _mm_srli_epi32(T, 23));
+		T = _mm_add_epi32(X2, X3);
+		X1 = _mm_xor_si128(_mm_xor_si128(X1, _mm_slli_epi32(T, 13)), _mm_srli_epi32(T, 19));
+		T = _mm_add_epi32(X1, X2);
+		X0 = _mm_xor_si128(_mm_xor_si128(X0, _mm_slli_epi32(T, 18)), _mm_srli_epi32(T, 14));
+		X1 = _mm_shuffle_epi32(X1, 0x39);
+		X2 = _mm_shuffle_epi32(X2, 0x4E);
+		X3 = _mm_shuffle_epi32(X3, 0x93);
+
+		// 2X round -------------------------------------------------------------
+		T = _mm_add_epi32(X0, X3);
+		X1 = _mm_xor_si128(_mm_xor_si128(X1, _mm_slli_epi32(T, 7)), _mm_srli_epi32(T, 25));
+		T = _mm_add_epi32(X1, X0);
+		X2 = _mm_xor_si128(_mm_xor_si128(X2, _mm_slli_epi32(T, 9)), _mm_srli_epi32(T, 23));
+		T = _mm_add_epi32(X2, X1);
+		X3 = _mm_xor_si128(_mm_xor_si128(X3, _mm_slli_epi32(T, 13)), _mm_srli_epi32(T, 19));
+		T = _mm_add_epi32(X3, X2);
+		X0 = _mm_xor_si128(_mm_xor_si128(X0, _mm_slli_epi32(T, 18)), _mm_srli_epi32(T, 14));
+		X1 = _mm_shuffle_epi32(X1, 0x93);
+		X2 = _mm_shuffle_epi32(X2, 0x4E);
+		X3 = _mm_shuffle_epi32(X3, 0x39);
+		T = _mm_add_epi32(X0, X1);
+		X3 = _mm_xor_si128(_mm_xor_si128(X3, _mm_slli_epi32(T, 7)), _mm_srli_epi32(T, 25));
+		T = _mm_add_epi32(X3, X0);
+		X2 = _mm_xor_si128(_mm_xor_si128(X2, _mm_slli_epi32(T, 9)), _mm_srli_epi32(T, 23));
+		T = _mm_add_epi32(X2, X3);
+		X1 = _mm_xor_si128(_mm_xor_si128(X1, _mm_slli_epi32(T, 13)), _mm_srli_epi32(T, 19));
+		T = _mm_add_epi32(X1, X2);
+		X0 = _mm_xor_si128(_mm_xor_si128(X0, _mm_slli_epi32(T, 18)), _mm_srli_epi32(T, 14));
+		X1 = _mm_shuffle_epi32(X1, 0x39);
+		X2 = _mm_shuffle_epi32(X2, 0x4E);
+		X3 = _mm_shuffle_epi32(X3, 0x93);
+
+		X0 = _mm_add_epi32(X0s,X0);
+		X1 = _mm_add_epi32(X1s,X1);
+		X2 = _mm_add_epi32(X2s,X2);
+		X3 = _mm_add_epi32(X3s,X3);
+
+		__m128i k02 = _mm_shuffle_epi32(_mm_or_si128(_mm_slli_epi64(X0, 32), _mm_srli_epi64(X3, 32)), _MM_SHUFFLE(0, 1, 2, 3));
+		__m128i k13 = _mm_shuffle_epi32(_mm_or_si128(_mm_slli_epi64(X1, 32), _mm_srli_epi64(X0, 32)), _MM_SHUFFLE(0, 1, 2, 3));
+		__m128i k20 = _mm_or_si128(_mm_and_si128(X2, _S20SSECONSTANTS.maskLo32), _mm_and_si128(X1, _S20SSECONSTANTS.maskHi32));
+		__m128i k31 = _mm_or_si128(_mm_and_si128(X3, _S20SSECONSTANTS.maskLo32), _mm_and_si128(X2, _S20SSECONSTANTS.maskHi32));
+		_mm_storeu_ps(reinterpret_cast<float *>(c),_mm_castsi128_ps(_mm_unpackhi_epi64(k02,k20)));
+		_mm_storeu_ps(reinterpret_cast<float *>(c) + 4,_mm_castsi128_ps(_mm_unpackhi_epi64(k13,k31)));
+		_mm_storeu_ps(reinterpret_cast<float *>(c) + 8,_mm_castsi128_ps(_mm_unpacklo_epi64(k20,k02)));
+		_mm_storeu_ps(reinterpret_cast<float *>(c) + 12,_mm_castsi128_ps(_mm_unpacklo_epi64(k31,k13)));
+
+		if (!(++_state.i[8])) {
+			++_state.i[5]; // state reordered for SSE
+			/* stopping at 2^70 bytes per nonce is user's responsibility */
+		}
+
+		if (bytes <= 64) {
+			if (bytes < 64) {
+				for (i = 0;i < bytes;++i)
+					ctarget[i] = c[i];
+			}
+
+			return;
+		}
+
+		bytes -= 64;
+		c += 64;
+	}
+}
+
+} // namespace ZeroTier
--- a/src/crypto/astrobwt/Salsa20.hpp
+++ b/src/crypto/astrobwt/Salsa20.hpp
@ -0,0 +1,52 @@
+/*
+ * Based on public domain code available at: http://cr.yp.to/snuffle.html
+ *
+ * This therefore is public domain.
+ */
+
+#ifndef ZT_SALSA20_HPP
+#define ZT_SALSA20_HPP
+
+#include <stdio.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+#include <emmintrin.h>
+
+namespace ZeroTier {
+
+/**
+ * Salsa20 stream cipher
+ */
+class Salsa20
+{
+public:
+	/**
+	 * Construct the cipher and key it immediately by forwarding to init().
+	 *
+	 * @param key 256-bit (32 byte) key
+	 * @param iv 64-bit initialization vector
+	 */
+	Salsa20(const void *key,const void *iv)
+	{
+		init(key,iv);
+	}
+
+	/**
+	 * Initialize cipher
+	 *
+	 * @param key Key bits
+	 * @param iv 64-bit initialization vector
+	 */
+	void init(const void *key,const void *iv);
+
+	/**
+	 * Generate cipher output into a caller-supplied buffer.
+	 *
+	 * NOTE(review): the definition is not part of this header. Presumably it
+	 * writes `bytes` bytes of Salsa20 keystream to `out` and advances the
+	 * internal block counter -- confirm against Salsa20.cpp.
+	 *
+	 * @param out Destination buffer
+	 * @param bytes Number of bytes requested
+	 */
+	void XORKeyStream(void *out,unsigned int bytes);
+
+private:
+	// Cipher state: the same 64 bytes viewed either as four 128-bit SSE
+	// vectors (v) or as sixteen 32-bit words (i).
+	union {
+		__m128i v[4];
+		uint32_t i[16];
+	} _state;
+};
+
+} // namespace ZeroTier
+
+#endif
--- a/src/crypto/astrobwt/salsa20_ref/ecrypt-config.h
+++ b/src/crypto/astrobwt/salsa20_ref/ecrypt-config.h
@ -0,0 +1,272 @@
+/* ecrypt-config.h */
+
+/* *** Normally, it should not be necessary to edit this file. *** */
+
+#ifndef ECRYPT_CONFIG
+#define ECRYPT_CONFIG
+
+/* ------------------------------------------------------------------------- */
+
+/*
+ * Guess the endianness of the target architecture.
+ *
+ * The chain below is a single #if/#elif/#else, so exactly one of
+ * ECRYPT_LITTLE_ENDIAN, ECRYPT_BIG_ENDIAN or ECRYPT_UNKNOWN ends up defined:
+ * the first matching compiler/platform macro wins and anything that is not
+ * listed falls through to ECRYPT_UNKNOWN. Note that no x86-64 or ARM specific
+ * macro is tested here, so such targets are only classified as little endian
+ * when _MSC_VER or __INTEL_COMPILER happens to be defined.
+ */
+
+/* 
+ * The LITTLE endian machines:
+ */
+#if defined(__ultrix)           /* Older MIPS */
+#define ECRYPT_LITTLE_ENDIAN
+#elif defined(__alpha)          /* Alpha */
+#define ECRYPT_LITTLE_ENDIAN
+#elif defined(i386)             /* x86 (gcc) */
+#define ECRYPT_LITTLE_ENDIAN
+#elif defined(__i386)           /* x86 (gcc) */
+#define ECRYPT_LITTLE_ENDIAN
+#elif defined(_M_IX86)          /* x86 (MSC, Borland) */
+#define ECRYPT_LITTLE_ENDIAN
+#elif defined(_MSC_VER)         /* x86 (surely MSC) */
+#define ECRYPT_LITTLE_ENDIAN
+#elif defined(__INTEL_COMPILER) /* x86 (surely Intel compiler icl.exe) */
+#define ECRYPT_LITTLE_ENDIAN
+
+/* 
+ * The BIG endian machines: 
+ */
+#elif defined(sun)              /* Newer Sparc's */
+#define ECRYPT_BIG_ENDIAN
+#elif defined(__ppc__)          /* PowerPC */
+#define ECRYPT_BIG_ENDIAN
+
+/* 
+ * Finally machines with UNKNOWN endianness:
+ */
+#elif defined (_AIX)            /* RS6000 */
+#define ECRYPT_UNKNOWN
+#elif defined(__hpux)           /* HP-PA */
+#define ECRYPT_UNKNOWN
+#elif defined(__aux)            /* 68K */
+#define ECRYPT_UNKNOWN
+#elif defined(__dgux)           /* 88K (but P6 in latest boxes) */
+#define ECRYPT_UNKNOWN
+#elif defined(__sgi)            /* Newer MIPS */
+#define ECRYPT_UNKNOWN
+#else	                        /* Any other processor */
+#define ECRYPT_UNKNOWN
+#endif
+
+/* ------------------------------------------------------------------------- */
+
+/*
+ * Find minimal-width types to store 8-bit, 16-bit, 32-bit, and 64-bit
+ * integers.
+ *
+ * Note: to enable 64-bit types on 32-bit compilers, it might be
+ * necessary to switch from ISO C90 mode to ISO C99 mode (e.g., gcc
+ * -std=c99).
+ */
+
+#include <limits.h>
+
+/* --- check char --- */
+
+/*
+ * Offer `char` for every integer width it is able to hold. The limit tests
+ * are nested (8 -> 16 -> 32 -> 64 bits), so a wider width is only examined
+ * once the narrower one has passed, and each #ifndef leaves an already
+ * defined IxxT untouched. For an all-ones maximum X, `X / 0xF > 0xF` holds
+ * exactly when X has at least 8 value bits; the later tests use the same
+ * form for 16, 32 and 64 bits.
+ */
+#if (UCHAR_MAX / 0xFU > 0xFU)
+#ifndef I8T
+#define I8T char
+#define U8C(v) (v##U)
+
+/* Flag the case where the 8-bit type is exactly one octet wide. */
+#if (UCHAR_MAX == 0xFFU)
+#define ECRYPT_I8T_IS_BYTE
+#endif
+
+#endif
+
+#if (UCHAR_MAX / 0xFFU > 0xFFU)
+#ifndef I16T
+#define I16T char
+#define U16C(v) (v##U)
+#endif
+
+#if (UCHAR_MAX / 0xFFFFU > 0xFFFFU)
+#ifndef I32T
+#define I32T char
+#define U32C(v) (v##U)
+#endif
+
+#if (UCHAR_MAX / 0xFFFFFFFFU > 0xFFFFFFFFU)
+#ifndef I64T
+#define I64T char
+#define U64C(v) (v##U)
+#define ECRYPT_NATIVE64
+#endif
+
+#endif /* >= 64 bits */
+#endif /* >= 32 bits */
+#endif /* >= 16 bits */
+#endif /* >= 8 bits */
+
+/* --- check short --- */
+
+/*
+ * Offer `short` for every width it can hold that has not been assigned a
+ * type yet (each definition is guarded by #ifndef). The USHRT_MAX tests are
+ * nested from 8 up to 64 bits, so a wider width is only examined when the
+ * narrower test has passed.
+ */
+#if (USHRT_MAX / 0xFU > 0xFU)
+#ifndef I8T
+#define I8T short
+#define U8C(v) (v##U)
+
+/* Flag the case where the 8-bit type is exactly one octet wide. */
+#if (USHRT_MAX == 0xFFU)
+#define ECRYPT_I8T_IS_BYTE
+#endif
+
+#endif
+
+#if (USHRT_MAX / 0xFFU > 0xFFU)
+#ifndef I16T
+#define I16T short
+#define U16C(v) (v##U)
+#endif
+
+#if (USHRT_MAX / 0xFFFFU > 0xFFFFU)
+#ifndef I32T
+#define I32T short
+#define U32C(v) (v##U)
+#endif
+
+#if (USHRT_MAX / 0xFFFFFFFFU > 0xFFFFFFFFU)
+#ifndef I64T
+#define I64T short
+#define U64C(v) (v##U)
+#define ECRYPT_NATIVE64
+#endif
+
+#endif /* >= 64 bits */
+#endif /* >= 32 bits */
+#endif /* >= 16 bits */
+#endif /* >= 8 bits */
+
+/* --- check int --- */
+
+/*
+ * Offer `int` for every width it can hold that has not been assigned a type
+ * yet (each definition is guarded by #ifndef). The UINT_MAX tests are nested
+ * from 8 up to 64 bits, so a wider width is only examined when the narrower
+ * test has passed.
+ */
+#if (UINT_MAX / 0xFU > 0xFU)
+#ifndef I8T
+#define I8T int
+#define U8C(v) (v##U)
+
+/*
+ * Flag the case where the 8-bit type is exactly one octet wide. The limit
+ * compared here must be the one of the type just selected (int), hence
+ * UINT_MAX rather than ULONG_MAX.
+ */
+#if (UINT_MAX == 0xFFU)
+#define ECRYPT_I8T_IS_BYTE
+#endif
+
+#endif
+
+#if (UINT_MAX / 0xFFU > 0xFFU)
+#ifndef I16T
+#define I16T int
+#define U16C(v) (v##U)
+#endif
+
+#if (UINT_MAX / 0xFFFFU > 0xFFFFU)
+#ifndef I32T
+#define I32T int
+#define U32C(v) (v##U)
+#endif
+
+#if (UINT_MAX / 0xFFFFFFFFU > 0xFFFFFFFFU)
+#ifndef I64T
+#define I64T int
+#define U64C(v) (v##U)
+#define ECRYPT_NATIVE64
+#endif
+
+#endif /* >= 64 bits */
+#endif /* >= 32 bits */
+#endif /* >= 16 bits */
+#endif /* >= 8 bits */
+
+/* --- check long --- */
+
+/*
+ * Offer `long` for every width it can hold that has not been assigned a type
+ * yet (each definition is guarded by #ifndef). The ULONG_MAX tests are
+ * nested from 8 up to 64 bits, so a wider width is only examined when the
+ * narrower test has passed. Constants built with these types get a UL suffix.
+ */
+#if (ULONG_MAX / 0xFUL > 0xFUL)
+#ifndef I8T
+#define I8T long
+#define U8C(v) (v##UL)
+
+/* Flag the case where the 8-bit type is exactly one octet wide. */
+#if (ULONG_MAX == 0xFFUL)
+#define ECRYPT_I8T_IS_BYTE
+#endif
+
+#endif
+
+#if (ULONG_MAX / 0xFFUL > 0xFFUL)
+#ifndef I16T
+#define I16T long
+#define U16C(v) (v##UL)
+#endif
+
+#if (ULONG_MAX / 0xFFFFUL > 0xFFFFUL)
+#ifndef I32T
+#define I32T long
+#define U32C(v) (v##UL)
+#endif
+
+#if (ULONG_MAX / 0xFFFFFFFFUL > 0xFFFFFFFFUL)
+#ifndef I64T
+#define I64T long
+#define U64C(v) (v##UL)
+#define ECRYPT_NATIVE64
+#endif
+
+#endif /* >= 64 bits */
+#endif /* >= 32 bits */
+#endif /* >= 16 bits */
+#endif /* >= 8 bits */
+
+/* --- check long long --- */
+
+/*
+ * Only attempted when <limits.h> exposes ULLONG_MAX. Offers `long long` for
+ * every width that has not been assigned a type yet (each definition is
+ * guarded by #ifndef), with ULL-suffixed constants.
+ *
+ * Unlike the char/short/int/long sections, choosing `long long` as I64T does
+ * NOT define ECRYPT_NATIVE64, so code keyed on that macro takes its
+ * non-native branch for this type.
+ */
+#ifdef ULLONG_MAX
+
+#if (ULLONG_MAX / 0xFULL > 0xFULL)
+#ifndef I8T
+#define I8T long long
+#define U8C(v) (v##ULL)
+
+/* Flag the case where the 8-bit type is exactly one octet wide. */
+#if (ULLONG_MAX == 0xFFULL)
+#define ECRYPT_I8T_IS_BYTE
+#endif
+
+#endif
+
+#if (ULLONG_MAX / 0xFFULL > 0xFFULL)
+#ifndef I16T
+#define I16T long long
+#define U16C(v) (v##ULL)
+#endif
+
+#if (ULLONG_MAX / 0xFFFFULL > 0xFFFFULL)
+#ifndef I32T
+#define I32T long long
+#define U32C(v) (v##ULL)
+#endif
+
+#if (ULLONG_MAX / 0xFFFFFFFFULL > 0xFFFFFFFFULL)
+#ifndef I64T
+#define I64T long long
+#define U64C(v) (v##ULL)
+#endif
+
+#endif /* >= 64 bits */
+#endif /* >= 32 bits */
+#endif /* >= 16 bits */
+#endif /* >= 8 bits */
+
+#endif /* ULLONG_MAX */
+
+/* --- check __int64 --- */
+
+/*
+ * Last resort for the 64-bit type: when the compiler provides _UI64_MAX and
+ * no earlier section defined I64T, use `__int64` with ui64-suffixed
+ * constants. Only the 64-bit width is handled here, and ECRYPT_NATIVE64 is
+ * not defined for this choice.
+ */
+#ifdef _UI64_MAX
+
+#if (_UI64_MAX / 0xFFFFFFFFui64 > 0xFFFFFFFFui64)
+#ifndef I64T
+#define I64T __int64
+#define U64C(v) (v##ui64)
+#endif
+
+#endif
+
+#endif /* _UI64_MAX */
+
+/* ------------------------------------------------------------------------- */
+
+#endif
--- a/src/crypto/astrobwt/salsa20_ref/ecrypt-machine.h
+++ b/src/crypto/astrobwt/salsa20_ref/ecrypt-machine.h
@ -0,0 +1,46 @@
+/* ecrypt-machine.h */
+
+/*
+ * This file is included by 'ecrypt-portable.h'. It allows to override
+ * the default macros for specific platforms. Please carefully check
+ * the machine code generated by your compiler (with optimisations
+ * turned on) before deciding to edit this file.
+ */
+
+/* ------------------------------------------------------------------------- */
+
+/*
+ * Rotate overrides. This section becomes active on the first inclusion that
+ * happens after ECRYPT_DEFAULT_ROT has been defined, and ECRYPT_MACHINE_ROT
+ * makes sure it is applied only once even though this header is included
+ * repeatedly.
+ */
+#if (defined(ECRYPT_DEFAULT_ROT) && !defined(ECRYPT_MACHINE_ROT))
+
+#define ECRYPT_MACHINE_ROT
+
+/*
+ * With MSVC on Win32, replace the 32- and 64-bit rotate macros by the
+ * compiler's rotate functions declared in <stdlib.h>. The 8- and 16-bit
+ * macros are left as they were.
+ */
+#if (defined(WIN32) && defined(_MSC_VER))
+
+#undef ROTL32
+#undef ROTR32
+#undef ROTL64
+#undef ROTR64
+
+#include <stdlib.h>
+
+#define ROTL32(v, n) _lrotl(v, n)
+#define ROTR32(v, n) _lrotr(v, n)
+#define ROTL64(v, n) _rotl64(v, n)
+#define ROTR64(v, n) _rotr64(v, n)
+
+#endif
+
+#endif
+
+/* ------------------------------------------------------------------------- */
+
+#if (defined(ECRYPT_DEFAULT_SWAP) && !defined(ECRYPT_MACHINE_SWAP))
+
+#define ECRYPT_MACHINE_SWAP
+
+/*
+ * If you want to overwrite the default swap macros, put it here. And so on.
+ */
+
+#endif
+
+/* ------------------------------------------------------------------------- */
--- a/src/crypto/astrobwt/salsa20_ref/ecrypt-portable.h
+++ b/src/crypto/astrobwt/salsa20_ref/ecrypt-portable.h
@ -0,0 +1,303 @@
+/* ecrypt-portable.h */
+
+/*
+ * WARNING: the conversions defined below are implemented as macros,
+ * and should be used carefully. They should NOT be used with
+ * parameters which perform some action. E.g., the following two lines
+ * are not equivalent:
+ * 
+ *  1) ++x; y = ROTL32(x, n); 
+ *  2) y = ROTL32(++x, n);
+ */
+
+/*
+ * *** Please do not edit this file. ***
+ *
+ * The default macros can be overridden for specific architectures by
+ * editing 'ecrypt-machine.h'.
+ */
+
+#ifndef ECRYPT_PORTABLE
+#define ECRYPT_PORTABLE
+
+#include "ecrypt-config.h"
+
+/* ------------------------------------------------------------------------- */
+
+/*
+ * The following types are defined (if available):
+ *
+ * u8:  unsigned integer type, at least 8 bits
+ * u16: unsigned integer type, at least 16 bits
+ * u32: unsigned integer type, at least 32 bits
+ * u64: unsigned integer type, at least 64 bits
+ *
+ * s8, s16, s32, s64 -> signed counterparts of u8, u16, u32, u64
+ *
+ * The selection of minimum-width integer types is taken care of by
+ * 'ecrypt-config.h'. Note: to enable 64-bit types on 32-bit
+ * compilers, it might be necessary to switch from ISO C90 mode to ISO
+ * C99 mode (e.g., gcc -std=c99).
+ */
+
+#ifdef I8T
+typedef signed I8T s8;
+typedef unsigned I8T u8;
+#endif
+
+#ifdef I16T
+typedef signed I16T s16;
+typedef unsigned I16T u16;
+#endif
+
+#ifdef I32T
+typedef signed I32T s32;
+typedef unsigned I32T u32;
+#endif
+
+#ifdef I64T
+typedef signed I64T s64;
+typedef unsigned I64T u64;
+#endif
+
+/*
+ * The following macros are used to obtain exact-width results.
+ */
+
+#define U8V(v) ((u8)(v) & U8C(0xFF))
+#define U16V(v) ((u16)(v) & U16C(0xFFFF))
+#define U32V(v) ((u32)(v) & U32C(0xFFFFFFFF))
+#define U64V(v) ((u64)(v) & U64C(0xFFFFFFFFFFFFFFFF))
+
+/* ------------------------------------------------------------------------- */
+
+/*
+ * The following macros return words with their bits rotated over n
+ * positions to the left/right.
+ */
+
+/*
+ * Marker telling 'ecrypt-machine.h' that the default rotate macros now exist
+ * and may be overridden.
+ */
+#define ECRYPT_DEFAULT_ROT
+
+/*
+ * Rotate left by n bits within an 8/16/32/64-bit word.
+ *
+ * Caveats that follow from the expansion:
+ *  - both v and n are evaluated more than once, so pass side-effect-free
+ *    arguments only;
+ *  - only the left-shifted half is masked to the nominal width; the
+ *    right-shifted half is used as is, so v is expected to carry no bits
+ *    above that width;
+ *  - n == 0 turns the right shift into a shift by the full nominal width
+ *    (undefined when the operand type is exactly that wide), so keep n
+ *    strictly between 0 and the width.
+ */
+#define ROTL8(v, n) \
+  (U8V((v) << (n)) | ((v) >> (8 - (n))))
+
+#define ROTL16(v, n) \
+  (U16V((v) << (n)) | ((v) >> (16 - (n))))
+
+#define ROTL32(v, n) \
+  (U32V((v) << (n)) | ((v) >> (32 - (n))))
+
+#define ROTL64(v, n) \
+  (U64V((v) << (n)) | ((v) >> (64 - (n))))
+
+/* Rotate right by n, expressed as a rotate left by (width - n). */
+#define ROTR8(v, n) ROTL8(v, 8 - (n))
+#define ROTR16(v, n) ROTL16(v, 16 - (n))
+#define ROTR32(v, n) ROTL32(v, 32 - (n))
+#define ROTR64(v, n) ROTL64(v, 64 - (n))
+
+#include "ecrypt-machine.h"
+
+/* ------------------------------------------------------------------------- */
+
+/*
+ * The following macros return a word with bytes in reverse order.
+ */
+
+#define ECRYPT_DEFAULT_SWAP
+
+#define SWAP16(v) \
+  ROTL16(v, 8)
+
+#define SWAP32(v) \
+  ((ROTL32(v,  8) & U32C(0x00FF00FF)) | \
+   (ROTL32(v, 24) & U32C(0xFF00FF00)))
+
+/*
+ * SWAP64(v): reverse the eight bytes of a 64-bit word.
+ *
+ * With a native 64-bit type, four rotations are masked so that each one
+ * contributes the two bytes it moves to their mirrored positions. Without
+ * it, the two 32-bit halves are byte-swapped separately and exchanged.
+ * The argument is evaluated more than once; it is fully parenthesised in
+ * the expansion so that compound expressions (e.g. `a | b`) are shifted as
+ * a whole.
+ */
+#ifdef ECRYPT_NATIVE64
+#define SWAP64(v) \
+  ((ROTL64(v,  8) & U64C(0x000000FF000000FF)) | \
+   (ROTL64(v, 24) & U64C(0x0000FF000000FF00)) | \
+   (ROTL64(v, 40) & U64C(0x00FF000000FF0000)) | \
+   (ROTL64(v, 56) & U64C(0xFF000000FF000000)))
+#else
+#define SWAP64(v) \
+  (((u64)SWAP32(U32V(v)) << 32) | (u64)SWAP32(U32V((v) >> 32)))
+#endif
+
+#include "ecrypt-machine.h"
+
+#define ECRYPT_DEFAULT_WTOW
+
+/*
+ * Word-to-word conversions between host order and a fixed byte order.
+ * On a little-endian host the *_LITTLE forms are the identity and the *_BIG
+ * forms byte-swap; on a big-endian host it is the other way round.
+ * When the endianness is unknown (neither macro defined) none of these
+ * conversions is defined at all.
+ */
+#ifdef ECRYPT_LITTLE_ENDIAN
+#define U16TO16_LITTLE(v) (v)
+#define U32TO32_LITTLE(v) (v)
+#define U64TO64_LITTLE(v) (v)
+
+#define U16TO16_BIG(v) SWAP16(v)
+#define U32TO32_BIG(v) SWAP32(v)
+#define U64TO64_BIG(v) SWAP64(v)
+#endif
+
+#ifdef ECRYPT_BIG_ENDIAN
+#define U16TO16_LITTLE(v) SWAP16(v)
+#define U32TO32_LITTLE(v) SWAP32(v)
+#define U64TO64_LITTLE(v) SWAP64(v)
+
+#define U16TO16_BIG(v) (v)
+#define U32TO32_BIG(v) (v)
+#define U64TO64_BIG(v) (v)
+#endif
+
+#include "ecrypt-machine.h"
+
+/*
+ * The following macros load words from an array of bytes with
+ * different types of endianness, and vice versa.
+ */
+
+#define ECRYPT_DEFAULT_BTOW
+
+#if (!defined(ECRYPT_UNKNOWN) && defined(ECRYPT_I8T_IS_BYTE))
+
+/*
+ * Fast path, selected when the endianness is known and the 8-bit type is
+ * exactly one byte: the byte pointer is reinterpreted as a pointer to the
+ * wider word, which is read or written in one access and then run through
+ * the word-to-word conversion for the requested byte order.
+ * NOTE(review): this presumably requires p to be suitably aligned for the
+ * wider type on the target -- confirm before enabling it on a new platform.
+ */
+#define U8TO16_LITTLE(p) U16TO16_LITTLE(((u16*)(p))[0])
+#define U8TO32_LITTLE(p) U32TO32_LITTLE(((u32*)(p))[0])
+#define U8TO64_LITTLE(p) U64TO64_LITTLE(((u64*)(p))[0])
+
+#define U8TO16_BIG(p) U16TO16_BIG(((u16*)(p))[0])
+#define U8TO32_BIG(p) U32TO32_BIG(((u32*)(p))[0])
+#define U8TO64_BIG(p) U64TO64_BIG(((u64*)(p))[0])
+
+/* Store counterparts: convert v to the requested order, then write it. */
+#define U16TO8_LITTLE(p, v) (((u16*)(p))[0] = U16TO16_LITTLE(v))
+#define U32TO8_LITTLE(p, v) (((u32*)(p))[0] = U32TO32_LITTLE(v))
+#define U64TO8_LITTLE(p, v) (((u64*)(p))[0] = U64TO64_LITTLE(v))
+
+#define U16TO8_BIG(p, v) (((u16*)(p))[0] = U16TO16_BIG(v))
+#define U32TO8_BIG(p, v) (((u32*)(p))[0] = U32TO32_BIG(v))
+#define U64TO8_BIG(p, v) (((u64*)(p))[0] = U64TO64_BIG(v))
+
+#else
+
+#define U8TO16_LITTLE(p) \
+  (((u16)((p)[0])      ) | \
+   ((u16)((p)[1]) <<  8))
+
+#define U8TO32_LITTLE(p) \
+  (((u32)((p)[0])      ) | \
+   ((u32)((p)[1]) <<  8) | \
+   ((u32)((p)[2]) << 16) | \
+   ((u32)((p)[3]) << 24))
+
+#ifdef ECRYPT_NATIVE64
+#define U8TO64_LITTLE(p) \
+  (((u64)((p)[0])      ) | \
+   ((u64)((p)[1]) <<  8) | \
+   ((u64)((p)[2]) << 16) | \
+   ((u64)((p)[3]) << 24) | \
+   ((u64)((p)[4]) << 32) | \
+   ((u64)((p)[5]) << 40) | \
+   ((u64)((p)[6]) << 48) | \
+   ((u64)((p)[7]) << 56))
+#else
+#define U8TO64_LITTLE(p) \
+  ((u64)U8TO32_LITTLE(p) | ((u64)U8TO32_LITTLE((p) + 4) << 32))
+#endif
+
+#define U8TO16_BIG(p) \
+  (((u16)((p)[0]) <<  8) | \
+   ((u16)((p)[1])      ))
+
+#define U8TO32_BIG(p) \
+  (((u32)((p)[0]) << 24) | \
+   ((u32)((p)[1]) << 16) | \
+   ((u32)((p)[2]) <<  8) | \
+   ((u32)((p)[3])      ))
+
+#ifdef ECRYPT_NATIVE64
+#define U8TO64_BIG(p) \
+  (((u64)((p)[0]) << 56) | \
+   ((u64)((p)[1]) << 48) | \
+   ((u64)((p)[2]) << 40) | \
+   ((u64)((p)[3]) << 32) | \
+   ((u64)((p)[4]) << 24) | \
+   ((u64)((p)[5]) << 16) | \
+   ((u64)((p)[6]) <<  8) | \
+   ((u64)((p)[7])      ))
+#else
+#define U8TO64_BIG(p) \
+  (((u64)U8TO32_BIG(p) << 32) | (u64)U8TO32_BIG((p) + 4))
+#endif
+
+#define U16TO8_LITTLE(p, v) \
+  do { \
+    (p)[0] = U8V((v)      ); \
+    (p)[1] = U8V((v) >>  8); \
+  } while (0)
+
+#define U32TO8_LITTLE(p, v) \
+  do { \
+    (p)[0] = U8V((v)      ); \
+    (p)[1] = U8V((v) >>  8); \
+    (p)[2] = U8V((v) >> 16); \
+    (p)[3] = U8V((v) >> 24); \
+  } while (0)
+
+#ifdef ECRYPT_NATIVE64
+#define U64TO8_LITTLE(p, v) \
+  do { \
+    (p)[0] = U8V((v)      ); \
+    (p)[1] = U8V((v) >>  8); \
+    (p)[2] = U8V((v) >> 16); \
+    (p)[3] = U8V((v) >> 24); \
+    (p)[4] = U8V((v) >> 32); \
+    (p)[5] = U8V((v) >> 40); \
+    (p)[6] = U8V((v) >> 48); \
+    (p)[7] = U8V((v) >> 56); \
+  } while (0)
+#else
+#define U64TO8_LITTLE(p, v) \
+  do { \
+    U32TO8_LITTLE((p),     U32V((v)      )); \
+    U32TO8_LITTLE((p) + 4, U32V((v) >> 32)); \
+  } while (0)
+#endif
+
+/*
+ * Store the 16-bit value v at p in big-endian order: most significant byte
+ * first, matching what U8TO16_BIG reads back. Both p and v are evaluated
+ * twice.
+ */
+#define U16TO8_BIG(p, v) \
+  do { \
+    (p)[0] = U8V((v) >>  8); \
+    (p)[1] = U8V((v)      ); \
+  } while (0)
+
+#define U32TO8_BIG(p, v) \
+  do { \
+    (p)[0] = U8V((v) >> 24); \
+    (p)[1] = U8V((v) >> 16); \
+    (p)[2] = U8V((v) >>  8); \
+    (p)[3] = U8V((v)      ); \
+  } while (0)
+
+#ifdef ECRYPT_NATIVE64
+#define U64TO8_BIG(p, v) \
+  do { \
+    (p)[0] = U8V((v) >> 56); \
+    (p)[1] = U8V((v) >> 48); \
+    (p)[2] = U8V((v) >> 40); \
+    (p)[3] = U8V((v) >> 32); \
+    (p)[4] = U8V((v) >> 24); \
+    (p)[5] = U8V((v) >> 16); \
+    (p)[6] = U8V((v) >>  8); \
+    (p)[7] = U8V((v)      ); \
+  } while (0)
+#else
+#define U64TO8_BIG(p, v) \
+  do { \
+    U32TO8_BIG((p),     U32V((v) >> 32)); \
+    U32TO8_BIG((p) + 4, U32V((v)      )); \
+  } while (0)
+#endif
+
+#endif
+
+#include "ecrypt-machine.h"
+
+/* ------------------------------------------------------------------------- */
+
+#endif
--- a/src/crypto/astrobwt/salsa20_ref/ecrypt-sync.h
+++ b/src/crypto/astrobwt/salsa20_ref/ecrypt-sync.h
@ -0,0 +1,279 @@
+/* ecrypt-sync.h */
+
+/* 
+ * Header file for synchronous stream ciphers without authentication
+ * mechanism.
+ * 
+ * *** Please only edit parts marked with "[edit]". ***
+ */
+
+#ifndef ECRYPT_SYNC
+#define ECRYPT_SYNC
+
+#include "ecrypt-portable.h"
+
+/* ------------------------------------------------------------------------- */
+
+/* Cipher parameters */
+
+/* 
+ * The name of your cipher.
+ */
+#define ECRYPT_NAME "Salsa20"    /* [edit] */ 
+#define ECRYPT_PROFILE "S!_H."
+
+/*
+ * Specify which key and IV sizes are supported by your cipher. A user
+ * should be able to enumerate the supported sizes by running the
+ * following code:
+ *
+ * for (i = 0; ECRYPT_KEYSIZE(i) <= ECRYPT_MAXKEYSIZE; ++i)
+ *   {
+ *     keysize = ECRYPT_KEYSIZE(i);
+ *
+ *     ...
+ *   }
+ *
+ * All sizes are in bits.
+ */
+
+#define ECRYPT_MAXKEYSIZE 256                 /* [edit] */
+#define ECRYPT_KEYSIZE(i) (128 + (i)*128)     /* [edit] */
+
+#define ECRYPT_MAXIVSIZE 64                   /* [edit] */
+#define ECRYPT_IVSIZE(i) (64 + (i)*64)        /* [edit] */
+
+/* ------------------------------------------------------------------------- */
+
+/* Data structures */
+
+/* 
+ * ECRYPT_ctx is the structure containing the representation of the
+ * internal state of your cipher. 
+ */
+
+/*
+ * Salsa20 cipher state: sixteen 32-bit words. As filled in by salsa20.c:
+ *   input[0], [5], [10], [15]  constant words ("expand 32-byte k" or
+ *                              "expand 16-byte k")
+ *   input[1..4]                first 16 key bytes
+ *   input[11..14]              second 16 key bytes (the first 16 again for
+ *                              128-bit keys)
+ *   input[6], input[7]         IV
+ *   input[8], input[9]         block counter, low word then high word
+ */
+typedef struct
+{
+  u32 input[16]; /* could be compressed */
+  /* 
+   * [edit]
+   *
+   * Put here all state variables needed during the encryption process.
+   */
+} ECRYPT_ctx;
+
+/* ------------------------------------------------------------------------- */
+
+/* Mandatory functions */
+
+/*
+ * Key and message independent initialization. This function will be
+ * called once when the program starts (e.g., to build expanded S-box
+ * tables).
+ *
+ * Declared with an explicit (void) parameter list so that this is a real
+ * prototype in C and matches the definition in salsa20.c.
+ */
+void ECRYPT_init(void);
+
+/*
+ * Key setup. It is the user's responsibility to select the values of
+ * keysize and ivsize from the set of supported values specified
+ * above.
+ */
+void ECRYPT_keysetup(
+  ECRYPT_ctx* ctx, 
+  const u8* key, 
+  u32 keysize,                /* Key size in bits. */ 
+  u32 ivsize);                /* IV size in bits. */ 
+
+/*
+ * IV setup. After having called ECRYPT_keysetup(), the user is
+ * allowed to call ECRYPT_ivsetup() different times in order to
+ * encrypt/decrypt different messages with the same key but different
+ * IV's.
+ */
+void ECRYPT_ivsetup(
+  ECRYPT_ctx* ctx, 
+  const u8* iv);
+
+/*
+ * Encryption/decryption of arbitrary length messages.
+ *
+ * For efficiency reasons, the API provides two types of
+ * encrypt/decrypt functions. The ECRYPT_encrypt_bytes() function
+ * (declared here) encrypts byte strings of arbitrary length, while
+ * the ECRYPT_encrypt_blocks() function (defined later) only accepts
+ * lengths which are multiples of ECRYPT_BLOCKLENGTH.
+ * 
+ * The user is allowed to make multiple calls to
+ * ECRYPT_encrypt_blocks() to incrementally encrypt a long message,
+ * but he is NOT allowed to make additional encryption calls once he
+ * has called ECRYPT_encrypt_bytes() (unless he starts a new message
+ * of course). For example, this sequence of calls is acceptable:
+ *
+ * ECRYPT_keysetup();
+ *
+ * ECRYPT_ivsetup();
+ * ECRYPT_encrypt_blocks();
+ * ECRYPT_encrypt_blocks();
+ * ECRYPT_encrypt_bytes();
+ *
+ * ECRYPT_ivsetup();
+ * ECRYPT_encrypt_blocks();
+ * ECRYPT_encrypt_blocks();
+ *
+ * ECRYPT_ivsetup();
+ * ECRYPT_encrypt_bytes();
+ * 
+ * The following sequence is not:
+ *
+ * ECRYPT_keysetup();
+ * ECRYPT_ivsetup();
+ * ECRYPT_encrypt_blocks();
+ * ECRYPT_encrypt_bytes();
+ * ECRYPT_encrypt_blocks();
+ */
+
+void ECRYPT_encrypt_bytes(
+  ECRYPT_ctx* ctx, 
+  const u8* plaintext, 
+  u8* ciphertext, 
+  u32 msglen);                /* Message length in bytes. */ 
+
+void ECRYPT_decrypt_bytes(
+  ECRYPT_ctx* ctx, 
+  const u8* ciphertext, 
+  u8* plaintext, 
+  u32 msglen);                /* Message length in bytes. */ 
+
+/* ------------------------------------------------------------------------- */
+
+/* Optional features */
+
+/* 
+ * For testing purposes it can sometimes be useful to have a function
+ * which immediately generates keystream without having to provide it
+ * with a zero plaintext. If your cipher cannot provide this function
+ * (e.g., because it is not strictly a synchronous cipher), please
+ * reset the ECRYPT_GENERATES_KEYSTREAM flag.
+ */
+
+#define ECRYPT_GENERATES_KEYSTREAM
+#ifdef ECRYPT_GENERATES_KEYSTREAM
+
+void ECRYPT_keystream_bytes(
+  ECRYPT_ctx* ctx,
+  u8* keystream,
+  u32 length);                /* Length of keystream in bytes. */
+
+#endif
+
+/* ------------------------------------------------------------------------- */
+
+/* Optional optimizations */
+
+/* 
+ * By default, the functions in this section are implemented using
+ * calls to functions declared above. However, you might want to
+ * implement them differently for performance reasons.
+ */
+
+/*
+ * All-in-one encryption/decryption of (short) packets.
+ *
+ * The default definitions of these functions can be found in
+ * "ecrypt-sync.c". If you want to implement them differently, please
+ * undef the ECRYPT_USES_DEFAULT_ALL_IN_ONE flag.
+ */
+#define ECRYPT_USES_DEFAULT_ALL_IN_ONE        /* [edit] */
+
+void ECRYPT_encrypt_packet(
+  ECRYPT_ctx* ctx, 
+  const u8* iv,
+  const u8* plaintext, 
+  u8* ciphertext, 
+  u32 msglen);
+
+void ECRYPT_decrypt_packet(
+  ECRYPT_ctx* ctx, 
+  const u8* iv,
+  const u8* ciphertext, 
+  u8* plaintext, 
+  u32 msglen);
+
+/*
+ * Encryption/decryption of blocks.
+ * 
+ * By default, these functions are defined as macros. If you want to
+ * provide a different implementation, please undef the
+ * ECRYPT_USES_DEFAULT_BLOCK_MACROS flag and implement the functions
+ * declared below.
+ */
+
+#define ECRYPT_BLOCKLENGTH 64                  /* [edit] */
+
+#define ECRYPT_USES_DEFAULT_BLOCK_MACROS      /* [edit] */
+#ifdef ECRYPT_USES_DEFAULT_BLOCK_MACROS
+
+#define ECRYPT_encrypt_blocks(ctx, plaintext, ciphertext, blocks)  \
+  ECRYPT_encrypt_bytes(ctx, plaintext, ciphertext,                 \
+    (blocks) * ECRYPT_BLOCKLENGTH)
+
+#define ECRYPT_decrypt_blocks(ctx, ciphertext, plaintext, blocks)  \
+  ECRYPT_decrypt_bytes(ctx, ciphertext, plaintext,                 \
+    (blocks) * ECRYPT_BLOCKLENGTH)
+
+#ifdef ECRYPT_GENERATES_KEYSTREAM
+
+#define ECRYPT_keystream_blocks(ctx, keystream, blocks)            \
+  ECRYPT_keystream_bytes(ctx, keystream,                        \
+    (blocks) * ECRYPT_BLOCKLENGTH)
+
+#endif
+
+#else
+
+void ECRYPT_encrypt_blocks(
+  ECRYPT_ctx* ctx, 
+  const u8* plaintext, 
+  u8* ciphertext, 
+  u32 blocks);                /* Message length in blocks. */ 
+
+void ECRYPT_decrypt_blocks(
+  ECRYPT_ctx* ctx, 
+  const u8* ciphertext, 
+  u8* plaintext, 
+  u32 blocks);                /* Message length in blocks. */ 
+
+#ifdef ECRYPT_GENERATES_KEYSTREAM
+
+void ECRYPT_keystream_blocks(
+  ECRYPT_ctx* ctx,
+  const u8* keystream,
+  u32 blocks);                /* Keystream length in blocks. */ 
+
+#endif
+
+#endif
+
+/*
+ * If your cipher can be implemented in different ways, you can use
+ * the ECRYPT_VARIANT parameter to allow the user to choose between
+ * them at compile time (e.g., gcc -DECRYPT_VARIANT=3 ...). Please
+ * only use this possibility if you really think it could make a
+ * significant difference and keep the number of variants
+ * (ECRYPT_MAXVARIANT) as small as possible (definitely not more than
+ * 10). Note also that all variants should have exactly the same
+ * external interface (i.e., the same ECRYPT_BLOCKLENGTH, etc.). 
+ */
+#define ECRYPT_MAXVARIANT 1                   /* [edit] */
+
+#ifndef ECRYPT_VARIANT
+#define ECRYPT_VARIANT 1
+#endif
+
+#if (ECRYPT_VARIANT > ECRYPT_MAXVARIANT)
+#error this variant does not exist
+#endif
+
+/* ------------------------------------------------------------------------- */
+
+#endif
--- a/src/crypto/astrobwt/salsa20_ref/salsa20.c
+++ b/src/crypto/astrobwt/salsa20_ref/salsa20.c
@ -0,0 +1,219 @@
+/*
+salsa20-merged.c version 20051118
+D. J. Bernstein
+Public domain.
+*/
+
+#include "ecrypt-sync.h"
+
+#define ROTATE(v,c) (ROTL32(v,c))
+#define XOR(v,w) ((v) ^ (w))
+#define PLUS(v,w) (U32V((v) + (w)))
+#define PLUSONE(v) (PLUS((v),1))
+
+/* Global initialization hook of the ECRYPT API; this cipher has nothing to set up, so it does nothing. */
+void ECRYPT_init(void)
+{
+  return;
+}
+
+/*
+ * Constant words mixed into the state by ECRYPT_keysetup(): sigma is used
+ * with 256-bit keys, tau with 128-bit keys. Each literal is exactly 16
+ * characters long, so the 16-byte arrays hold no terminating NUL.
+ */
+static const char sigma[16] = "expand 32-byte k";
+static const char tau[16] = "expand 16-byte k";
+
+/*
+ * Load a key into the cipher state.
+ *
+ *   x       state to initialise (key and constant words only; the IV and
+ *           counter words are left untouched)
+ *   k       key bytes, 32 of them when kbits is 256, otherwise 16
+ *   kbits   key size in bits; 256 selects the "32-byte" constants, any
+ *           other value is treated as a 128-bit key
+ *   ivbits  accepted for API compatibility, not used
+ *
+ * A 128-bit key is loaded into both key halves of the state.
+ */
+void ECRYPT_keysetup(ECRYPT_ctx *x,const u8 *k,u32 kbits,u32 ivbits)
+{
+  const int wide_key = (kbits == 256);
+  const u8 *const first_half = k;
+  const u8 *const second_half = wide_key ? k + 16 : k;
+  const char *const diag = wide_key ? sigma : tau;
+  u32 *const st = x->input;
+
+  (void)ivbits;
+
+  /* key bytes 0..15 */
+  st[1] = U8TO32_LITTLE(first_half + 0);
+  st[2] = U8TO32_LITTLE(first_half + 4);
+  st[3] = U8TO32_LITTLE(first_half + 8);
+  st[4] = U8TO32_LITTLE(first_half + 12);
+
+  /* key bytes 16..31, or bytes 0..15 once more for a 128-bit key */
+  st[11] = U8TO32_LITTLE(second_half + 0);
+  st[12] = U8TO32_LITTLE(second_half + 4);
+  st[13] = U8TO32_LITTLE(second_half + 8);
+  st[14] = U8TO32_LITTLE(second_half + 12);
+
+  /* the four constant words sit on the diagonal of the 4x4 state */
+  st[0] = U8TO32_LITTLE(diag + 0);
+  st[5] = U8TO32_LITTLE(diag + 4);
+  st[10] = U8TO32_LITTLE(diag + 8);
+  st[15] = U8TO32_LITTLE(diag + 12);
+}
+
+/*
+ * Start a new message: load the 8-byte IV into state words 6 and 7 and
+ * reset the two block-counter words (8 and 9) to zero.
+ */
+void ECRYPT_ivsetup(ECRYPT_ctx *x,const u8 *iv)
+{
+  u32 *const st = x->input;
+  const u8 *const nonce_lo = iv;
+  const u8 *const nonce_hi = iv + 4;
+
+  st[6] = U8TO32_LITTLE(nonce_lo);
+  st[7] = U8TO32_LITTLE(nonce_hi);
+
+  /* block counter restarts at 0 for every IV */
+  st[8] = 0;
+  st[9] = 0;
+}
+
+/*
+ * XOR `bytes` bytes of message m with Salsa20 keystream and write the result
+ * to c, one 64-byte block per loop iteration.
+ *
+ *   x      cipher state; only the block counter (input[8], input[9]) is
+ *          modified, and it is written back just before returning
+ *   m      input bytes
+ *   c      output bytes; within each block all of m is read before c is
+ *          written, and ECRYPT_keystream_bytes() calls this with m == c
+ *   bytes  number of bytes to process; 0 returns immediately
+ *
+ * A final block shorter than 64 bytes is processed in the local buffer tmp
+ * and only the requested bytes are copied out. The counter still advances
+ * by one for such a partial block.
+ */
+void ECRYPT_encrypt_bytes(ECRYPT_ctx *x,const u8 *m,u8 *c,u32 bytes)
+{
+  u32 x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15;
+  u32 j0, j1, j2, j3, j4, j5, j6, j7, j8, j9, j10, j11, j12, j13, j14, j15;
+  u8 *ctarget = 0;
+  u8 tmp[64];
+  int i;
+
+  if (!bytes) return;
+
+  /* Work on a local copy of the state; j8/j9 carry the running counter. */
+  j0 = x->input[0];
+  j1 = x->input[1];
+  j2 = x->input[2];
+  j3 = x->input[3];
+  j4 = x->input[4];
+  j5 = x->input[5];
+  j6 = x->input[6];
+  j7 = x->input[7];
+  j8 = x->input[8];
+  j9 = x->input[9];
+  j10 = x->input[10];
+  j11 = x->input[11];
+  j12 = x->input[12];
+  j13 = x->input[13];
+  j14 = x->input[14];
+  j15 = x->input[15];
+
+  for (;;) {
+    if (bytes < 64) {
+      /*
+       * Partial last block: stage the input in tmp and redirect both m and
+       * c to it so the code below can always handle a full 64 bytes; the
+       * real destination is remembered in ctarget. Bytes of tmp beyond
+       * `bytes` are not initialised here, and the results computed from
+       * them are never copied out.
+       */
+      for (i = 0;i < bytes;++i) tmp[i] = m[i];
+      m = tmp;
+      ctarget = c;
+      c = tmp;
+    }
+    x0 = j0;
+    x1 = j1;
+    x2 = j2;
+    x3 = j3;
+    x4 = j4;
+    x5 = j5;
+    x6 = j6;
+    x7 = j7;
+    x8 = j8;
+    x9 = j9;
+    x10 = j10;
+    x11 = j11;
+    x12 = j12;
+    x13 = j13;
+    x14 = j14;
+    x15 = j15;
+    /*
+     * 20 rounds, two per iteration. The first sixteen statements mix the
+     * words along the columns of the 4x4 state (indices i, i+4, i+8, i+12),
+     * the last sixteen along its rows (0..3, 4..7, 8..11, 12..15). Every
+     * statement depends on the ones before it, so their order is fixed.
+     */
+    for (i = 20;i > 0;i -= 2) {
+       x4 = XOR( x4,ROTATE(PLUS( x0,x12), 7));
+       x8 = XOR( x8,ROTATE(PLUS( x4, x0), 9));
+      x12 = XOR(x12,ROTATE(PLUS( x8, x4),13));
+       x0 = XOR( x0,ROTATE(PLUS(x12, x8),18));
+       x9 = XOR( x9,ROTATE(PLUS( x5, x1), 7));
+      x13 = XOR(x13,ROTATE(PLUS( x9, x5), 9));
+       x1 = XOR( x1,ROTATE(PLUS(x13, x9),13));
+       x5 = XOR( x5,ROTATE(PLUS( x1,x13),18));
+      x14 = XOR(x14,ROTATE(PLUS(x10, x6), 7));
+       x2 = XOR( x2,ROTATE(PLUS(x14,x10), 9));
+       x6 = XOR( x6,ROTATE(PLUS( x2,x14),13));
+      x10 = XOR(x10,ROTATE(PLUS( x6, x2),18));
+       x3 = XOR( x3,ROTATE(PLUS(x15,x11), 7));
+       x7 = XOR( x7,ROTATE(PLUS( x3,x15), 9));
+      x11 = XOR(x11,ROTATE(PLUS( x7, x3),13));
+      x15 = XOR(x15,ROTATE(PLUS(x11, x7),18));
+       x1 = XOR( x1,ROTATE(PLUS( x0, x3), 7));
+       x2 = XOR( x2,ROTATE(PLUS( x1, x0), 9));
+       x3 = XOR( x3,ROTATE(PLUS( x2, x1),13));
+       x0 = XOR( x0,ROTATE(PLUS( x3, x2),18));
+       x6 = XOR( x6,ROTATE(PLUS( x5, x4), 7));
+       x7 = XOR( x7,ROTATE(PLUS( x6, x5), 9));
+       x4 = XOR( x4,ROTATE(PLUS( x7, x6),13));
+       x5 = XOR( x5,ROTATE(PLUS( x4, x7),18));
+      x11 = XOR(x11,ROTATE(PLUS(x10, x9), 7));
+       x8 = XOR( x8,ROTATE(PLUS(x11,x10), 9));
+       x9 = XOR( x9,ROTATE(PLUS( x8,x11),13));
+      x10 = XOR(x10,ROTATE(PLUS( x9, x8),18));
+      x12 = XOR(x12,ROTATE(PLUS(x15,x14), 7));
+      x13 = XOR(x13,ROTATE(PLUS(x12,x15), 9));
+      x14 = XOR(x14,ROTATE(PLUS(x13,x12),13));
+      x15 = XOR(x15,ROTATE(PLUS(x14,x13),18));
+    }
+    /* Add the input words back in to obtain this block's keystream. */
+    x0 = PLUS(x0,j0);
+    x1 = PLUS(x1,j1);
+    x2 = PLUS(x2,j2);
+    x3 = PLUS(x3,j3);
+    x4 = PLUS(x4,j4);
+    x5 = PLUS(x5,j5);
+    x6 = PLUS(x6,j6);
+    x7 = PLUS(x7,j7);
+    x8 = PLUS(x8,j8);
+    x9 = PLUS(x9,j9);
+    x10 = PLUS(x10,j10);
+    x11 = PLUS(x11,j11);
+    x12 = PLUS(x12,j12);
+    x13 = PLUS(x13,j13);
+    x14 = PLUS(x14,j14);
+    x15 = PLUS(x15,j15);
+
+    /* XOR the keystream with the message, read as little-endian words. */
+    x0 = XOR(x0,U8TO32_LITTLE(m + 0));
+    x1 = XOR(x1,U8TO32_LITTLE(m + 4));
+    x2 = XOR(x2,U8TO32_LITTLE(m + 8));
+    x3 = XOR(x3,U8TO32_LITTLE(m + 12));
+    x4 = XOR(x4,U8TO32_LITTLE(m + 16));
+    x5 = XOR(x5,U8TO32_LITTLE(m + 20));
+    x6 = XOR(x6,U8TO32_LITTLE(m + 24));
+    x7 = XOR(x7,U8TO32_LITTLE(m + 28));
+    x8 = XOR(x8,U8TO32_LITTLE(m + 32));
+    x9 = XOR(x9,U8TO32_LITTLE(m + 36));
+    x10 = XOR(x10,U8TO32_LITTLE(m + 40));
+    x11 = XOR(x11,U8TO32_LITTLE(m + 44));
+    x12 = XOR(x12,U8TO32_LITTLE(m + 48));
+    x13 = XOR(x13,U8TO32_LITTLE(m + 52));
+    x14 = XOR(x14,U8TO32_LITTLE(m + 56));
+    x15 = XOR(x15,U8TO32_LITTLE(m + 60));
+
+    /* Advance the 64-bit block counter: j8 is the low word, j9 the carry. */
+    j8 = PLUSONE(j8);
+    if (!j8) {
+      j9 = PLUSONE(j9);
+      /* stopping at 2^70 bytes per nonce is user's responsibility */
+    }
+
+    /* Write the 64 output bytes (into tmp for a partial block). */
+    U32TO8_LITTLE(c + 0,x0);
+    U32TO8_LITTLE(c + 4,x1);
+    U32TO8_LITTLE(c + 8,x2);
+    U32TO8_LITTLE(c + 12,x3);
+    U32TO8_LITTLE(c + 16,x4);
+    U32TO8_LITTLE(c + 20,x5);
+    U32TO8_LITTLE(c + 24,x6);
+    U32TO8_LITTLE(c + 28,x7);
+    U32TO8_LITTLE(c + 32,x8);
+    U32TO8_LITTLE(c + 36,x9);
+    U32TO8_LITTLE(c + 40,x10);
+    U32TO8_LITTLE(c + 44,x11);
+    U32TO8_LITTLE(c + 48,x12);
+    U32TO8_LITTLE(c + 52,x13);
+    U32TO8_LITTLE(c + 56,x14);
+    U32TO8_LITTLE(c + 60,x15);
+
+    if (bytes <= 64) {
+      /* Last block: copy a partial result out of tmp, persist the counter. */
+      if (bytes < 64) {
+        for (i = 0;i < bytes;++i) ctarget[i] = c[i];
+      }
+      x->input[8] = j8;
+      x->input[9] = j9;
+      return;
+    }
+    bytes -= 64;
+    c += 64;
+    m += 64;
+  }
+}
+
+/*
+ * Decrypt `bytes` bytes from c into m. Decryption is the same keystream XOR
+ * as encryption, so this simply forwards to ECRYPT_encrypt_bytes() with the
+ * ciphertext as input and the plaintext buffer as output.
+ */
+void ECRYPT_decrypt_bytes(ECRYPT_ctx *x,const u8 *c,u8 *m,u32 bytes)
+{
+  ECRYPT_encrypt_bytes(x,c,m,bytes);
+}
+
+/*
+ * Produce `bytes` bytes of raw keystream in `stream`.
+ *
+ * The buffer is first cleared and then encrypted in place: XORing the
+ * keystream into zeros leaves the keystream itself.
+ */
+void ECRYPT_keystream_bytes(ECRYPT_ctx *x,u8 *stream,u32 bytes)
+{
+  u8 *cursor = stream;
+  u32 remaining = bytes;
+
+  while (remaining != 0) {
+    *cursor = 0;
+    ++cursor;
+    --remaining;
+  }
+
+  ECRYPT_encrypt_bytes(x,stream,stream,bytes);
+}
--- a/src/crypto/astrobwt/sha3.cpp
+++ b/src/crypto/astrobwt/sha3.cpp
@ -0,0 +1,258 @@
+/* -------------------------------------------------------------------------
+ * Works when compiled for either 32-bit or 64-bit targets, optimized for 
+ * 64 bit.
+ *
+ * Canonical implementation of Init/Update/Finalize for SHA-3 byte input. 
+ *
+ * SHA3-256, SHA3-384, SHA3-512 are implemented. SHA3-224 can easily be added.
+ *
+ * Based on code from http://keccak.noekeon.org/ .
+ *
+ * I place the code that I wrote into public domain, free to use. 
+ *
+ * I would appreciate if you give credits to this work if you used it to 
+ * write or test your code.
+ *
+ * Aug 2015. Andrey Jivsov. crypto@brainhub.org
+ * ---------------------------------------------------------------------- */
+
+#include <stdio.h>
+#include <stdint.h>
+#include <string.h>
+
+#include "sha3.h"
+#include "crypto/common/keccak.h"
+
+#define SHA3_ASSERT( x )
+#if defined(_MSC_VER)
+#define SHA3_TRACE( format, ...)
+#define SHA3_TRACE_BUF( format, buf, l, ...)
+#else
+#define SHA3_TRACE(format, args...)
+#define SHA3_TRACE_BUF(format, buf, l, args...)
+#endif
+
+/* 
+ * This flag is used to configure "pure" Keccak, as opposed to NIST SHA3.
+ */
+#define SHA3_USE_KECCAK_FLAG 0x80000000
+#define SHA3_CW(x) ((x) & (~SHA3_USE_KECCAK_FLAG))
+
+
+#if defined(_MSC_VER)
+#define SHA3_CONST(x) x
+#else
+#define SHA3_CONST(x) x##L
+#endif
+
+#define KECCAK_ROUNDS 24
+
+
+/* *************************** Public Interface ************************ */
+
+/* For Init or Reset call these: */
+/*
+ * Reset the context behind `priv` for a digest of `bitSize` bits.
+ * Only 256, 384 and 512 are accepted; any other size returns
+ * SHA3_RETURN_BAD_PARAMS and leaves the context untouched.
+ */
+sha3_return_t
+sha3_Init(void *priv, unsigned bitSize) {
+    switch( bitSize ) {
+    case 256:
+    case 384:
+    case 512:
+        break;
+    default:
+        return SHA3_RETURN_BAD_PARAMS;
+    }
+
+    sha3_context *state = static_cast<sha3_context *>(priv);
+    memset(state, 0, sizeof(*state));
+    /* the capacity is twice the digest size, counted in 64-bit words */
+    state->capacityWords = 2 * bitSize / (8 * sizeof(uint64_t));
+    return SHA3_RETURN_OK;
+}
+
+/* Convenience wrapper: sha3_Init() for a 256-bit digest. The status code
+ * returned by sha3_Init() is discarded. */
+void
+sha3_Init256(void *priv)
+{
+    sha3_Init(priv, 256);
+}
+
+/* Convenience wrapper: sha3_Init() for a 384-bit digest. The status code
+ * returned by sha3_Init() is discarded. */
+void
+sha3_Init384(void *priv)
+{
+    sha3_Init(priv, 384);
+}
+
+/* Convenience wrapper: sha3_Init() for a 512-bit digest. The status code
+ * returned by sha3_Init() is discarded. */
+void
+sha3_Init512(void *priv)
+{
+    sha3_Init(priv, 512);
+}
+
+/*
+ * Switch the context to "pure" Keccak padding when SHA3_FLAGS_KECCAK is
+ * requested. Bits other than SHA3_FLAGS_KECCAK are ignored and the flags
+ * that were recognised are returned. The marker is only ever OR-ed into
+ * the context: passing SHA3_FLAGS_NONE does not clear a marker that was
+ * set earlier.
+ */
+SHA3_FLAGS
+sha3_SetFlags(void *priv, SHA3_FLAGS flags)
+{
+    const int masked = static_cast<int>(flags) & SHA3_FLAGS_KECCAK;
+    const SHA3_FLAGS accepted = static_cast<SHA3_FLAGS>(masked);
+
+    if( accepted == SHA3_FLAGS_KECCAK ) {
+        sha3_context *state = static_cast<sha3_context *>(priv);
+        state->capacityWords |= SHA3_USE_KECCAK_FLAG;
+    }
+    return accepted;
+}
+
+
+/* Accumulate `count` input bytes into the partial word ctx->saved, lowest
+ * byte first (endian-independent), and return the advanced input pointer.
+ * The caller guarantees that the bytes still fit into the current word. */
+static const uint8_t *
+sha3_BufferBytes(sha3_context *ctx, const uint8_t *buf, size_t count)
+{
+    while (count--)
+        ctx->saved |= (uint64_t) (*(buf++)) << ((ctx->byteIndex++) * 8);
+    return buf;
+}
+
+/* XOR one complete 64-bit word into the sponge at the current position;
+ * once the rate part of the state has been filled, run the permutation and
+ * restart at word 0. */
+static void
+sha3_AbsorbWord(sha3_context *ctx, uint64_t word)
+{
+    ctx->s[ctx->wordIndex] ^= word;
+    if(++ctx->wordIndex ==
+            (SHA3_KECCAK_SPONGE_WORDS - SHA3_CW(ctx->capacityWords))) {
+        xmrig::keccakf(ctx->s, KECCAK_ROUNDS);
+        ctx->wordIndex = 0;
+    }
+}
+
+/*
+ * Absorb `len` bytes from `bufIn` into the hash state behind `priv`.
+ *
+ * The input is consumed in three stages:
+ *   1. bytes that complete a partially buffered word (if any),
+ *   2. whole 64-bit words taken directly from the input,
+ *   3. the remaining 0..7 bytes, which are buffered in ctx->saved for the
+ *      next call or for sha3_Finalize().
+ * May be called any number of times between sha3_Init() and
+ * sha3_Finalize().
+ */
+void
+sha3_Update(void *priv, void const *bufIn, size_t len)
+{
+    sha3_context *ctx = static_cast<sha3_context *>(priv);
+
+    /* 0...7 -- how much is needed to have a word */
+    const unsigned old_tail = (8 - ctx->byteIndex) & 7;
+
+    const uint8_t *buf = reinterpret_cast<const uint8_t*>(bufIn);
+
+    SHA3_TRACE_BUF("called to update with:", buf, len);
+
+    SHA3_ASSERT(ctx->byteIndex < 8);
+    SHA3_ASSERT(ctx->wordIndex < sizeof(ctx->s) / sizeof(ctx->s[0]));
+
+    if(len < old_tail) {        /* not enough input to complete the
+                                 * buffered word: store it and return */
+        SHA3_TRACE("because %d<%d, store it and return", (unsigned)len,
+                (unsigned)old_tail);
+        sha3_BufferBytes(ctx, buf, len);
+        SHA3_ASSERT(ctx->byteIndex < 8);
+        return;
+    }
+
+    if(old_tail) {              /* will have one word to process */
+        SHA3_TRACE("completing one word with %d bytes", (unsigned)old_tail);
+        len -= old_tail;
+        buf = sha3_BufferBytes(ctx, buf, old_tail);
+        SHA3_ASSERT(ctx->byteIndex == 8);
+
+        /* the buffered word is complete: reset the buffer and add the
+         * word to the sponge */
+        const uint64_t completed = ctx->saved;
+        ctx->byteIndex = 0;
+        ctx->saved = 0;
+        sha3_AbsorbWord(ctx, completed);
+    }
+
+    /* now work in full words directly from input */
+
+    SHA3_ASSERT(ctx->byteIndex == 0);
+
+    const size_t words = len / sizeof(uint64_t);
+    const size_t tail = len - words * sizeof(uint64_t);
+
+    SHA3_TRACE("have %d full words to process", (unsigned)words);
+
+    for(size_t i = 0; i < words; i++, buf += sizeof(uint64_t)) {
+        /* little-endian load, independent of the host byte order */
+        const uint64_t t = (uint64_t) (buf[0]) |
+                ((uint64_t) (buf[1]) << 8 * 1) |
+                ((uint64_t) (buf[2]) << 8 * 2) |
+                ((uint64_t) (buf[3]) << 8 * 3) |
+                ((uint64_t) (buf[4]) << 8 * 4) |
+                ((uint64_t) (buf[5]) << 8 * 5) |
+                ((uint64_t) (buf[6]) << 8 * 6) |
+                ((uint64_t) (buf[7]) << 8 * 7);
+#if defined(__x86_64__ ) || defined(__i386__)
+        SHA3_ASSERT(memcmp(&t, buf, 8) == 0);
+#endif
+        sha3_AbsorbWord(ctx, t);
+    }
+
+    SHA3_TRACE("have %d bytes left to process, save them", (unsigned)tail);
+
+    /* finally, save the partial word */
+    SHA3_ASSERT(ctx->byteIndex == 0 && tail < 8);
+    sha3_BufferBytes(ctx, buf, tail);
+    SHA3_ASSERT(ctx->byteIndex < 8);
+    SHA3_TRACE("Have saved=0x%016" PRIx64 " at the end", ctx->saved);
+}
+
+/* Finish the hash: apply the padding, run the final permutation and
+ * serialise the state into bytes.
+ * The padding block is 0x01 || 0x00* || 0x80. The first and the last
+ * padding byte are always present, but they can be the same byte.
+ * Returns a pointer to the byte view of the state stored in the context;
+ * the digest is its leading bytes.
+ */
+void const *
+sha3_Finalize(void *priv)
+{
+    sha3_context *state = static_cast<sha3_context *>(priv);
+
+    SHA3_TRACE("called with %d bytes in the buffer", state->byteIndex);
+
+    /* First padding byte. Pure Keccak starts the padding with a single 1
+     * bit. SHA-3 first appends the 2-bit suffix 01 (the 0x02) and only
+     * then the padding 1 bit, which therefore moves up to 1<<2. */
+    const bool keccakMode = (state->capacityWords & SHA3_USE_KECCAK_FLAG) != 0;
+    const uint64_t firstPadByte = keccakMode ? 1 : (0x02 | (1 << 2));
+    const uint64_t padWord = firstPadByte << (state->byteIndex * 8);
+
+    /* add the buffered partial word together with the first padding byte */
+    state->s[state->wordIndex] ^= state->saved ^ padWord;
+
+    /* last padding bit: the top bit of the final word of the rate */
+    const size_t lastRateWord =
+            SHA3_KECCAK_SPONGE_WORDS - SHA3_CW(state->capacityWords) - 1;
+    state->s[lastRateWord] ^= SHA3_CONST(0x8000000000000000UL);
+    xmrig::keccakf(state->s, KECCAK_ROUNDS);
+
+    /* Store every word of the state as little-endian bytes. On
+     * little-endian platforms this rewrites the same values. */
+    for(size_t lane = 0; lane < SHA3_KECCAK_SPONGE_WORDS; lane++) {
+        /* s and sb share storage: read the whole word before any of its
+         * bytes is overwritten */
+        uint64_t bits = state->s[lane];
+        for(unsigned b = 0; b < 8; b++, bits >>= 8)
+            state->sb[lane * 8 + b] = (uint8_t) bits;
+    }
+
+    SHA3_TRACE_BUF("Hash: (first 32 bytes)", state->sb, 256 / 8);
+
+    return (state->sb);
+}
+
+/*
+ * One-shot hashing: initialise a temporary context for `bitSize` bits,
+ * apply `flags`, absorb `inBytes` bytes from `in` and copy the digest to
+ * `out`. At most bitSize/8 bytes are written; a smaller `outBytes`
+ * truncates the digest. Returns SHA3_RETURN_BAD_PARAMS when the bit size
+ * is rejected by sha3_Init() or when `flags` contains bits that
+ * sha3_SetFlags() does not accept.
+ */
+sha3_return_t sha3_HashBuffer( unsigned bitSize, enum SHA3_FLAGS flags, const void *in, unsigned inBytes, void *out, unsigned outBytes ) {
+    sha3_context state;
+
+    const sha3_return_t status = sha3_Init(&state, bitSize);
+    if( status != SHA3_RETURN_OK )
+        return status;
+
+    if( sha3_SetFlags(&state, flags) != flags )
+        return SHA3_RETURN_BAD_PARAMS;
+
+    sha3_Update(&state, in, inBytes);
+    const void *digest = sha3_Finalize(&state);
+
+    /* never copy more than the digest holds */
+    const unsigned digestBytes = bitSize / 8;
+    memcpy(out, digest, outBytes < digestBytes ? outBytes : digestBytes);
+    return SHA3_RETURN_OK;
+}
--- a/src/crypto/astrobwt/sha3.h
+++ b/src/crypto/astrobwt/sha3.h
@ -0,0 +1,71 @@
+#ifndef SHA3_H
+#define SHA3_H
+
+/* -------------------------------------------------------------------------
+ * Works when compiled for either 32-bit or 64-bit targets, optimized for 
+ * 64 bit.
+ *
+ * Canonical implementation of Init/Update/Finalize for SHA-3 byte input. 
+ *
+ * SHA3-256, SHA3-384, SHA3-512 are implemented. SHA3-224 can easily be added.
+ *
+ * Based on code from http://keccak.noekeon.org/ .
+ *
+ * I place the code that I wrote into public domain, free to use. 
+ *
+ * I would appreciate if you give credits to this work if you used it to 
+ * write or test your code.
+ *
+ * Aug 2015. Andrey Jivsov. crypto@brainhub.org
+ * ---------------------------------------------------------------------- */
+
+/* 'Words' here refers to uint64_t; the 1600-bit state is
+ * 1600 / 8 / sizeof(uint64_t) of them. */
+#define SHA3_KECCAK_SPONGE_WORDS \
+	(((1600)/8/*bits to byte*/)/sizeof(uint64_t))
+/* Streaming hash state shared by sha3_Init / sha3_Update / sha3_Finalize. */
+typedef struct sha3_context_ {
+    uint64_t saved;             /* the portion of the input message that we
+                                 * didn't consume yet: up to 7 bytes of a
+                                 * partial word, lowest byte first */
+    union {                     /* Keccak's state; both members alias the
+                                 * same storage */
+        uint64_t s[SHA3_KECCAK_SPONGE_WORDS];       /* word view */
+        uint8_t sb[SHA3_KECCAK_SPONGE_WORDS * 8];   /* byte view, returned
+                                                     * by sha3_Finalize */
+    };
+    unsigned byteIndex;         /* 0..7--the next byte after the set one
+                                 * (starts from 0; 0--none are buffered) */
+    unsigned wordIndex;         /* 0..24--the next word to integrate input
+                                 * (starts from 0); sha3_Update resets it to
+                                 * 0 whenever the rate part is full */
+    unsigned capacityWords;     /* the double size of the hash output in
+                                 * words (e.g. 16 for Keccak 512); the top
+                                 * bit additionally carries the "pure
+                                 * Keccak" marker set by sha3_SetFlags */
+} sha3_context;
+
+/* Padding variant selector accepted by sha3_SetFlags / sha3_HashBuffer. */
+enum SHA3_FLAGS {
+    SHA3_FLAGS_NONE=0,          /* NIST SHA-3 padding */
+    SHA3_FLAGS_KECCAK=1         /* "pure" Keccak padding */
+};
+
+/* Status codes returned by sha3_Init and sha3_HashBuffer. */
+enum SHA3_RETURN {
+    SHA3_RETURN_OK=0,           /* success */
+    SHA3_RETURN_BAD_PARAMS=1    /* unsupported bit size or flags */
+};
+typedef enum SHA3_RETURN sha3_return_t;
+
+/* For Init or Reset call these: */
+/* bitSize must be 256, 384 or 512; anything else yields
+ * SHA3_RETURN_BAD_PARAMS. `priv` points to a sha3_context. */
+sha3_return_t sha3_Init(void *priv, unsigned bitSize);
+
+/* Fixed-size shortcuts for sha3_Init; they return nothing. */
+void sha3_Init256(void *priv);
+void sha3_Init384(void *priv);
+void sha3_Init512(void *priv);
+
+/* Enables "pure" Keccak padding when SHA3_FLAGS_KECCAK is passed; returns
+ * the flags that were accepted. */
+SHA3_FLAGS sha3_SetFlags(void *priv, SHA3_FLAGS);
+
+/* Absorbs `len` bytes from `bufIn`; may be called repeatedly. */
+void sha3_Update(void *priv, void const *bufIn, size_t len);
+
+/* Pads and finishes the hash; returns a pointer to the digest bytes, which
+ * live inside the context. */
+void const *sha3_Finalize(void *priv);
+
+/* Single-call hashing */
+sha3_return_t sha3_HashBuffer( 
+    unsigned bitSize,   /* 256, 384, 512 */
+    SHA3_FLAGS flags, /* SHA3_FLAGS_NONE or SHA3_FLAGS_KECCAK */
+    const void *in, unsigned inBytes, 
+    void *out, unsigned outBytes );     /* up to bitSize/8; truncation OK */
+
+#endif
--- a/src/crypto/cn/CnHash.cpp
+++ b/src/crypto/cn/CnHash.cpp
@ -43,6 +43,11 @@
 #endif


+#ifdef XMRIG_ALGO_ASTROBWT
+#   include "crypto/astrobwt/AstroBWT.h"
+#endif
+
+
 #define ADD_FN(algo) \
    m_map[algo][AV_SINGLE][Assembly::NONE]      = cryptonight_single_hash<algo, false>; \
    m_map[algo][AV_SINGLE_SOFT][Assembly::NONE] = cryptonight_single_hash<algo, true>;  \
@ -277,6 +282,11 @@ xmrig::CnHash::CnHash()
    m_map[Algorithm::AR2_WRKZ][AV_SINGLE_SOFT][Assembly::NONE]   = argon2::single_hash<Algorithm::AR2_WRKZ>;
 #   endif

+#   ifdef XMRIG_ALGO_ASTROBWT
+    m_map[Algorithm::ASTROBWT_DERO][AV_SINGLE][Assembly::NONE]      = astrobwt::single_hash<Algorithm::ASTROBWT_DERO>;
+    m_map[Algorithm::ASTROBWT_DERO][AV_SINGLE_SOFT][Assembly::NONE] = astrobwt::single_hash<Algorithm::ASTROBWT_DERO>;
+#   endif
+
 #   ifdef XMRIG_FEATURE_ASM
    patchAsmVariants();
 #   endif
--- a/src/crypto/cn/CryptoNight_test.h
+++ b/src/crypto/cn/CryptoNight_test.h
@ -404,6 +404,24 @@ const static uint8_t argon2_wrkz_test_out[160] = {
 #endif


+#ifdef XMRIG_ALGO_ASTROBWT
+// "astrobwt/dero"
+// Test vector for the "astrobwt/dero" algorithm, only compiled in when
+// XMRIG_ALGO_ASTROBWT is defined. The table is 160 bytes long, but only the
+// first 32 bytes (one 256-bit value) are populated; the remaining 128 bytes
+// are zero.
+// NOTE(review): presumably the 160-byte size only mirrors the other
+// *_test_out tables and a single hash is ever compared for this algorithm --
+// confirm against the self-test code.
+const static uint8_t astrobwt_dero_test_out[160] = {
+    0x7E, 0x88, 0x44, 0xF2, 0xD6, 0xB7, 0xA4, 0x34, 0x98, 0xFE, 0x6D, 0x22, 0x65, 0x27, 0x68, 0x90,
+    0x23, 0xDA, 0x8A, 0x52, 0xF9, 0xFC, 0x4E, 0xC6, 0x9E, 0x5A, 0xAA, 0xA6, 0x3E, 0xDC, 0xE1, 0xC1,
+    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+};
+
+#endif
+
+
 } // namespace xmrig


--- a/src/crypto/common/Algorithm.cpp
+++ b/src/crypto/common/Algorithm.cpp
@ -124,6 +124,9 @@ static AlgoName const algorithm_names[] = {
    { "chukwa",                    nullptr,            Algorithm::AR2_CHUKWA      },
    { "argon2/wrkz",               nullptr,            Algorithm::AR2_WRKZ        },
 #   endif
+#   ifdef XMRIG_ALGO_ASTROBWT
+    { "astrobwt/dero",             nullptr,            Algorithm::ASTROBWT_DERO   },
+#   endif
 };


@ -210,6 +213,18 @@ size_t xmrig::Algorithm::l3() const
    }
 #   endif

+#   ifdef XMRIG_ALGO_ASTROBWT
+    if (f == ASTROBWT) {
+        switch (m_id) {
+        case ASTROBWT_DERO:
+            return oneMiB * 20;
+
+        default:
+            break;
+        }
+    }
+#   endif
+
    return 0;
 }

@ -228,6 +243,12 @@ uint32_t xmrig::Algorithm::maxIntensity() const
    }
 #   endif

+#   ifdef XMRIG_ALGO_ASTROBWT
+    if (family() == ASTROBWT) {
+        return 1;
+    }
+#   endif
+
 #   ifdef XMRIG_ALGO_CN_GPU
    if (m_id == CN_GPU) {
        return 1;
@ -291,6 +312,11 @@ xmrig::Algorithm::Family xmrig::Algorithm::family(Id id)
        return ARGON2;
 #   endif

+#   ifdef XMRIG_ALGO_ASTROBWT
+    case ASTROBWT_DERO:
+        return ASTROBWT;
+#   endif
+
    default:
        break;
    }
--- a/src/crypto/common/Algorithm.h
+++ b/src/crypto/common/Algorithm.h
@ -71,6 +71,7 @@ public:
        RX_SFX,        // "rx/sfx"           RandomSFX (Safex Cash).
        AR2_CHUKWA,    // "argon2/chukwa"    Argon2id (Chukwa).
        AR2_WRKZ,      // "argon2/wrkz"      Argon2id (WRKZ)
+        ASTROBWT_DERO, // "astrobwt/dero"    AstroBWT (Dero)
        MAX
    };

@ -81,7 +82,8 @@ public:
        CN_HEAVY,
        CN_PICO,
        RANDOM_X,
-        ARGON2
+        ARGON2,
+        ASTROBWT
    };

    inline Algorithm() = default;
--- a/src/crypto/common/Coin.cpp
+++ b/src/crypto/common/Coin.cpp
@ -50,7 +50,8 @@ static CoinName const coin_names[] = {
    { "monero",     Coin::MONERO },
    { "xmr",        Coin::MONERO },
    { "arqma",      Coin::ARQMA  },
-    { "arq",        Coin::ARQMA  }
+    { "arq",        Coin::ARQMA  },
+    { "dero",       Coin::DERO   },
 };


@ -67,6 +68,9 @@ xmrig::Algorithm::Id xmrig::Coin::algorithm(uint8_t blobVersion) const
    case ARQMA:
        return (blobVersion >= 15) ? Algorithm::RX_ARQ : Algorithm::CN_PICO_0;

+    case DERO:
+        return (blobVersion >= 4) ? Algorithm::ASTROBWT_DERO : Algorithm::CN_0;
+
    case INVALID:
        break;
    }
--- a/src/crypto/common/Coin.h
+++ b/src/crypto/common/Coin.h
@ -40,7 +40,8 @@ public:
    enum Id : int {
        INVALID = -1,
        MONERO,
-        ARQMA
+        ARQMA,
+        DERO
    };