Squashed 'src/secp256k1/' changes from 50cc6ab..1897b8e

1897b8e Merge pull request #229 efc571c Add simple testcases for signing with rfc6979 extra entropy. 1573a10 Add ability to pass extra entropy to rfc6979 3087bc4 Merge pull request #228 d9b9f11 Merge pull request #218 0065a8f Eliminate multiple-returns from secp256k1.c. 354ffa3 Make secp256k1_ec_pubkey_create reject oversized secrets. 27bc131 Silence some warnings from pedantic static analysis tools, improve compatibility with C++. 3b7ea63 Merge pull request #221 f789c5b Merge pull request #215 4bc273b Merge pull request #222 137a8ec Merge pull request #216 7c3771d Disable overlength-strings warnings. 8956111 use 128-bit hex seed 02efd06 Use RFC6979 for test PRNGs ae55e85 Use faster byteswapping and avoid alignment-increasing casts. 443cd4b Get rid of hex format and some binary conversions 0bada0e Merge #214: Improve signing API documentation & specification 8030d7c Improve signing API documentation & specification 7b2fc1c Merge #213: Removed gotos, which are hard to trace and maintain. 11690d3 Removed gotos, which are hard to trace and maintain. 122a1ec Merge pull request #205 035406d Merge pull request #206 2d4cd53 Merge pull request #161 34b898d Additional comments for the testing PRNG and a seeding fix. 6efd6e7 Some comments explaining some of the constants in the code. ffccfd2 x86_64 assembly optimization for scalar_4x64 67cbdf0 Merge pull request #207 039723d Benchmarks for all internal operations 6cc8425 Include a comment on secp256k1_ecdsa_sign explaining low-s. f88343f Merge pull request #203 d61e899 Add group operation counts 2473f17 Merge pull request #202 b5bbce6 Some readme updates, e.g. removal of the GMP field. f0d851e Merge pull request #201 a0ea884 Merge pull request #200 f735446 Convert the rest of the codebase to C89. bf2e1ac Convert tests to C89. (also fixes a use of bare "inline" in field) fc8285f Merge pull request #199 fff412e Merge pull request #197 4be8d6f Centralize the definition of uint128_t and use it uniformly. 
d9543c9 Switch scalar code to C89. fcc48c4 Remove the non-storage cmov 55422b6 Switch ecmult_gen to use storage types 41f8455 Use group element storage type in EC multiplications e68d720 Add group element storage type ff889f7 Field storage type 7137be8 Merge pull request #196 0768bd5 Get rid of variable-length hex string conversions e84e761 Merge pull request #195 792bcdb Covert several more files to C89. 45cdf44 Merge pull request #193 17db09e Merge pull request #194 402878a fix ifdef/ifndef 25b35c7 Convert field code to strict C89 (+ long long, +__int128) 3627437 C89 nits and dead code removal. a9f350d Merge pull request #191 4732d26 Convert the field/group/ecdsa constant initialization to static consts 19f3e76 Remove unused secp256k1_fe_inner_{start, stop} functions f1ebfe3 Convert the scalar constant initialization to static consts git-subtree-dir: src/secp256k1 git-subtree-split: 1897b8e90bbbdcd919427c9a8ae35b420e919d8f
9 years ago · 9d09322b41
38 changed files with 2525 additions and 1465 deletions
--- a/Makefile.am
+++ b/Makefile.am
@ -49,7 +49,7 @@ libsecp256k1_la_LIBADD = $(SECP_LIBS)

 noinst_PROGRAMS =
 if USE_BENCHMARK
-noinst_PROGRAMS += bench_verify bench_recover bench_sign bench_inv
+noinst_PROGRAMS += bench_verify bench_recover bench_sign bench_internal
 bench_verify_SOURCES = src/bench_verify.c
 bench_verify_LDADD = libsecp256k1.la $(SECP_LIBS)
 bench_verify_LDFLAGS = -static
@ -59,10 +59,10 @@ bench_recover_LDFLAGS = -static
 bench_sign_SOURCES = src/bench_sign.c
 bench_sign_LDADD = libsecp256k1.la $(SECP_LIBS)
 bench_sign_LDFLAGS = -static
-bench_inv_SOURCES = src/bench_inv.c
-bench_inv_LDADD = $(SECP_LIBS)
-bench_inv_LDFLAGS = -static
-bench_inv_CPPFLAGS = $(SECP_INCLUDES)
+bench_internal_SOURCES = src/bench_internal.c
+bench_internal_LDADD = $(SECP_LIBS)
+bench_internal_LDFLAGS = -static
+bench_internal_CPPFLAGS = $(SECP_INCLUDES)
 endif

 if USE_TESTS
--- a/README.md
+++ b/README.md
@ -5,25 +5,29 @@ libsecp256k1

 Optimized C library for EC operations on curve secp256k1.

-This library is experimental, so use at your own risk.
+This library is a work in progress and is being used to research best practices. Use at your own risk.

 Features:
-* Low-level field and group operations on secp256k1.
-* ECDSA signing/verification and key generation.
+* secp256k1 ECDSA signing/verification and key generation.
 * Adding/multiplying private/public keys.
 * Serialization/parsing of private keys, public keys, signatures.
+* Constant time, constant memory access signing and pubkey generation.
+* Derandomized DSA (via RFC6979 or with a caller provided function.)
 * Very efficient implementation.

 Implementation details
 ----------------------

 * General
-  * Avoid dynamic memory usage almost everywhere.
+  * No runtime heap allocation.
+  * Extensive testing infrastructure.
+  * Structured to facilitate review and analysis.
+  * Intended to be portable to any system with a C89 compiler and uint64_t support.
+  * Expose only higher level interfaces to minimize the API surface and improve application security. ("Be difficult to use insecurely.")
 * Field operations
  * Optimized implementation of arithmetic modulo the curve's field size (2^256 - 0x1000003D1).
    * Using 5 52-bit limbs (including hand-optimized assembly for x86_64, by Diederik Huys).
    * Using 10 26-bit limbs.
-    * Using GMP.
  * Field inverses and square roots using a sliding window over blocks of 1s (by Peter Dettman).
 * Scalar operations
  * Optimized implementation without data-dependent branches of arithmetic modulo the curve's order.
@ -33,14 +37,15 @@ Implementation details
  * Point addition formula specifically simplified for the curve equation (y^2 = x^3 + 7).
  * Use addition between points in Jacobian and affine coordinates where possible.
  * Use a unified addition/doubling formula where necessary to avoid data-dependent branches.
+  * Point/x comparison without a field inversion by comparison in the Jacobian coordinate space.
 * Point multiplication for verification (a*P + b*G).
  * Use wNAF notation for point multiplicands.
  * Use a much larger window for multiples of G, using precomputed multiples.
  * Use Shamir's trick to do the multiplication with the public key and the generator simultaneously.
-  * Optionally use secp256k1's efficiently-computable endomorphism to split the multiplicands into 4 half-sized ones first.
+  * Optionally (off by default) use secp256k1's efficiently-computable endomorphism to split the P multiplicand into 2 half-sized ones.
 * Point multiplication for signing
  * Use a precomputed table of multiples of powers of 16 multiplied with the generator, so general multiplication becomes a series of additions.
-  * Slice the precomputed table in memory per byte, so memory access to the table becomes uniform.
+  * Access the table with branch-free conditional moves so memory access is uniform.
  * No data-dependent branches
  * The precomputed tables add and eventually subtract points for which no scalar (private key) is known, preventing even an attacker with control over the private key used from controlling the data internally.

@ -52,4 +57,5 @@ libsecp256k1 is built using autotools:
    $ ./autogen.sh
    $ ./configure
    $ make
+    $ ./tests
    $ sudo make install  # optional
--- a/configure.ac
+++ b/configure.ac
@ -5,7 +5,7 @@ AC_CONFIG_MACRO_DIR([build-aux/m4])
 AC_CANONICAL_HOST
 AH_TOP([#ifndef LIBSECP256K1_CONFIG_H])
 AH_TOP([#define LIBSECP256K1_CONFIG_H])
-AH_BOTTOM([#endif //LIBSECP256K1_CONFIG_H])
+AH_BOTTOM([#endif /*LIBSECP256K1_CONFIG_H*/])
 AM_INIT_AUTOMAKE([foreign subdir-objects])
 LT_INIT

@ -22,9 +22,9 @@ if test "x$CFLAGS" = "x"; then
  CFLAGS="-O3 -g"
 fi

-AC_PROG_CC_C99
-if test x"$ac_cv_prog_cc_c99" = x"no"; then
-  AC_MSG_ERROR([c99 compiler support required])
+AC_PROG_CC_C89
+if test x"$ac_cv_prog_cc_c89" = x"no"; then
+  AC_MSG_ERROR([c89 compiler support required])
 fi

 case $host in
@ -70,7 +70,7 @@ esac

 CFLAGS="$CFLAGS -W"

-warn_CFLAGS="-Wall -Wextra -Wcast-align -Wnested-externs -Wshadow -Wstrict-prototypes -Wno-unused-function"
+warn_CFLAGS="-std=c89 -pedantic -Wall -Wextra -Wcast-align -Wnested-externs -Wshadow -Wstrict-prototypes -Wno-unused-function -Wno-long-long -Wno-overlength-strings"
 saved_CFLAGS="$CFLAGS"
 CFLAGS="$CFLAGS $warn_CFLAGS"
 AC_MSG_CHECKING([if ${CC} supports ${warn_CFLAGS}])
@ -305,6 +305,8 @@ if test x"$use_endomorphism" = x"yes"; then
  AC_DEFINE(USE_ENDOMORPHISM, 1, [Define this symbol to use endomorphism optimization])
 fi

+AC_C_BIGENDIAN()
+
 AC_MSG_NOTICE([Using assembly optimizations: $set_asm])
 AC_MSG_NOTICE([Using field implementation: $set_field])
 AC_MSG_NOTICE([Using bignum implementation: $set_bignum])
--- a/include/secp256k1.h
+++ b/include/secp256k1.h
@ -78,7 +78,7 @@ SECP256K1_WARN_UNUSED_RESULT int secp256k1_ecdsa_verify(
 ) SECP256K1_ARG_NONNULL(1) SECP256K1_ARG_NONNULL(2) SECP256K1_ARG_NONNULL(4);

 /** A pointer to a function to deterministically generate a nonce.
- * Returns: 1 if a nonce was succesfully generated. 0 will cause signing to fail.
+ * Returns: 1 if a nonce was successfully generated. 0 will cause signing to fail.
 * In:      msg32:     the 32-byte message hash being signed (will not be NULL)
 *          key32:     pointer to a 32-byte secret key (will not be NULL)
 *          attempt:   how many iterations we have tried to find a nonce.
@ -97,7 +97,10 @@ typedef int (*secp256k1_nonce_function_t)(
  const void *data
 );

-/** An implementation of RFC6979 (using HMAC-SHA256) as nonce generation function. */
+/** An implementation of RFC6979 (using HMAC-SHA256) as nonce generation function.
+ * If a data pointer is passed, it is assumed to be a pointer to 32 bytes of
+ * extra entropy.
+ */
 extern const secp256k1_nonce_function_t secp256k1_nonce_function_rfc6979;

 /** A default safe nonce generation function (currently equal to secp256k1_nonce_function_rfc6979). */
@ -106,15 +109,43 @@ extern const secp256k1_nonce_function_t secp256k1_nonce_function_default;

 /** Create an ECDSA signature.
 *  Returns: 1: signature created
- *           0: the nonce generation function failed
+ *           0: the nonce generation function failed, the private key was invalid, or there is not
+ *              enough space in the signature (as indicated by siglen).
 *  In:      msg32:  the 32-byte message hash being signed (cannot be NULL)
- *           seckey: pointer to a 32-byte secret key (cannot be NULL, assumed to be valid)
+ *           seckey: pointer to a 32-byte secret key (cannot be NULL)
 *           noncefp:pointer to a nonce generation function. If NULL, secp256k1_nonce_function_default is used
 *           ndata:  pointer to arbitrary data used by the nonce generation function (can be NULL)
 *  Out:     sig:    pointer to an array where the signature will be placed (cannot be NULL)
 *  In/Out:  siglen: pointer to an int with the length of sig, which will be updated
- *                   to contain the actual signature length (<=72).
+ *                   to contain the actual signature length (<=72). If 0 is returned, this will be
+ *                   set to zero.
 * Requires starting using SECP256K1_START_SIGN.
+ *
+ * The sig always has an s value in the lower half of the range (From 0x1
+ * to 0x7FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF5D576E7357A4501DDFE92F46681B20A0,
+ * inclusive), unlike many other implementations.
+ * With ECDSA a third-party can forge a second distinct signature
+ * of the same message given a single initial signature without knowing
+ * the key by setting s to its additive inverse mod-order, 'flipping' the
+ * sign of the random point R which is not included in the signature.
+ * Since the forgery is of the same message this isn't universally
+ * problematic, but in systems where message malleability or uniqueness
+ * of signatures is important this can cause issues.  This forgery can be
+ * blocked by all verifiers forcing signers to use a canonical form. The
+ * lower-S form reduces the size of signatures slightly on average when
+ * variable length encodings (such as DER) are used and is cheap to
+ * verify, making it a good choice. Security of always using lower-S is
+ * assured because anyone can trivially modify a signature after the
+ * fact to enforce this property.  Adjusting it inside the signing
+ * function avoids the need to re-serialize or have curve specific
+ * constants outside of the library.  By always using a canonical form
+ * even in applications where it isn't needed it becomes possible to
+ * impose a requirement later if a need is discovered.
+ * No other forms of ECDSA malleability are known and none seem likely,
+ * but there is no formal proof that ECDSA, even with this additional
+ * restriction, is free of other malleability.  Commonly used serialization
+ * schemes will also accept various non-unique encodings, so care should
+ * be taken when this property is required for an application.
 */
 int secp256k1_ecdsa_sign(
  const unsigned char *msg32,
@ -127,12 +158,13 @@ int secp256k1_ecdsa_sign(

 /** Create a compact ECDSA signature (64 byte + recovery id).
 *  Returns: 1: signature created
- *           0: the nonce generation function failed
+ *           0: the nonce generation function failed, or the secret key was invalid.
 *  In:      msg32:  the 32-byte message hash being signed (cannot be NULL)
- *           seckey: pointer to a 32-byte secret key (cannot be NULL, assumed to be valid)
+ *           seckey: pointer to a 32-byte secret key (cannot be NULL)
 *           noncefp:pointer to a nonce generation function. If NULL, secp256k1_nonce_function_default is used
 *           ndata:  pointer to arbitrary data used by the nonce generation function (can be NULL)
 *  Out:     sig:    pointer to a 64-byte array where the signature will be placed (cannot be NULL)
+ *                   In case 0 is returned, the returned signature length will be zero.
 *           recid:  pointer to an int, which will be updated to contain the recovery id (can be NULL)
 * Requires starting using SECP256K1_START_SIGN.
 */
--- a/src/bench.h
+++ b/src/bench.h
@ -17,21 +17,40 @@ static double gettimedouble(void) {
    return tv.tv_usec * 0.000001 + tv.tv_sec;
 }

-void run_benchmark(void (*benchmark)(void*), void (*setup)(void*), void (*teardown)(void*), void* data, int count, int iter) {
+void print_number(double x) { /* Print x with enough decimals that at least three significant digits show. */
+    double y = x;
+    int c = 0; /* number of decimal places to print */
+    if (y < 0.0) y = -y;
+    while (y > 0.0 && y < 100.0) { /* y > 0.0: zero never reaches 100, so it must not enter the loop */
+        y *= 10.0;
+        c++;
+    }
+    printf("%.*f", c, x);
+}
+
+void run_benchmark(char *name, void (*benchmark)(void*), void (*setup)(void*), void (*teardown)(void*), void* data, int count, int iter) {
+    int i;
    double min = HUGE_VAL;
    double sum = 0.0;
    double max = 0.0;
-    for (int i = 0; i < count; i++) {
+    for (i = 0; i < count; i++) {
+        double begin, total;
        if (setup) setup(data);
-        double begin = gettimedouble();
+        begin = gettimedouble();
        benchmark(data);
-        double total = gettimedouble() - begin;
+        total = gettimedouble() - begin;
        if (teardown) teardown(data);
        if (total < min) min = total;
        if (total > max) max = total;
        sum += total;
    }
-    printf("min %.3fus / avg %.3fus / max %.3fus\n", min * 1000000.0 / iter, (sum / count) * 1000000.0 / iter, max * 1000000.0 / iter);
+    printf("%s: min ", name);
+    print_number(min * 1000000.0 / iter);
+    printf("us / avg ");
+    print_number((sum / count) * 1000000.0 / iter);
+    printf("us / max "); /* third value is the slowest run, not a second average */
+    print_number(max * 1000000.0 / iter);
+    printf("us\n");
 }

 #endif
--- a/src/bench_internal.c
+++ b/src/bench_internal.c
@ -0,0 +1,318 @@
+/**********************************************************************
+ * Copyright (c) 2014-2015 Pieter Wuille                              *
+ * Distributed under the MIT software license, see the accompanying   *
+ * file COPYING or http://www.opensource.org/licenses/mit-license.php.*
+ **********************************************************************/
+#include <stdio.h>
+
+#include "include/secp256k1.h"
+
+#include "util.h"
+#include "hash_impl.h"
+#include "num_impl.h"
+#include "field_impl.h"
+#include "group_impl.h"
+#include "scalar_impl.h"
+#include "ecmult_impl.h"
+#include "bench.h"
+
+typedef struct {
+    secp256k1_scalar_t scalar_x, scalar_y;
+    secp256k1_fe_t fe_x, fe_y;
+    secp256k1_ge_t ge_x, ge_y;
+    secp256k1_gej_t gej_x, gej_y;
+    unsigned char data[32];
+    int wnaf[256];
+} bench_inv_t;
+
+void bench_setup(void* arg) {
+    bench_inv_t *data = (bench_inv_t*)arg;
+
+    static const unsigned char init_x[32] = {
+        0x02, 0x03, 0x05, 0x07, 0x0b, 0x0d, 0x11, 0x13,
+        0x17, 0x1d, 0x1f, 0x25, 0x29, 0x2b, 0x2f, 0x35,
+        0x3b, 0x3d, 0x43, 0x47, 0x49, 0x4f, 0x53, 0x59,
+        0x61, 0x65, 0x67, 0x6b, 0x6d, 0x71, 0x7f, 0x83
+    };
+
+    static const unsigned char init_y[32] = {
+        0x82, 0x83, 0x85, 0x87, 0x8b, 0x8d, 0x81, 0x83,
+        0x97, 0xad, 0xaf, 0xb5, 0xb9, 0xbb, 0xbf, 0xc5,
+        0xdb, 0xdd, 0xe3, 0xe7, 0xe9, 0xef, 0xf3, 0xf9,
+        0x11, 0x15, 0x17, 0x1b, 0x1d, 0xb1, 0xbf, 0xd3
+    };
+
+    secp256k1_scalar_set_b32(&data->scalar_x, init_x, NULL);
+    secp256k1_scalar_set_b32(&data->scalar_y, init_y, NULL);
+    secp256k1_fe_set_b32(&data->fe_x, init_x);
+    secp256k1_fe_set_b32(&data->fe_y, init_y);
+    CHECK(secp256k1_ge_set_xo_var(&data->ge_x, &data->fe_x, 0));
+    CHECK(secp256k1_ge_set_xo_var(&data->ge_y, &data->fe_y, 1));
+    secp256k1_gej_set_ge(&data->gej_x, &data->ge_x);
+    secp256k1_gej_set_ge(&data->gej_y, &data->ge_y);
+    memcpy(data->data, init_x, 32);
+}
+
+void bench_scalar_add(void* arg) {
+    int i;
+    bench_inv_t *data = (bench_inv_t*)arg;
+
+    for (i = 0; i < 2000000; i++) {
+        secp256k1_scalar_add(&data->scalar_x, &data->scalar_x, &data->scalar_y);
+    }
+}
+
+void bench_scalar_negate(void* arg) {
+    int i;
+    bench_inv_t *data = (bench_inv_t*)arg;
+
+    for (i = 0; i < 2000000; i++) {
+        secp256k1_scalar_negate(&data->scalar_x, &data->scalar_x);
+    }
+}
+
+void bench_scalar_sqr(void* arg) {
+    int i;
+    bench_inv_t *data = (bench_inv_t*)arg;
+
+    for (i = 0; i < 200000; i++) {
+        secp256k1_scalar_sqr(&data->scalar_x, &data->scalar_x);
+    }
+}
+
+void bench_scalar_mul(void* arg) {
+    int i;
+    bench_inv_t *data = (bench_inv_t*)arg;
+
+    for (i = 0; i < 200000; i++) {
+        secp256k1_scalar_mul(&data->scalar_x, &data->scalar_x, &data->scalar_y);
+    }
+}
+
+#ifdef USE_ENDOMORPHISM
+void bench_scalar_split(void* arg) {
+    int i;
+    bench_inv_t *data = (bench_inv_t*)arg;
+
+    for (i = 0; i < 20000; i++) {
+        secp256k1_scalar_t l, r;
+        secp256k1_scalar_split_lambda_var(&l, &r, &data->scalar_x);
+        secp256k1_scalar_add(&data->scalar_x, &data->scalar_x, &data->scalar_y);
+    }
+}
+#endif
+
+void bench_scalar_inverse(void* arg) {
+    int i;
+    bench_inv_t *data = (bench_inv_t*)arg;
+
+    for (i = 0; i < 2000; i++) {
+        secp256k1_scalar_inverse(&data->scalar_x, &data->scalar_x);
+        secp256k1_scalar_add(&data->scalar_x, &data->scalar_x, &data->scalar_y);
+    }
+}
+
+void bench_scalar_inverse_var(void* arg) {
+    int i;
+    bench_inv_t *data = (bench_inv_t*)arg;
+
+    for (i = 0; i < 2000; i++) {
+        secp256k1_scalar_inverse_var(&data->scalar_x, &data->scalar_x);
+        secp256k1_scalar_add(&data->scalar_x, &data->scalar_x, &data->scalar_y);
+    }
+}
+
+void bench_field_normalize(void* arg) {
+    int i;
+    bench_inv_t *data = (bench_inv_t*)arg;
+
+    for (i = 0; i < 2000000; i++) {
+        secp256k1_fe_normalize(&data->fe_x);
+    }
+}
+
+void bench_field_normalize_weak(void* arg) {
+    int i;
+    bench_inv_t *data = (bench_inv_t*)arg;
+
+    for (i = 0; i < 2000000; i++) {
+        secp256k1_fe_normalize_weak(&data->fe_x);
+    }
+}
+
+void bench_field_mul(void* arg) {
+    int i;
+    bench_inv_t *data = (bench_inv_t*)arg;
+
+    for (i = 0; i < 200000; i++) {
+        secp256k1_fe_mul(&data->fe_x, &data->fe_x, &data->fe_y);
+    }
+}
+
+void bench_field_sqr(void* arg) {
+    int i;
+    bench_inv_t *data = (bench_inv_t*)arg;
+
+    for (i = 0; i < 200000; i++) {
+        secp256k1_fe_sqr(&data->fe_x, &data->fe_x);
+    }
+}
+
+void bench_field_inverse(void* arg) {
+    int i;
+    bench_inv_t *data = (bench_inv_t*)arg;
+
+    for (i = 0; i < 20000; i++) {
+        secp256k1_fe_inv(&data->fe_x, &data->fe_x);
+        secp256k1_fe_add(&data->fe_x, &data->fe_y);
+    }
+}
+
+void bench_field_inverse_var(void* arg) {
+    int i;
+    bench_inv_t *data = (bench_inv_t*)arg;
+
+    for (i = 0; i < 20000; i++) {
+        secp256k1_fe_inv_var(&data->fe_x, &data->fe_x);
+        secp256k1_fe_add(&data->fe_x, &data->fe_y);
+    }
+}
+
+void bench_field_sqrt_var(void* arg) {
+    int i;
+    bench_inv_t *data = (bench_inv_t*)arg;
+
+    for (i = 0; i < 20000; i++) {
+        secp256k1_fe_sqrt_var(&data->fe_x, &data->fe_x);
+        secp256k1_fe_add(&data->fe_x, &data->fe_y);
+    }
+}
+
+void bench_group_double_var(void* arg) {
+    int i;
+    bench_inv_t *data = (bench_inv_t*)arg;
+
+    for (i = 0; i < 200000; i++) {
+        secp256k1_gej_double_var(&data->gej_x, &data->gej_x);
+    }
+}
+
+void bench_group_add_var(void* arg) {
+    int i;
+    bench_inv_t *data = (bench_inv_t*)arg;
+
+    for (i = 0; i < 200000; i++) {
+        secp256k1_gej_add_var(&data->gej_x, &data->gej_x, &data->gej_y);
+    }
+}
+
+void bench_group_add_affine(void* arg) {
+    int i;
+    bench_inv_t *data = (bench_inv_t*)arg;
+
+    for (i = 0; i < 200000; i++) {
+        secp256k1_gej_add_ge(&data->gej_x, &data->gej_x, &data->ge_y);
+    }
+}
+
+void bench_group_add_affine_var(void* arg) {
+    int i;
+    bench_inv_t *data = (bench_inv_t*)arg;
+
+    for (i = 0; i < 200000; i++) {
+        secp256k1_gej_add_ge_var(&data->gej_x, &data->gej_x, &data->ge_y);
+    }
+}
+
+void bench_ecmult_wnaf(void* arg) {
+    int i;
+    bench_inv_t *data = (bench_inv_t*)arg;
+
+    for (i = 0; i < 20000; i++) {
+        secp256k1_ecmult_wnaf(data->wnaf, &data->scalar_x, WINDOW_A);
+        secp256k1_scalar_add(&data->scalar_x, &data->scalar_x, &data->scalar_y);
+    }
+}
+
+
+void bench_sha256(void* arg) {
+    int i;
+    bench_inv_t *data = (bench_inv_t*)arg;
+    secp256k1_sha256_t sha;
+
+    for (i = 0; i < 20000; i++) {
+        secp256k1_sha256_initialize(&sha);
+        secp256k1_sha256_write(&sha, data->data, 32);
+        secp256k1_sha256_finalize(&sha, data->data);
+    }
+}
+
+void bench_hmac_sha256(void* arg) {
+    int i;
+    bench_inv_t *data = (bench_inv_t*)arg;
+    secp256k1_hmac_sha256_t hmac;
+
+    for (i = 0; i < 20000; i++) {
+        secp256k1_hmac_sha256_initialize(&hmac, data->data, 32);
+        secp256k1_hmac_sha256_write(&hmac, data->data, 32);
+        secp256k1_hmac_sha256_finalize(&hmac, data->data);
+    }
+}
+
+void bench_rfc6979_hmac_sha256(void* arg) {
+    int i;
+    bench_inv_t *data = (bench_inv_t*)arg;
+    secp256k1_rfc6979_hmac_sha256_t rng;
+
+    for (i = 0; i < 20000; i++) {
+        secp256k1_rfc6979_hmac_sha256_initialize(&rng, data->data, 32, data->data, 32, NULL, 0);
+        secp256k1_rfc6979_hmac_sha256_generate(&rng, data->data, 32);
+    }
+}
+
+
+int have_flag(int argc, char** argv, char *flag) {
+    char** argm = argv + argc;
+    argv++;
+    if (argv == argm) {
+        return 1;
+    }
+    while (argv != NULL && argv != argm) {
+        if (strcmp(*argv, flag) == 0) return 1;
+        argv++;
+    }
+    return 0;
+}
+
+int main(int argc, char **argv) {
+    bench_inv_t data;
+    if (have_flag(argc, argv, "scalar") || have_flag(argc, argv, "add")) run_benchmark("scalar_add", bench_scalar_add, bench_setup, NULL, &data, 10, 2000000);
+    if (have_flag(argc, argv, "scalar") || have_flag(argc, argv, "negate")) run_benchmark("scalar_negate", bench_scalar_negate, bench_setup, NULL, &data, 10, 2000000);
+    if (have_flag(argc, argv, "scalar") || have_flag(argc, argv, "sqr")) run_benchmark("scalar_sqr", bench_scalar_sqr, bench_setup, NULL, &data, 10, 200000);
+    if (have_flag(argc, argv, "scalar") || have_flag(argc, argv, "mul")) run_benchmark("scalar_mul", bench_scalar_mul, bench_setup, NULL, &data, 10, 200000);
+#ifdef USE_ENDOMORPHISM
+    if (have_flag(argc, argv, "scalar") || have_flag(argc, argv, "split")) run_benchmark("scalar_split", bench_scalar_split, bench_setup, NULL, &data, 10, 20000);
+#endif
+    if (have_flag(argc, argv, "scalar") || have_flag(argc, argv, "inverse")) run_benchmark("scalar_inverse", bench_scalar_inverse, bench_setup, NULL, &data, 10, 2000);
+    if (have_flag(argc, argv, "scalar") || have_flag(argc, argv, "inverse")) run_benchmark("scalar_inverse_var", bench_scalar_inverse_var, bench_setup, NULL, &data, 10, 2000);
+
+    if (have_flag(argc, argv, "field") || have_flag(argc, argv, "normalize")) run_benchmark("field_normalize", bench_field_normalize, bench_setup, NULL, &data, 10, 2000000);
+    if (have_flag(argc, argv, "field") || have_flag(argc, argv, "normalize")) run_benchmark("field_normalize_weak", bench_field_normalize_weak, bench_setup, NULL, &data, 10, 2000000);
+    if (have_flag(argc, argv, "field") || have_flag(argc, argv, "sqr")) run_benchmark("field_sqr", bench_field_sqr, bench_setup, NULL, &data, 10, 200000);
+    if (have_flag(argc, argv, "field") || have_flag(argc, argv, "mul")) run_benchmark("field_mul", bench_field_mul, bench_setup, NULL, &data, 10, 200000);
+    if (have_flag(argc, argv, "field") || have_flag(argc, argv, "inverse")) run_benchmark("field_inverse", bench_field_inverse, bench_setup, NULL, &data, 10, 20000);
+    if (have_flag(argc, argv, "field") || have_flag(argc, argv, "inverse")) run_benchmark("field_inverse_var", bench_field_inverse_var, bench_setup, NULL, &data, 10, 20000);
+    if (have_flag(argc, argv, "field") || have_flag(argc, argv, "sqrt")) run_benchmark("field_sqrt_var", bench_field_sqrt_var, bench_setup, NULL, &data, 10, 20000);
+
+    if (have_flag(argc, argv, "group") || have_flag(argc, argv, "double")) run_benchmark("group_double_var", bench_group_double_var, bench_setup, NULL, &data, 10, 200000);
+    if (have_flag(argc, argv, "group") || have_flag(argc, argv, "add")) run_benchmark("group_add_var", bench_group_add_var, bench_setup, NULL, &data, 10, 200000);
+    if (have_flag(argc, argv, "group") || have_flag(argc, argv, "add")) run_benchmark("group_add_affine", bench_group_add_affine, bench_setup, NULL, &data, 10, 200000);
+    if (have_flag(argc, argv, "group") || have_flag(argc, argv, "add")) run_benchmark("group_add_affine_var", bench_group_add_affine_var, bench_setup, NULL, &data, 10, 200000);
+
+    if (have_flag(argc, argv, "ecmult") || have_flag(argc, argv, "wnaf")) run_benchmark("ecmult_wnaf", bench_ecmult_wnaf, bench_setup, NULL, &data, 10, 20000);
+
+    if (have_flag(argc, argv, "hash") || have_flag(argc, argv, "sha256")) run_benchmark("hash_sha256", bench_sha256, bench_setup, NULL, &data, 10, 20000);
+    if (have_flag(argc, argv, "hash") || have_flag(argc, argv, "hmac")) run_benchmark("hash_hmac_sha256", bench_hmac_sha256, bench_setup, NULL, &data, 10, 20000);
+    if (have_flag(argc, argv, "hash") || have_flag(argc, argv, "rng6979")) run_benchmark("hash_rfc6979_hmac_sha256", bench_rfc6979_hmac_sha256, bench_setup, NULL, &data, 10, 20000);
+    return 0;
+}
--- a/src/bench_inv.c
+++ b/src/bench_inv.c
@ -1,52 +0,0 @@
-/**********************************************************************
- * Copyright (c) 2014 Pieter Wuille                                   *
- * Distributed under the MIT software license, see the accompanying   *
- * file COPYING or http://www.opensource.org/licenses/mit-license.php.*
- **********************************************************************/
-#include <stdio.h>
-
-#include "include/secp256k1.h"
-
-#include "util.h"
-#include "num_impl.h"
-#include "field_impl.h"
-#include "group_impl.h"
-#include "scalar_impl.h"
-#include "bench.h"
-
-typedef struct {
-    secp256k1_scalar_t base, x;
-} bench_inv_t;
-
-void bench_inv_setup(void* arg) {
-    bench_inv_t *data = (bench_inv_t*)arg;
-
-    static const unsigned char init[32] = {
-        0x02, 0x03, 0x05, 0x07, 0x0b, 0x0d, 0x11, 0x13,
-        0x17, 0x1d, 0x1f, 0x25, 0x29, 0x2b, 0x2f, 0x35,
-        0x3b, 0x3d, 0x43, 0x47, 0x49, 0x4f, 0x53, 0x59,
-        0x61, 0x65, 0x67, 0x6b, 0x6d, 0x71, 0x7f, 0x83
-    };
-
-    secp256k1_scalar_set_b32(&data->base, init, NULL);
-    secp256k1_scalar_set_b32(&data->x, init, NULL);
-}
-
-void bench_inv(void* arg) {
-    bench_inv_t *data = (bench_inv_t*)arg;
-
-    for (int i=0; i<20000; i++) {
-        secp256k1_scalar_inverse(&data->x, &data->x);
-        secp256k1_scalar_add(&data->x, &data->x, &data->base);
-    }
-}
-
-int main(void) {
-    secp256k1_ge_start();
-
-    bench_inv_t data;
-    run_benchmark(bench_inv, bench_inv_setup, NULL, &data, 10, 20000);
-
-    secp256k1_ge_stop();
-    return 0;
-}
--- a/src/bench_recover.c
+++ b/src/bench_recover.c
@ -14,13 +14,15 @@ typedef struct {
 } bench_recover_t;

 void bench_recover(void* arg) {
+    int i;
    bench_recover_t *data = (bench_recover_t*)arg;
-
    unsigned char pubkey[33];
-    for (int i=0; i<20000; i++) {
+
+    for (i = 0; i < 20000; i++) {
+        int j;
        int pubkeylen = 33;
        CHECK(secp256k1_ecdsa_recover_compact(data->msg, data->sig, pubkey, &pubkeylen, 1, i % 2));
-        for (int j = 0; j < 32; j++) {
+        for (j = 0; j < 32; j++) {
            data->sig[j + 32] = data->msg[j];    /* Move former message to S. */
            data->msg[j] = data->sig[j];         /* Move former R to message. */
            data->sig[j] = pubkey[j + 1];        /* Move recovered pubkey X coordinate to R (which must be a valid X coordinate). */
@ -29,17 +31,18 @@ void bench_recover(void* arg) {
 }

 void bench_recover_setup(void* arg) {
+    int i;
    bench_recover_t *data = (bench_recover_t*)arg;

-    for (int i = 0; i < 32; i++) data->msg[i] = 1 + i;
-    for (int i = 0; i < 64; i++) data->sig[i] = 65 + i;
+    for (i = 0; i < 32; i++) data->msg[i] = 1 + i;
+    for (i = 0; i < 64; i++) data->sig[i] = 65 + i;
 }

 int main(void) {
+    bench_recover_t data;
    secp256k1_start(SECP256K1_START_VERIFY);

-    bench_recover_t data;
-    run_benchmark(bench_recover, bench_recover_setup, NULL, &data, 10, 20000);
+    run_benchmark("ecdsa_recover", bench_recover, bench_recover_setup, NULL, &data, 10, 20000);

    secp256k1_stop();
    return 0;
--- a/src/bench_sign.c
+++ b/src/bench_sign.c
@ -14,20 +14,23 @@ typedef struct {
 } bench_sign_t;

 static void bench_sign_setup(void* arg) {
+    int i;
    bench_sign_t *data = (bench_sign_t*)arg;

-    for (int i = 0; i < 32; i++) data->msg[i] = i + 1;
-    for (int i = 0; i < 32; i++) data->key[i] = i + 65;
+    for (i = 0; i < 32; i++) data->msg[i] = i + 1;
+    for (i = 0; i < 32; i++) data->key[i] = i + 65;
 }

 static void bench_sign(void* arg) {
+    int i;
    bench_sign_t *data = (bench_sign_t*)arg;

    unsigned char sig[64];
-    for (int i=0; i<20000; i++) {
+    for (i = 0; i < 20000; i++) {
+        int j;
        int recid = 0;
        CHECK(secp256k1_ecdsa_sign_compact(data->msg, sig, data->key, NULL, NULL, &recid));
-        for (int j = 0; j < 32; j++) {
+        for (j = 0; j < 32; j++) {
            data->msg[j] = sig[j];             /* Move former R to message. */
            data->key[j] = sig[j + 32];        /* Move former S to key.     */
        }
@ -35,10 +38,10 @@ static void bench_sign(void* arg) {
 }

 int main(void) {
+    bench_sign_t data;
    secp256k1_start(SECP256K1_START_SIGN);

-    bench_sign_t data;
-    run_benchmark(bench_sign, bench_sign_setup, NULL, &data, 10, 20000);
+    run_benchmark("ecdsa_sign", bench_sign, bench_sign_setup, NULL, &data, 10, 20000);

    secp256k1_stop();
    return 0;
--- a/src/bench_verify.c
+++ b/src/bench_verify.c
@ -21,9 +21,10 @@ typedef struct {
 } benchmark_verify_t;

 static void benchmark_verify(void* arg) {
+    int i;
    benchmark_verify_t* data = (benchmark_verify_t*)arg;

-    for (int i=0; i<20000; i++) {
+    for (i = 0; i < 20000; i++) {
        data->sig[data->siglen - 1] ^= (i & 0xFF);
        data->sig[data->siglen - 2] ^= ((i >> 8) & 0xFF);
        data->sig[data->siglen - 3] ^= ((i >> 16) & 0xFF);
@ -35,18 +36,19 @@ static void benchmark_verify(void* arg) {
 }

 int main(void) {
-    secp256k1_start(SECP256K1_START_VERIFY | SECP256K1_START_SIGN);
-
+    int i;
    benchmark_verify_t data;

-    for (int i = 0; i < 32; i++) data.msg[i] = 1 + i;
-    for (int i = 0; i < 32; i++) data.key[i] = 33 + i;
+    secp256k1_start(SECP256K1_START_VERIFY | SECP256K1_START_SIGN);
+
+    for (i = 0; i < 32; i++) data.msg[i] = 1 + i;
+    for (i = 0; i < 32; i++) data.key[i] = 33 + i;
    data.siglen = 72;
    secp256k1_ecdsa_sign(data.msg, data.sig, &data.siglen, data.key, NULL, NULL);
    data.pubkeylen = 33;
    CHECK(secp256k1_ec_pubkey_create(data.pubkey, &data.pubkeylen, data.key, 1));

-    run_benchmark(benchmark_verify, NULL, NULL, &data, 10, 20000);
+    run_benchmark("ecdsa_verify", benchmark_verify, NULL, NULL, &data, 10, 20000);

    secp256k1_stop();
    return 0;
--- a/src/ecdsa.h
+++ b/src/ecdsa.h
@ -10,9 +10,6 @@
 #include "scalar.h"
 #include "group.h"

-static void secp256k1_ecsda_start(void);
-static void secp256k1_ecdsa_stop(void);
-
 typedef struct {
    secp256k1_scalar_t r, s;
 } secp256k1_ecdsa_sig_t;
@ -22,6 +19,5 @@ static int secp256k1_ecdsa_sig_serialize(unsigned char *sig, int *size, const se
 static int secp256k1_ecdsa_sig_verify(const secp256k1_ecdsa_sig_t *sig, const secp256k1_ge_t *pubkey, const secp256k1_scalar_t *message);
 static int secp256k1_ecdsa_sig_sign(secp256k1_ecdsa_sig_t *sig, const secp256k1_scalar_t *seckey, const secp256k1_scalar_t *message, const secp256k1_scalar_t *nonce, int *recid);
 static int secp256k1_ecdsa_sig_recover(const secp256k1_ecdsa_sig_t *sig, secp256k1_ge_t *pubkey, const secp256k1_scalar_t *message, int recid);
-static void secp256k1_ecdsa_sig_set_rs(secp256k1_ecdsa_sig_t *sig, const secp256k1_scalar_t *r, const secp256k1_scalar_t *s);

 #endif
--- a/src/ecdsa_impl.h
+++ b/src/ecdsa_impl.h
@ -15,71 +15,69 @@
 #include "ecmult_gen.h"
 #include "ecdsa.h"

-typedef struct {
-    secp256k1_fe_t order_as_fe;
-    secp256k1_fe_t p_minus_order;
-} secp256k1_ecdsa_consts_t;
-
-static const secp256k1_ecdsa_consts_t *secp256k1_ecdsa_consts = NULL;
-
-static void secp256k1_ecdsa_start(void) {
-    if (secp256k1_ecdsa_consts != NULL)
-        return;
-
-    /* Allocate. */
-    secp256k1_ecdsa_consts_t *ret = (secp256k1_ecdsa_consts_t*)checked_malloc(sizeof(secp256k1_ecdsa_consts_t));
-
-    static const unsigned char order[] = {
-        0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
-        0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFE,
-        0xBA,0xAE,0xDC,0xE6,0xAF,0x48,0xA0,0x3B,
-        0xBF,0xD2,0x5E,0x8C,0xD0,0x36,0x41,0x41
-    };
-
-    secp256k1_fe_set_b32(&ret->order_as_fe, order);
-    secp256k1_fe_negate(&ret->p_minus_order, &ret->order_as_fe, 1);
-    secp256k1_fe_normalize_var(&ret->p_minus_order);
-
-    /* Set the global pointer. */
-    secp256k1_ecdsa_consts = ret;
-}
-
-static void secp256k1_ecdsa_stop(void) {
-    if (secp256k1_ecdsa_consts == NULL)
-        return;
-
-    secp256k1_ecdsa_consts_t *c = (secp256k1_ecdsa_consts_t*)secp256k1_ecdsa_consts;
-    secp256k1_ecdsa_consts = NULL;
-    free(c);
-}
+/** Group order for secp256k1 defined as 'n' in "Standards for Efficient Cryptography" (SEC2) 2.7.1
+ *  sage: for t in xrange(1023, -1, -1):
+ *     ..   p = 2**256 - 2**32 - t
+ *     ..   if p.is_prime():
+ *     ..     print '%x'%p
+ *     ..     break
+ *   'fffffffffffffffffffffffffffffffffffffffffffffffffffffffefffffc2f'
+ *  sage: a = 0
+ *  sage: b = 7
+ *  sage: F = FiniteField (p)
+ *  sage: '%x' % (EllipticCurve ([F (a), F (b)]).order())
+ *   'fffffffffffffffffffffffffffffffebaaedce6af48a03bbfd25e8cd0364141'
+ */
+static const secp256k1_fe_t secp256k1_ecdsa_const_order_as_fe = SECP256K1_FE_CONST(
+    0xFFFFFFFFUL, 0xFFFFFFFFUL, 0xFFFFFFFFUL, 0xFFFFFFFEUL,
+    0xBAAEDCE6UL, 0xAF48A03BUL, 0xBFD25E8CUL, 0xD0364141UL
+);
+
+/** Difference between the field size 'p' and the group order 'n', both defined in
+ *  "Standards for Efficient Cryptography" (SEC2) 2.7.1.
+ *  sage: p = 0xFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFEFFFFFC2F
+ *  sage: a = 0
+ *  sage: b = 7
+ *  sage: F = FiniteField (p)
+ *  sage: '%x' % (p - EllipticCurve ([F (a), F (b)]).order())
+ *   '14551231950b75fc4402da1722fc9baee'
+ */
+static const secp256k1_fe_t secp256k1_ecdsa_const_p_minus_order = SECP256K1_FE_CONST(
+    0, 0, 0, 1, 0x45512319UL, 0x50B75FC4UL, 0x402DA172UL, 0x2FC9BAEEUL
+);

 static int secp256k1_ecdsa_sig_parse(secp256k1_ecdsa_sig_t *r, const unsigned char *sig, int size) {
+    unsigned char ra[32] = {0}, sa[32] = {0};
+    const unsigned char *rp;
+    const unsigned char *sp;
+    int lenr;
+    int lens;
+    int overflow;
    if (sig[0] != 0x30) return 0;
-    int lenr = sig[3];
+    lenr = sig[3];
    if (5+lenr >= size) return 0;
-    int lens = sig[lenr+5];
+    lens = sig[lenr+5];
    if (sig[1] != lenr+lens+4) return 0;
    if (lenr+lens+6 > size) return 0;
    if (sig[2] != 0x02) return 0;
    if (lenr == 0) return 0;
    if (sig[lenr+4] != 0x02) return 0;
    if (lens == 0) return 0;
-    const unsigned char *sp = sig + 6 + lenr;
+    sp = sig + 6 + lenr;
    while (lens > 0 && sp[0] == 0) {
        lens--;
        sp++;
    }
    if (lens > 32) return 0;
-    const unsigned char *rp = sig + 4;
+    rp = sig + 4;
    while (lenr > 0 && rp[0] == 0) {
        lenr--;
        rp++;
    }
    if (lenr > 32) return 0;
-    unsigned char ra[32] = {0}, sa[32] = {0};
    memcpy(ra + 32 - lenr, rp, lenr);
    memcpy(sa + 32 - lens, sp, lens);
-    int overflow = 0;
+    overflow = 0;
    secp256k1_scalar_set_b32(&r->r, ra, &overflow);
    if (overflow) return 0;
    secp256k1_scalar_set_b32(&r->s, sa, &overflow);
@ -89,10 +87,10 @@ static int secp256k1_ecdsa_sig_parse(secp256k1_ecdsa_sig_t *r, const unsigned ch

 static int secp256k1_ecdsa_sig_serialize(unsigned char *sig, int *size, const secp256k1_ecdsa_sig_t *a) {
    unsigned char r[33] = {0}, s[33] = {0};
-    secp256k1_scalar_get_b32(&r[1], &a->r);
-    secp256k1_scalar_get_b32(&s[1], &a->s);
    unsigned char *rp = r, *sp = s;
    int lenR = 33, lenS = 33;
+    secp256k1_scalar_get_b32(&r[1], &a->r);
+    secp256k1_scalar_get_b32(&s[1], &a->s);
    while (lenR > 1 && rp[0] == 0 && rp[1] < 0x80) { lenR--; rp++; }
    while (lenS > 1 && sp[0] == 0 && sp[1] < 0x80) { lenS--; sp++; }
    if (*size < 6+lenS+lenR)
@ -110,93 +108,100 @@ static int secp256k1_ecdsa_sig_serialize(unsigned char *sig, int *size, const se
 }

 static int secp256k1_ecdsa_sig_verify(const secp256k1_ecdsa_sig_t *sig, const secp256k1_ge_t *pubkey, const secp256k1_scalar_t *message) {
+    unsigned char c[32];
+    secp256k1_scalar_t sn, u1, u2;
+    secp256k1_fe_t xr;
+    secp256k1_gej_t pubkeyj;
+    secp256k1_gej_t pr;
+
    if (secp256k1_scalar_is_zero(&sig->r) || secp256k1_scalar_is_zero(&sig->s))
        return 0;

-    secp256k1_scalar_t sn, u1, u2;
    secp256k1_scalar_inverse_var(&sn, &sig->s);
    secp256k1_scalar_mul(&u1, &sn, message);
    secp256k1_scalar_mul(&u2, &sn, &sig->r);
-    secp256k1_gej_t pubkeyj; secp256k1_gej_set_ge(&pubkeyj, pubkey);
-    secp256k1_gej_t pr; secp256k1_ecmult(&pr, &pubkeyj, &u2, &u1);
+    secp256k1_gej_set_ge(&pubkeyj, pubkey);
+    secp256k1_ecmult(&pr, &pubkeyj, &u2, &u1);
    if (secp256k1_gej_is_infinity(&pr)) {
        return 0;
    }
-    unsigned char c[32];
    secp256k1_scalar_get_b32(c, &sig->r);
-    secp256k1_fe_t xr;
    secp256k1_fe_set_b32(&xr, c);

-    // We now have the recomputed R point in pr, and its claimed x coordinate (modulo n)
-    // in xr. Naively, we would extract the x coordinate from pr (requiring a inversion modulo p),
-    // compute the remainder modulo n, and compare it to xr. However:
-    //
-    //       xr == X(pr) mod n
-    //   <=> exists h. (xr + h * n < p && xr + h * n == X(pr))
-    //   [Since 2 * n > p, h can only be 0 or 1]
-    //   <=> (xr == X(pr)) || (xr + n < p && xr + n == X(pr))
-    //   [In Jacobian coordinates, X(pr) is pr.x / pr.z^2 mod p]
-    //   <=> (xr == pr.x / pr.z^2 mod p) || (xr + n < p && xr + n == pr.x / pr.z^2 mod p)
-    //   [Multiplying both sides of the equations by pr.z^2 mod p]
-    //   <=> (xr * pr.z^2 mod p == pr.x) || (xr + n < p && (xr + n) * pr.z^2 mod p == pr.x)
-    //
-    // Thus, we can avoid the inversion, but we have to check both cases separately.
-    // secp256k1_gej_eq_x implements the (xr * pr.z^2 mod p == pr.x) test.
+    /** We now have the recomputed R point in pr, and its claimed x coordinate (modulo n)
+     *  in xr. Naively, we would extract the x coordinate from pr (requiring an inversion modulo p),
+     *  compute the remainder modulo n, and compare it to xr. However:
+     *
+     *        xr == X(pr) mod n
+     *    <=> exists h. (xr + h * n < p && xr + h * n == X(pr))
+     *    [Since 2 * n > p, h can only be 0 or 1]
+     *    <=> (xr == X(pr)) || (xr + n < p && xr + n == X(pr))
+     *    [In Jacobian coordinates, X(pr) is pr.x / pr.z^2 mod p]
+     *    <=> (xr == pr.x / pr.z^2 mod p) || (xr + n < p && xr + n == pr.x / pr.z^2 mod p)
+     *    [Multiplying both sides of the equations by pr.z^2 mod p]
+     *    <=> (xr * pr.z^2 mod p == pr.x) || (xr + n < p && (xr + n) * pr.z^2 mod p == pr.x)
+     *
+     *  Thus, we can avoid the inversion, but we have to check both cases separately.
+     *  secp256k1_gej_eq_x_var implements the (xr * pr.z^2 mod p == pr.x) test.
+     */
    if (secp256k1_gej_eq_x_var(&xr, &pr)) {
-        // xr.x == xr * xr.z^2 mod p, so the signature is valid.
+        /* xr * pr.z^2 mod p == pr.x, so the signature is valid. */
        return 1;
    }
-    if (secp256k1_fe_cmp_var(&xr, &secp256k1_ecdsa_consts->p_minus_order) >= 0) {
-        // xr + p >= n, so we can skip testing the second case.
+    if (secp256k1_fe_cmp_var(&xr, &secp256k1_ecdsa_const_p_minus_order) >= 0) {
+        /* xr + n >= p, so we can skip testing the second case. */
        return 0;
    }
-    secp256k1_fe_add(&xr, &secp256k1_ecdsa_consts->order_as_fe);
+    secp256k1_fe_add(&xr, &secp256k1_ecdsa_const_order_as_fe);
    if (secp256k1_gej_eq_x_var(&xr, &pr)) {
-        // (xr + n) * pr.z^2 mod p == pr.x, so the signature is valid.
+        /* (xr + n) * pr.z^2 mod p == pr.x, so the signature is valid. */
        return 1;
    }
    return 0;
 }

 static int secp256k1_ecdsa_sig_recover(const secp256k1_ecdsa_sig_t *sig, secp256k1_ge_t *pubkey, const secp256k1_scalar_t *message, int recid) {
+    unsigned char brx[32];
+    secp256k1_fe_t fx;
+    secp256k1_ge_t x;
+    secp256k1_gej_t xj;
+    secp256k1_scalar_t rn, u1, u2;
+    secp256k1_gej_t qj;
+
    if (secp256k1_scalar_is_zero(&sig->r) || secp256k1_scalar_is_zero(&sig->s))
        return 0;

-    unsigned char brx[32];
    secp256k1_scalar_get_b32(brx, &sig->r);
-    secp256k1_fe_t fx;
    VERIFY_CHECK(secp256k1_fe_set_b32(&fx, brx)); /* brx comes from a scalar, so is less than the order; certainly less than p */
    if (recid & 2) {
-        if (secp256k1_fe_cmp_var(&fx, &secp256k1_ecdsa_consts->p_minus_order) >= 0)
+        if (secp256k1_fe_cmp_var(&fx, &secp256k1_ecdsa_const_p_minus_order) >= 0)
            return 0;
-        secp256k1_fe_add(&fx, &secp256k1_ecdsa_consts->order_as_fe);
+        secp256k1_fe_add(&fx, &secp256k1_ecdsa_const_order_as_fe);
    }
-    secp256k1_ge_t x;
    if (!secp256k1_ge_set_xo_var(&x, &fx, recid & 1))
        return 0;
-    secp256k1_gej_t xj;
    secp256k1_gej_set_ge(&xj, &x);
-    secp256k1_scalar_t rn, u1, u2;
    secp256k1_scalar_inverse_var(&rn, &sig->r);
    secp256k1_scalar_mul(&u1, &rn, message);
    secp256k1_scalar_negate(&u1, &u1);
    secp256k1_scalar_mul(&u2, &rn, &sig->s);
-    secp256k1_gej_t qj;
    secp256k1_ecmult(&qj, &xj, &u2, &u1);
    secp256k1_ge_set_gej_var(pubkey, &qj);
    return !secp256k1_gej_is_infinity(&qj);
 }

 static int secp256k1_ecdsa_sig_sign(secp256k1_ecdsa_sig_t *sig, const secp256k1_scalar_t *seckey, const secp256k1_scalar_t *message, const secp256k1_scalar_t *nonce, int *recid) {
+    unsigned char b[32];
    secp256k1_gej_t rp;
-    secp256k1_ecmult_gen(&rp, nonce);
    secp256k1_ge_t r;
+    secp256k1_scalar_t n;
+    int overflow = 0;
+
+    secp256k1_ecmult_gen(&rp, nonce);
    secp256k1_ge_set_gej(&r, &rp);
-    unsigned char b[32];
    secp256k1_fe_normalize(&r.x);
    secp256k1_fe_normalize(&r.y);
    secp256k1_fe_get_b32(b, &r.x);
-    int overflow = 0;
    secp256k1_scalar_set_b32(&sig->r, b, &overflow);
    if (secp256k1_scalar_is_zero(&sig->r)) {
        /* P.x = order is on the curve, so technically sig->r could end up zero, which would be an invalid signature. */
@ -206,7 +211,6 @@ static int secp256k1_ecdsa_sig_sign(secp256k1_ecdsa_sig_t *sig, const secp256k1_
    }
    if (recid)
        *recid = (overflow ? 2 : 0) | (secp256k1_fe_is_odd(&r.y) ? 1 : 0);
-    secp256k1_scalar_t n;
    secp256k1_scalar_mul(&n, &sig->r, seckey);
    secp256k1_scalar_add(&n, &n, message);
    secp256k1_scalar_inverse(&sig->s, nonce);
@ -224,9 +228,4 @@ static int secp256k1_ecdsa_sig_sign(secp256k1_ecdsa_sig_t *sig, const secp256k1_
    return 1;
 }

-static void secp256k1_ecdsa_sig_set_rs(secp256k1_ecdsa_sig_t *sig, const secp256k1_scalar_t *r, const secp256k1_scalar_t *s) {
-    sig->r = *r;
-    sig->s = *s;
-}
-
 #endif
--- a/src/eckey_impl.h
+++ b/src/eckey_impl.h
@ -51,13 +51,16 @@ static int secp256k1_eckey_pubkey_serialize(secp256k1_ge_t *elem, unsigned char
 }

 static int secp256k1_eckey_privkey_parse(secp256k1_scalar_t *key, const unsigned char *privkey, int privkeylen) {
+    unsigned char c[32] = {0};
    const unsigned char *end = privkey + privkeylen;
+    int lenb = 0;
+    int len = 0;
+    int overflow = 0;
    /* sequence header */
    if (end < privkey+1 || *privkey != 0x30)
        return 0;
    privkey++;
    /* sequence length constructor */
-    int lenb = 0;
    if (end < privkey+1 || !(*privkey & 0x80))
        return 0;
    lenb = *privkey & ~0x80; privkey++;
@ -66,7 +69,6 @@ static int secp256k1_eckey_privkey_parse(secp256k1_scalar_t *key, const unsigned
    if (end < privkey+lenb)
        return 0;
    /* sequence length */
-    int len = 0;
    len = privkey[lenb-1] | (lenb > 1 ? privkey[lenb-2] << 8 : 0);
    privkey += lenb;
    if (end < privkey+len)
@ -78,8 +80,6 @@ static int secp256k1_eckey_privkey_parse(secp256k1_scalar_t *key, const unsigned
    /* sequence element 1: octet string, up to 32 bytes */
    if (end < privkey+2 || privkey[0] != 0x04 || privkey[1] > 0x20 || end < privkey+2+privkey[1])
        return 0;
-    int overflow = 0;
-    unsigned char c[32] = {0};
    memcpy(c + 32 - privkey[1], privkey + 2, privkey[1]);
    secp256k1_scalar_set_b32(key, c, &overflow);
    memset(c, 0, 32);
@ -88,8 +88,9 @@ static int secp256k1_eckey_privkey_parse(secp256k1_scalar_t *key, const unsigned

 static int secp256k1_eckey_privkey_serialize(unsigned char *privkey, int *privkeylen, const secp256k1_scalar_t *key, int compressed) {
    secp256k1_gej_t rp;
-    secp256k1_ecmult_gen(&rp, key);
    secp256k1_ge_t r;
+    int pubkeylen = 0;
+    secp256k1_ecmult_gen(&rp, key);
    secp256k1_ge_set_gej(&r, &rp);
    if (compressed) {
        static const unsigned char begin[] = {
@ -110,7 +111,6 @@ static int secp256k1_eckey_privkey_serialize(unsigned char *privkey, int *privke
        memcpy(ptr, begin, sizeof(begin)); ptr += sizeof(begin);
        secp256k1_scalar_get_b32(ptr, key); ptr += 32;
        memcpy(ptr, middle, sizeof(middle)); ptr += sizeof(middle);
-        int pubkeylen = 0;
        if (!secp256k1_eckey_pubkey_serialize(&r, ptr, &pubkeylen, 1)) {
            return 0;
        }
@ -137,7 +137,6 @@ static int secp256k1_eckey_privkey_serialize(unsigned char *privkey, int *privke
        memcpy(ptr, begin, sizeof(begin)); ptr += sizeof(begin);
        secp256k1_scalar_get_b32(ptr, key); ptr += 32;
        memcpy(ptr, middle, sizeof(middle)); ptr += sizeof(middle);
-        int pubkeylen = 0;
        if (!secp256k1_eckey_pubkey_serialize(&r, ptr, &pubkeylen, 0)) {
            return 0;
        }
@ -156,8 +155,8 @@ static int secp256k1_eckey_privkey_tweak_add(secp256k1_scalar_t *key, const secp

 static int secp256k1_eckey_pubkey_tweak_add(secp256k1_ge_t *key, const secp256k1_scalar_t *tweak) {
    secp256k1_gej_t pt;
-    secp256k1_gej_set_ge(&pt, key);
    secp256k1_scalar_t one;
+    secp256k1_gej_set_ge(&pt, key);
    secp256k1_scalar_set_int(&one, 1);
    secp256k1_ecmult(&pt, &pt, &one, tweak);

@ -176,12 +175,12 @@ static int secp256k1_eckey_privkey_tweak_mul(secp256k1_scalar_t *key, const secp
 }

 static int secp256k1_eckey_pubkey_tweak_mul(secp256k1_ge_t *key, const secp256k1_scalar_t *tweak) {
+    secp256k1_scalar_t zero;
+    secp256k1_gej_t pt;
    if (secp256k1_scalar_is_zero(tweak))
        return 0;

-    secp256k1_scalar_t zero;
    secp256k1_scalar_set_int(&zero, 0);
-    secp256k1_gej_t pt;
    secp256k1_gej_set_ge(&pt, key);
    secp256k1_ecmult(&pt, &pt, tweak, &zero);
    secp256k1_ge_set_gej(key, &pt);
--- a/src/ecmult_gen_impl.h
+++ b/src/ecmult_gen_impl.h
@ -24,49 +24,53 @@ typedef struct {
     * None of the resulting prec group elements have a known scalar, and neither do any of
     * the intermediate sums while computing a*G.
     */
-    secp256k1_fe_t prec[64][16][2]; /* prec[j][i] = (16^j * i * G + U_i).{x,y} */
+    secp256k1_ge_storage_t prec[64][16]; /* prec[j][i] = 16^j * i * G + U_i */
 } secp256k1_ecmult_gen_consts_t;

 static const secp256k1_ecmult_gen_consts_t *secp256k1_ecmult_gen_consts = NULL;

 static void secp256k1_ecmult_gen_start(void) {
+    secp256k1_ge_t prec[1024];
+    secp256k1_gej_t gj;
+    secp256k1_gej_t nums_gej;
+    secp256k1_ecmult_gen_consts_t *ret;
+    int i, j;
    if (secp256k1_ecmult_gen_consts != NULL)
        return;

    /* Allocate the precomputation table. */
-    secp256k1_ecmult_gen_consts_t *ret = (secp256k1_ecmult_gen_consts_t*)checked_malloc(sizeof(secp256k1_ecmult_gen_consts_t));
+    ret = (secp256k1_ecmult_gen_consts_t*)checked_malloc(sizeof(secp256k1_ecmult_gen_consts_t));

    /* get the generator */
-    const secp256k1_ge_t *g = &secp256k1_ge_consts->g;
-    secp256k1_gej_t gj; secp256k1_gej_set_ge(&gj, g);
+    secp256k1_gej_set_ge(&gj, &secp256k1_ge_const_g);

    /* Construct a group element with no known corresponding scalar (nothing up my sleeve). */
-    secp256k1_gej_t nums_gej;
    {
-        static const unsigned char nums_b32[32] = "The scalar for this x is unknown";
+        static const unsigned char nums_b32[33] = "The scalar for this x is unknown";
        secp256k1_fe_t nums_x;
-        VERIFY_CHECK(secp256k1_fe_set_b32(&nums_x, nums_b32));
        secp256k1_ge_t nums_ge;
+        VERIFY_CHECK(secp256k1_fe_set_b32(&nums_x, nums_b32));
        VERIFY_CHECK(secp256k1_ge_set_xo_var(&nums_ge, &nums_x, 0));
        secp256k1_gej_set_ge(&nums_gej, &nums_ge);
        /* Add G to make the bits in x uniformly distributed. */
-        secp256k1_gej_add_ge_var(&nums_gej, &nums_gej, g);
+        secp256k1_gej_add_ge_var(&nums_gej, &nums_gej, &secp256k1_ge_const_g);
    }

    /* compute prec. */
-    secp256k1_ge_t prec[1024];
    {
        secp256k1_gej_t precj[1024]; /* Jacobian versions of prec. */
-        secp256k1_gej_t gbase; gbase = gj; /* 16^j * G */
-        secp256k1_gej_t numsbase; numsbase = nums_gej; /* 2^j * nums. */
-        for (int j=0; j<64; j++) {
+        secp256k1_gej_t gbase;
+        secp256k1_gej_t numsbase;
+        gbase = gj; /* 16^j * G */
+        numsbase = nums_gej; /* 2^j * nums. */
+        for (j = 0; j < 64; j++) {
            /* Set precj[j*16 .. j*16+15] to (numsbase, numsbase + gbase, ..., numsbase + 15*gbase). */
            precj[j*16] = numsbase;
-            for (int i=1; i<16; i++) {
+            for (i = 1; i < 16; i++) {
                secp256k1_gej_add_var(&precj[j*16 + i], &precj[j*16 + i - 1], &gbase);
            }
            /* Multiply gbase by 16. */
-            for (int i=0; i<4; i++) {
+            for (i = 0; i < 4; i++) {
                secp256k1_gej_double_var(&gbase, &gbase);
            }
            /* Multiply numbase by 2. */
@ -79,11 +83,9 @@ static void secp256k1_ecmult_gen_start(void) {
        }
        secp256k1_ge_set_all_gej_var(1024, prec, precj);
    }
-    for (int j=0; j<64; j++) {
-        for (int i=0; i<16; i++) {
-            VERIFY_CHECK(!secp256k1_ge_is_infinity(&prec[j*16 + i]));
-            ret->prec[j][i][0] = prec[j*16 + i].x;
-            ret->prec[j][i][1] = prec[j*16 + i].y;
+    for (j = 0; j < 64; j++) {
+        for (i = 0; i < 16; i++) {
+            secp256k1_ge_to_storage(&ret->prec[j][i], &prec[j*16 + i]);
        }
    }

@ -92,26 +94,29 @@ static void secp256k1_ecmult_gen_start(void) {
 }

 static void secp256k1_ecmult_gen_stop(void) {
+    secp256k1_ecmult_gen_consts_t *c;
    if (secp256k1_ecmult_gen_consts == NULL)
        return;

-    secp256k1_ecmult_gen_consts_t *c = (secp256k1_ecmult_gen_consts_t*)secp256k1_ecmult_gen_consts;
+    c = (secp256k1_ecmult_gen_consts_t*)secp256k1_ecmult_gen_consts;
    secp256k1_ecmult_gen_consts = NULL;
    free(c);
 }

 static void secp256k1_ecmult_gen(secp256k1_gej_t *r, const secp256k1_scalar_t *gn) {
    const secp256k1_ecmult_gen_consts_t *c = secp256k1_ecmult_gen_consts;
-    secp256k1_gej_set_infinity(r);
    secp256k1_ge_t add;
-    add.infinity = 0;
+    secp256k1_ge_storage_t adds;
    int bits;
-    for (int j=0; j<64; j++) {
+    int i, j;
+    secp256k1_gej_set_infinity(r);
+    add.infinity = 0;
+    for (j = 0; j < 64; j++) {
        bits = secp256k1_scalar_get_bits(gn, j * 4, 4);
-        for (int i=0; i<16; i++) {
-            secp256k1_fe_cmov(&add.x, &c->prec[j][i][0], i == bits);
-            secp256k1_fe_cmov(&add.y, &c->prec[j][i][1], i == bits);
+        for (i = 0; i < 16; i++) {
+            secp256k1_ge_storage_cmov(&adds, &c->prec[j][i], i == bits);
        }
+        secp256k1_ge_from_storage(&add, &adds);
        secp256k1_gej_add_ge(r, r, &add);
    }
    bits = 0;
--- a/src/ecmult_impl.h
+++ b/src/ecmult_impl.h
@ -37,22 +37,31 @@
 *  G is constant, so it only needs to be done once in advance.
 */
 static void secp256k1_ecmult_table_precomp_gej_var(secp256k1_gej_t *pre, const secp256k1_gej_t *a, int w) {
+    secp256k1_gej_t d;
+    int i;
    pre[0] = *a;
-    secp256k1_gej_t d; secp256k1_gej_double_var(&d, &pre[0]);
-    for (int i=1; i<(1 << (w-2)); i++)
+    secp256k1_gej_double_var(&d, &pre[0]);
+    for (i = 1; i < (1 << (w-2)); i++)
        secp256k1_gej_add_var(&pre[i], &d, &pre[i-1]);
 }

-static void secp256k1_ecmult_table_precomp_ge_var(secp256k1_ge_t *pre, const secp256k1_gej_t *a, int w) {
+static void secp256k1_ecmult_table_precomp_ge_storage_var(secp256k1_ge_storage_t *pre, const secp256k1_gej_t *a, int w) {
+    secp256k1_gej_t d;
+    int i;
    const int table_size = 1 << (w-2);
    secp256k1_gej_t *prej = checked_malloc(sizeof(secp256k1_gej_t) * table_size);
+    secp256k1_ge_t *prea = checked_malloc(sizeof(secp256k1_ge_t) * table_size);
    prej[0] = *a;
-    secp256k1_gej_t d; secp256k1_gej_double_var(&d, a);
-    for (int i=1; i<table_size; i++) {
+    secp256k1_gej_double_var(&d, a);
+    for (i = 1; i < table_size; i++) {
        secp256k1_gej_add_var(&prej[i], &d, &prej[i-1]);
    }
-    secp256k1_ge_set_all_gej_var(table_size, pre, prej);
+    secp256k1_ge_set_all_gej_var(table_size, prea, prej);
+    for (i = 0; i < table_size; i++) {
+        secp256k1_ge_to_storage(&pre[i], &prea[i]);
+    }
    free(prej);
+    free(prea);
 }

 /** The number of entries a table with precomputed multiples needs to have. */
@ -60,51 +69,63 @@ static void secp256k1_ecmult_table_precomp_ge_var(secp256k1_ge_t *pre, const sec

 /** The following two macro retrieves a particular odd multiple from a table
 *  of precomputed multiples. */
-#define ECMULT_TABLE_GET(r,pre,n,w,neg) do { \
+#define ECMULT_TABLE_GET_GEJ(r,pre,n,w) do { \
    VERIFY_CHECK(((n) & 1) == 1); \
    VERIFY_CHECK((n) >= -((1 << ((w)-1)) - 1)); \
    VERIFY_CHECK((n) <=  ((1 << ((w)-1)) - 1)); \
    if ((n) > 0) \
        *(r) = (pre)[((n)-1)/2]; \
    else \
-        (neg)((r), &(pre)[(-(n)-1)/2]); \
+        secp256k1_gej_neg((r), &(pre)[(-(n)-1)/2]); \
+} while(0)
+#define ECMULT_TABLE_GET_GE_STORAGE(r,pre,n,w) do { \
+    VERIFY_CHECK(((n) & 1) == 1); \
+    VERIFY_CHECK((n) >= -((1 << ((w)-1)) - 1)); \
+    VERIFY_CHECK((n) <=  ((1 << ((w)-1)) - 1)); \
+    if ((n) > 0) \
+        secp256k1_ge_from_storage((r), &(pre)[((n)-1)/2]); \
+    else {\
+        secp256k1_ge_from_storage((r), &(pre)[(-(n)-1)/2]); \
+        secp256k1_ge_neg((r), (r)); \
+    } \
 } while(0)
-
-#define ECMULT_TABLE_GET_GEJ(r,pre,n,w) ECMULT_TABLE_GET((r),(pre),(n),(w),secp256k1_gej_neg)
-#define ECMULT_TABLE_GET_GE(r,pre,n,w)  ECMULT_TABLE_GET((r),(pre),(n),(w),secp256k1_ge_neg)

 typedef struct {
    /* For accelerating the computation of a*P + b*G: */
-    secp256k1_ge_t pre_g[ECMULT_TABLE_SIZE(WINDOW_G)];    /* odd multiples of the generator */
+    secp256k1_ge_storage_t pre_g[ECMULT_TABLE_SIZE(WINDOW_G)];    /* odd multiples of the generator */
 #ifdef USE_ENDOMORPHISM
-    secp256k1_ge_t pre_g_128[ECMULT_TABLE_SIZE(WINDOW_G)]; /* odd multiples of 2^128*generator */
+    secp256k1_ge_storage_t pre_g_128[ECMULT_TABLE_SIZE(WINDOW_G)]; /* odd multiples of 2^128*generator */
 #endif
 } secp256k1_ecmult_consts_t;

 static const secp256k1_ecmult_consts_t *secp256k1_ecmult_consts = NULL;

 static void secp256k1_ecmult_start(void) {
+    secp256k1_gej_t gj;
+    secp256k1_ecmult_consts_t *ret;
    if (secp256k1_ecmult_consts != NULL)
        return;

    /* Allocate the precomputation table. */
-    secp256k1_ecmult_consts_t *ret = (secp256k1_ecmult_consts_t*)checked_malloc(sizeof(secp256k1_ecmult_consts_t));
+    ret = (secp256k1_ecmult_consts_t*)checked_malloc(sizeof(secp256k1_ecmult_consts_t));

    /* get the generator */
-    const secp256k1_ge_t *g = &secp256k1_ge_consts->g;
-    secp256k1_gej_t gj; secp256k1_gej_set_ge(&gj, g);
+    secp256k1_gej_set_ge(&gj, &secp256k1_ge_const_g);

-#ifdef USE_ENDOMORPHISM
-    /* calculate 2^128*generator */
-    secp256k1_gej_t g_128j = gj;
-    for (int i=0; i<128; i++)
-        secp256k1_gej_double_var(&g_128j, &g_128j);
-#endif

    /* precompute the tables with odd multiples */
-    secp256k1_ecmult_table_precomp_ge_var(ret->pre_g, &gj, WINDOW_G);
+    secp256k1_ecmult_table_precomp_ge_storage_var(ret->pre_g, &gj, WINDOW_G);
+
 #ifdef USE_ENDOMORPHISM
-    secp256k1_ecmult_table_precomp_ge_var(ret->pre_g_128, &g_128j, WINDOW_G);
+    {
+        secp256k1_gej_t g_128j;
+        int i;
+        /* calculate 2^128*generator */
+        g_128j = gj;
+        for (i = 0; i < 128; i++)
+            secp256k1_gej_double_var(&g_128j, &g_128j);
+        secp256k1_ecmult_table_precomp_ge_storage_var(ret->pre_g_128, &g_128j, WINDOW_G);
+    }
 #endif

    /* Set the global pointer to the precomputation table. */
@ -112,10 +133,11 @@ static void secp256k1_ecmult_start(void) {
 }

 static void secp256k1_ecmult_stop(void) {
+    secp256k1_ecmult_consts_t *c;
    if (secp256k1_ecmult_consts == NULL)
        return;

-    secp256k1_ecmult_consts_t *c = (secp256k1_ecmult_consts_t*)secp256k1_ecmult_consts;
+    c = (secp256k1_ecmult_consts_t*)secp256k1_ecmult_consts;
    secp256k1_ecmult_consts = NULL;
    free(c);
 }
@ -129,16 +151,18 @@ static void secp256k1_ecmult_stop(void) {
 */
 static int secp256k1_ecmult_wnaf(int *wnaf, const secp256k1_scalar_t *a, int w) {
    secp256k1_scalar_t s = *a;
-
+    int set_bits = 0;
+    int bit = 0;
    int sign = 1;
+
    if (secp256k1_scalar_get_bits(&s, 255, 1)) {
        secp256k1_scalar_negate(&s, &s);
        sign = -1;
    }

-    int set_bits = 0;
-    int bit = 0;
    while (bit < 256) {
+        int now;
+        int word;
        if (secp256k1_scalar_get_bits(&s, bit, 1) == 0) {
            bit++;
            continue;
@ -146,11 +170,11 @@ static int secp256k1_ecmult_wnaf(int *wnaf, const secp256k1_scalar_t *a, int w)
        while (set_bits < bit) {
            wnaf[set_bits++] = 0;
        }
-        int now = w;
+        now = w;
        if (bit + now > 256) {
            now = 256 - bit;
        }
-        int word = secp256k1_scalar_get_bits_var(&s, bit, now);
+        word = secp256k1_scalar_get_bits_var(&s, bit, now);
        if (word & (1 << (w-1))) {
            secp256k1_scalar_add_bit(&s, bit + w);
            wnaf[set_bits++] = sign * (word - (1 << w));
@ -163,58 +187,74 @@ static int secp256k1_ecmult_wnaf(int *wnaf, const secp256k1_scalar_t *a, int w)
 }

 static void secp256k1_ecmult(secp256k1_gej_t *r, const secp256k1_gej_t *a, const secp256k1_scalar_t *na, const secp256k1_scalar_t *ng) {
+    secp256k1_gej_t tmpj;
+    secp256k1_gej_t pre_a[ECMULT_TABLE_SIZE(WINDOW_A)];
+    secp256k1_ge_t tmpa;
    const secp256k1_ecmult_consts_t *c = secp256k1_ecmult_consts;
-
 #ifdef USE_ENDOMORPHISM
+    secp256k1_gej_t pre_a_lam[ECMULT_TABLE_SIZE(WINDOW_A)];
    secp256k1_scalar_t na_1, na_lam;
+    /* Split G factors. */
+    secp256k1_scalar_t ng_1, ng_128;
+    int wnaf_na_1[130];
+    int wnaf_na_lam[130];
+    int bits_na_1;
+    int bits_na_lam;
+    int wnaf_ng_1[129];
+    int bits_ng_1;
+    int wnaf_ng_128[129];
+    int bits_ng_128;
+#else
+    int wnaf_na[256];
+    int bits_na;
+    int wnaf_ng[257];
+    int bits_ng;
+#endif
+    int i;
+    int bits;
+
+#ifdef USE_ENDOMORPHISM
    /* split na into na_1 and na_lam (where na = na_1 + na_lam*lambda, and na_1 and na_lam are ~128 bit) */
    secp256k1_scalar_split_lambda_var(&na_1, &na_lam, na);

    /* build wnaf representation for na_1 and na_lam. */
-    int wnaf_na_1[130];   int bits_na_1   = secp256k1_ecmult_wnaf(wnaf_na_1,   &na_1,   WINDOW_A);
-    int wnaf_na_lam[130]; int bits_na_lam = secp256k1_ecmult_wnaf(wnaf_na_lam, &na_lam, WINDOW_A);
+    bits_na_1   = secp256k1_ecmult_wnaf(wnaf_na_1,   &na_1,   WINDOW_A);
+    bits_na_lam = secp256k1_ecmult_wnaf(wnaf_na_lam, &na_lam, WINDOW_A);
    VERIFY_CHECK(bits_na_1 <= 130);
    VERIFY_CHECK(bits_na_lam <= 130);
-    int bits = bits_na_1;
+    bits = bits_na_1;
    if (bits_na_lam > bits) bits = bits_na_lam;
 #else
    /* build wnaf representation for na. */
-    int wnaf_na[256];     int bits_na     = secp256k1_ecmult_wnaf(wnaf_na,     na,      WINDOW_A);
-    int bits = bits_na;
+    bits_na     = secp256k1_ecmult_wnaf(wnaf_na,     na,      WINDOW_A);
+    bits = bits_na;
 #endif

    /* calculate odd multiples of a */
-    secp256k1_gej_t pre_a[ECMULT_TABLE_SIZE(WINDOW_A)];
    secp256k1_ecmult_table_precomp_gej_var(pre_a, a, WINDOW_A);

 #ifdef USE_ENDOMORPHISM
-    secp256k1_gej_t pre_a_lam[ECMULT_TABLE_SIZE(WINDOW_A)];
-    for (int i=0; i<ECMULT_TABLE_SIZE(WINDOW_A); i++)
+    for (i = 0; i < ECMULT_TABLE_SIZE(WINDOW_A); i++)
        secp256k1_gej_mul_lambda(&pre_a_lam[i], &pre_a[i]);

-    /* Splitted G factors. */
-    secp256k1_scalar_t ng_1, ng_128;
-
    /* split ng into ng_1 and ng_128 (where gn = gn_1 + gn_128*2^128, and gn_1 and gn_128 are ~128 bit) */
    secp256k1_scalar_split_128(&ng_1, &ng_128, ng);

    /* Build wnaf representation for ng_1 and ng_128 */
-    int wnaf_ng_1[129];   int bits_ng_1   = secp256k1_ecmult_wnaf(wnaf_ng_1,   &ng_1,   WINDOW_G);
-    int wnaf_ng_128[129]; int bits_ng_128 = secp256k1_ecmult_wnaf(wnaf_ng_128, &ng_128, WINDOW_G);
+    bits_ng_1   = secp256k1_ecmult_wnaf(wnaf_ng_1,   &ng_1,   WINDOW_G);
+    bits_ng_128 = secp256k1_ecmult_wnaf(wnaf_ng_128, &ng_128, WINDOW_G);
    if (bits_ng_1 > bits) bits = bits_ng_1;
    if (bits_ng_128 > bits) bits = bits_ng_128;
 #else
-    int wnaf_ng[257];     int bits_ng     = secp256k1_ecmult_wnaf(wnaf_ng,     ng,      WINDOW_G);
+    bits_ng     = secp256k1_ecmult_wnaf(wnaf_ng,     ng,      WINDOW_G);
    if (bits_ng > bits) bits = bits_ng;
 #endif

    secp256k1_gej_set_infinity(r);
-    secp256k1_gej_t tmpj;
-    secp256k1_ge_t tmpa;

-    for (int i=bits-1; i>=0; i--) {
-        secp256k1_gej_double_var(r, r);
+    for (i = bits-1; i >= 0; i--) {
        int n;
+        secp256k1_gej_double_var(r, r);
 #ifdef USE_ENDOMORPHISM
        if (i < bits_na_1 && (n = wnaf_na_1[i])) {
            ECMULT_TABLE_GET_GEJ(&tmpj, pre_a, n, WINDOW_A);
@ -225,11 +265,11 @@ static void secp256k1_ecmult(secp256k1_gej_t *r, const secp256k1_gej_t *a, const
            secp256k1_gej_add_var(r, r, &tmpj);
        }
        if (i < bits_ng_1 && (n = wnaf_ng_1[i])) {
-            ECMULT_TABLE_GET_GE(&tmpa, c->pre_g, n, WINDOW_G);
+            ECMULT_TABLE_GET_GE_STORAGE(&tmpa, c->pre_g, n, WINDOW_G);
            secp256k1_gej_add_ge_var(r, r, &tmpa);
        }
        if (i < bits_ng_128 && (n = wnaf_ng_128[i])) {
-            ECMULT_TABLE_GET_GE(&tmpa, c->pre_g_128, n, WINDOW_G);
+            ECMULT_TABLE_GET_GE_STORAGE(&tmpa, c->pre_g_128, n, WINDOW_G);
            secp256k1_gej_add_ge_var(r, r, &tmpa);
        }
 #else
@ -238,7 +278,7 @@ static void secp256k1_ecmult(secp256k1_gej_t *r, const secp256k1_gej_t *a, const
            secp256k1_gej_add_var(r, r, &tmpj);
        }
        if (i < bits_ng && (n = wnaf_ng[i])) {
-            ECMULT_TABLE_GET_GE(&tmpa, c->pre_g, n, WINDOW_G);
+            ECMULT_TABLE_GET_GE_STORAGE(&tmpa, c->pre_g, n, WINDOW_G);
            secp256k1_gej_add_ge_var(r, r, &tmpa);
        }
 #endif
--- a/src/field.h
+++ b/src/field.h
@ -30,21 +30,6 @@
 #error "Please select field implementation"
 #endif

-typedef struct {
-#ifndef USE_NUM_NONE
-    secp256k1_num_t p;
-#endif
-    secp256k1_fe_t order;
-} secp256k1_fe_consts_t;
-
-static const secp256k1_fe_consts_t *secp256k1_fe_consts = NULL;
-
-/** Initialize field element precomputation data. */
-static void secp256k1_fe_start(void);
-
-/** Unload field element precomputation data. */
-static void secp256k1_fe_stop(void);
-
 /** Normalize a field element. */
 static void secp256k1_fe_normalize(secp256k1_fe_t *r);

@ -117,15 +102,15 @@ static void secp256k1_fe_inv_var(secp256k1_fe_t *r, const secp256k1_fe_t *a);
 /** Calculate the (modular) inverses of a batch of field elements. Requires the inputs' magnitudes to be
 *  at most 8. The output magnitudes are 1 (but not guaranteed to be normalized). The inputs and
 *  outputs must not overlap in memory. */
-static void secp256k1_fe_inv_all_var(size_t len, secp256k1_fe_t r[len], const secp256k1_fe_t a[len]);
+static void secp256k1_fe_inv_all_var(size_t len, secp256k1_fe_t *r, const secp256k1_fe_t *a);

-/** Convert a field element to a hexadecimal string. */
-static void secp256k1_fe_get_hex(char *r, int *rlen, const secp256k1_fe_t *a);
+/** Convert a field element to the storage type. */
+static void secp256k1_fe_to_storage(secp256k1_fe_storage_t *r, const secp256k1_fe_t*);

-/** Convert a hexadecimal string to a field element. */
-static int secp256k1_fe_set_hex(secp256k1_fe_t *r, const char *a, int alen);
+/** Convert a field element back from the storage type. */
+static void secp256k1_fe_from_storage(secp256k1_fe_t *r, const secp256k1_fe_storage_t*);

 /** If flag is true, set *r equal to *a; otherwise leave it. Constant-time. */
-static void secp256k1_fe_cmov(secp256k1_fe_t *r, const secp256k1_fe_t *a, int flag);
+static void secp256k1_fe_storage_cmov(secp256k1_fe_storage_t *r, const secp256k1_fe_storage_t *a, int flag);

 #endif
--- a/src/field_10x26.h
+++ b/src/field_10x26.h
@ -18,4 +18,30 @@ typedef struct {
 #endif
 } secp256k1_fe_t;

+/* Unpacks a constant, given as eight 32-bit words d7 (most significant) .. d0, into the initializer for ten 26-bit limbs: limb 0 is the low 26 bits of d0, limb 9 the top 22 bits of d7. The right-shifted parts are not masked, so each argument must fit in 32 bits. */
+#define SECP256K1_FE_CONST_INNER(d7, d6, d5, d4, d3, d2, d1, d0) { \
+    (d0) & 0x3FFFFFFUL, \
+    ((d0) >> 26) | ((d1) & 0xFFFFFUL) << 6, \
+    ((d1) >> 20) | ((d2) & 0x3FFFUL) << 12, \
+    ((d2) >> 14) | ((d3) & 0xFFUL) << 18, \
+    ((d3) >> 8) | ((d4) & 0x3) << 24, \
+    ((d4) >> 2) & 0x3FFFFFFUL, \
+    ((d4) >> 28) | ((d5) & 0x3FFFFFUL) << 4, \
+    ((d5) >> 22) | ((d6) & 0xFFFF) << 10, \
+    ((d6) >> 16) | ((d7) & 0x3FF) << 16, \
+    ((d7) >> 10) \
+}
+
+#ifdef VERIFY /* VERIFY builds append two extra initializers (1, 1) after the limb array -- presumably the magnitude and normalized fields; confirm against secp256k1_fe_t. */
+#define SECP256K1_FE_CONST(d7, d6, d5, d4, d3, d2, d1, d0) {SECP256K1_FE_CONST_INNER((d7), (d6), (d5), (d4), (d3), (d2), (d1), (d0)), 1, 1}
+#else
+#define SECP256K1_FE_CONST(d7, d6, d5, d4, d3, d2, d1, d0) {SECP256K1_FE_CONST_INNER((d7), (d6), (d5), (d4), (d3), (d2), (d1), (d0))}
+#endif
+
+typedef struct { /* Packed form of a field element: eight 32-bit words, n[0] least significant. */
+    uint32_t n[8];
+} secp256k1_fe_storage_t;
+
+#define SECP256K1_FE_STORAGE_CONST(d7, d6, d5, d4, d3, d2, d1, d0) {{ (d0), (d1), (d2), (d3), (d4), (d5), (d6), (d7) }} /* Arguments run from d7 down to d0 and are stored reversed, so d0 lands in n[0] and d7 in n[7]. */
+
 #endif
--- a/src/field_10x26_impl.h
+++ b/src/field_10x26_impl.h
@ -13,9 +13,6 @@
 #include "num.h"
 #include "field.h"

-static void secp256k1_fe_inner_start(void) {}
-static void secp256k1_fe_inner_stop(void) {}
-
 #ifdef VERIFY
 static void secp256k1_fe_verify(const secp256k1_fe_t *a) {
    const uint32_t *d = a->n;
@ -54,8 +51,8 @@ static void secp256k1_fe_normalize(secp256k1_fe_t *r) {
             t5 = r->n[5], t6 = r->n[6], t7 = r->n[7], t8 = r->n[8], t9 = r->n[9];

    /* Reduce t9 at the start so there will be at most a single carry from the first pass */
-    uint32_t x = t9 >> 22; t9 &= 0x03FFFFFUL;
    uint32_t m;
+    uint32_t x = t9 >> 22; t9 &= 0x03FFFFFUL;

    /* The first pass ensures the magnitude is 1, ... */
    t0 += x * 0x3D1UL; t1 += (x << 6);
@ -140,8 +137,8 @@ static void secp256k1_fe_normalize_var(secp256k1_fe_t *r) {
             t5 = r->n[5], t6 = r->n[6], t7 = r->n[7], t8 = r->n[8], t9 = r->n[9];

    /* Reduce t9 at the start so there will be at most a single carry from the first pass */
-    uint32_t x = t9 >> 22; t9 &= 0x03FFFFFUL;
    uint32_t m;
+    uint32_t x = t9 >> 22; t9 &= 0x03FFFFFUL;

    /* The first pass ensures the magnitude is 1, ... */
    t0 += x * 0x3D1UL; t1 += (x << 6);
@ -195,12 +192,12 @@ static int secp256k1_fe_normalizes_to_zero(secp256k1_fe_t *r) {
    uint32_t t0 = r->n[0], t1 = r->n[1], t2 = r->n[2], t3 = r->n[3], t4 = r->n[4],
             t5 = r->n[5], t6 = r->n[6], t7 = r->n[7], t8 = r->n[8], t9 = r->n[9];

-    /* Reduce t9 at the start so there will be at most a single carry from the first pass */
-    uint32_t x = t9 >> 22; t9 &= 0x03FFFFFUL;
-
    /* z0 tracks a possible raw value of 0, z1 tracks a possible raw value of P */
    uint32_t z0, z1;

+    /* Reduce t9 at the start so there will be at most a single carry from the first pass */
+    uint32_t x = t9 >> 22; t9 &= 0x03FFFFFUL;
+
    /* The first pass ensures the magnitude is 1, ... */
    t0 += x * 0x3D1UL; t1 += (x << 6);
    t1 += (t0 >> 26); t0 &= 0x3FFFFFFUL; z0  = t0; z1  = t0 ^ 0x3D0UL;
@ -221,23 +218,36 @@ static int secp256k1_fe_normalizes_to_zero(secp256k1_fe_t *r) {
 }

 static int secp256k1_fe_normalizes_to_zero_var(secp256k1_fe_t *r) {
-    uint32_t t0 = r->n[0], t9 = r->n[9];
+    uint32_t t0, t1, t2, t3, t4, t5, t6, t7, t8, t9;
+    uint32_t z0, z1;
+    uint32_t x;
+
+    t0 = r->n[0];
+    t9 = r->n[9];

    /* Reduce t9 at the start so there will be at most a single carry from the first pass */
-    uint32_t x = t9 >> 22;
+    x = t9 >> 22;

    /* The first pass ensures the magnitude is 1, ... */
    t0 += x * 0x3D1UL;

    /* z0 tracks a possible raw value of 0, z1 tracks a possible raw value of P */
-    uint32_t z0 = t0 & 0x3FFFFFFUL, z1 = z0 ^ 0x3D0UL;
+    z0 = t0 & 0x3FFFFFFUL;
+    z1 = z0 ^ 0x3D0UL;

    /* Fast return path should catch the majority of cases */
    if ((z0 != 0UL) & (z1 != 0x3FFFFFFUL))
        return 0;

-    uint32_t t1 = r->n[1], t2 = r->n[2], t3 = r->n[3], t4 = r->n[4],
-             t5 = r->n[5], t6 = r->n[6], t7 = r->n[7], t8 = r->n[8];
+    t1 = r->n[1];
+    t2 = r->n[2];
+    t3 = r->n[3];
+    t4 = r->n[4];
+    t5 = r->n[5];
+    t6 = r->n[6];
+    t7 = r->n[7];
+    t8 = r->n[8];
+
    t9 &= 0x03FFFFFUL;
    t1 += (x << 6);

@ -269,11 +279,11 @@ SECP256K1_INLINE static void secp256k1_fe_set_int(secp256k1_fe_t *r, int a) {
 }

 SECP256K1_INLINE static int secp256k1_fe_is_zero(const secp256k1_fe_t *a) {
+    const uint32_t *t = a->n;
 #ifdef VERIFY
    VERIFY_CHECK(a->normalized);
    secp256k1_fe_verify(a);
 #endif
-    const uint32_t *t = a->n;
    return (t[0] | t[1] | t[2] | t[3] | t[4] | t[5] | t[6] | t[7] | t[8] | t[9]) == 0;
 }

@ -286,23 +296,25 @@ SECP256K1_INLINE static int secp256k1_fe_is_odd(const secp256k1_fe_t *a) {
 }

 SECP256K1_INLINE static void secp256k1_fe_clear(secp256k1_fe_t *a) {
+    int i;
 #ifdef VERIFY
    a->magnitude = 0;
    a->normalized = 1;
 #endif
-    for (int i=0; i<10; i++) {
+    for (i=0; i<10; i++) {
        a->n[i] = 0;
    }
 }

 static int secp256k1_fe_cmp_var(const secp256k1_fe_t *a, const secp256k1_fe_t *b) {
+    int i;
 #ifdef VERIFY
    VERIFY_CHECK(a->normalized);
    VERIFY_CHECK(b->normalized);
    secp256k1_fe_verify(a);
    secp256k1_fe_verify(b);
 #endif
-    for (int i = 9; i >= 0; i--) {
+    for (i = 9; i >= 0; i--) {
        if (a->n[i] > b->n[i]) return 1;
        if (a->n[i] < b->n[i]) return -1;
    }
@ -310,10 +322,12 @@ static int secp256k1_fe_cmp_var(const secp256k1_fe_t *a, const secp256k1_fe_t *b
 }

 static int secp256k1_fe_set_b32(secp256k1_fe_t *r, const unsigned char *a) {
+    int i;
    r->n[0] = r->n[1] = r->n[2] = r->n[3] = r->n[4] = 0;
    r->n[5] = r->n[6] = r->n[7] = r->n[8] = r->n[9] = 0;
-    for (int i=0; i<32; i++) {
-        for (int j=0; j<4; j++) {
+    for (i=0; i<32; i++) {
+        int j;
+        for (j=0; j<4; j++) {
            int limb = (8*i+2*j)/26;
            int shift = (8*i+2*j)%26;
            r->n[limb] |= (uint32_t)((a[31-i] >> (2*j)) & 0x3) << shift;
@ -332,13 +346,15 @@ static int secp256k1_fe_set_b32(secp256k1_fe_t *r, const unsigned char *a) {

 /** Convert a field element to a 32-byte big endian value. Requires the input to be normalized */
 static void secp256k1_fe_get_b32(unsigned char *r, const secp256k1_fe_t *a) {
+    int i;
 #ifdef VERIFY
    VERIFY_CHECK(a->normalized);
    secp256k1_fe_verify(a);
 #endif
-    for (int i=0; i<32; i++) {
+    for (i=0; i<32; i++) {
+        int j;
        int c = 0;
-        for (int j=0; j<4; j++) {
+        for (j=0; j<4; j++) {
            int limb = (8*i+2*j)/26;
            int shift = (8*i+2*j)%26;
            c |= ((a->n[limb] >> shift) & 0x3) << (2 * j);
@ -415,6 +431,11 @@ SECP256K1_INLINE static void secp256k1_fe_add(secp256k1_fe_t *r, const secp256k1
 #endif

 SECP256K1_INLINE static void secp256k1_fe_mul_inner(uint32_t *r, const uint32_t *a, const uint32_t * SECP256K1_RESTRICT b) {
+    uint64_t c, d;
+    uint64_t u0, u1, u2, u3, u4, u5, u6, u7, u8;
+    uint32_t t9, t1, t0, t2, t3, t4, t5, t6, t7;
+    const uint32_t M = 0x3FFFFFFUL, R0 = 0x3D10UL, R1 = 0x400UL;
+
    VERIFY_BITS(a[0], 30);
    VERIFY_BITS(a[1], 30);
    VERIFY_BITS(a[2], 30);
@ -436,14 +457,11 @@ SECP256K1_INLINE static void secp256k1_fe_mul_inner(uint32_t *r, const uint32_t
    VERIFY_BITS(b[8], 30);
    VERIFY_BITS(b[9], 26);

-    const uint32_t M = 0x3FFFFFFUL, R0 = 0x3D10UL, R1 = 0x400UL;
    /** [... a b c] is a shorthand for ... + a<<52 + b<<26 + c<<0 mod n.
     *  px is a shorthand for sum(a[i]*b[x-i], i=0..x).
     *  Note that [x 0 0 0 0 0 0 0 0 0 0] = [x*R1 x*R0].
     */

-    uint64_t c, d;
-
    d  = (uint64_t)a[0] * b[9]
       + (uint64_t)a[1] * b[8]
       + (uint64_t)a[2] * b[7]
@ -456,7 +474,7 @@ SECP256K1_INLINE static void secp256k1_fe_mul_inner(uint32_t *r, const uint32_t
       + (uint64_t)a[9] * b[0];
    /* VERIFY_BITS(d, 64); */
    /* [d 0 0 0 0 0 0 0 0 0] = [p9 0 0 0 0 0 0 0 0 0] */
-    uint32_t t9 = d & M; d >>= 26;
+    t9 = d & M; d >>= 26;
    VERIFY_BITS(t9, 26);
    VERIFY_BITS(d, 38);
    /* [d t9 0 0 0 0 0 0 0 0 0] = [p9 0 0 0 0 0 0 0 0 0] */
@ -475,12 +493,12 @@ SECP256K1_INLINE static void secp256k1_fe_mul_inner(uint32_t *r, const uint32_t
       + (uint64_t)a[9] * b[1];
    VERIFY_BITS(d, 63);
    /* [d t9 0 0 0 0 0 0 0 0 c] = [p10 p9 0 0 0 0 0 0 0 0 p0] */
-    uint64_t u0 = d & M; d >>= 26; c += u0 * R0;
+    u0 = d & M; d >>= 26; c += u0 * R0;
    VERIFY_BITS(u0, 26);
    VERIFY_BITS(d, 37);
    VERIFY_BITS(c, 61);
    /* [d u0 t9 0 0 0 0 0 0 0 0 c-u0*R0] = [p10 p9 0 0 0 0 0 0 0 0 p0] */
-    uint32_t t0 = c & M; c >>= 26; c += u0 * R1;
+    t0 = c & M; c >>= 26; c += u0 * R1;
    VERIFY_BITS(t0, 26);
    VERIFY_BITS(c, 37);
    /* [d u0 t9 0 0 0 0 0 0 0 c-u0*R1 t0-u0*R0] = [p10 p9 0 0 0 0 0 0 0 0 p0] */
@ -500,12 +518,12 @@ SECP256K1_INLINE static void secp256k1_fe_mul_inner(uint32_t *r, const uint32_t
       + (uint64_t)a[9] * b[2];
    VERIFY_BITS(d, 63);
    /* [d 0 t9 0 0 0 0 0 0 0 c t0] = [p11 p10 p9 0 0 0 0 0 0 0 p1 p0] */
-    uint64_t u1 = d & M; d >>= 26; c += u1 * R0;
+    u1 = d & M; d >>= 26; c += u1 * R0;
    VERIFY_BITS(u1, 26);
    VERIFY_BITS(d, 37);
    VERIFY_BITS(c, 63);
    /* [d u1 0 t9 0 0 0 0 0 0 0 c-u1*R0 t0] = [p11 p10 p9 0 0 0 0 0 0 0 p1 p0] */
-    uint32_t t1 = c & M; c >>= 26; c += u1 * R1;
+    t1 = c & M; c >>= 26; c += u1 * R1;
    VERIFY_BITS(t1, 26);
    VERIFY_BITS(c, 38);
    /* [d u1 0 t9 0 0 0 0 0 0 c-u1*R1 t1-u1*R0 t0] = [p11 p10 p9 0 0 0 0 0 0 0 p1 p0] */
@ -525,12 +543,12 @@ SECP256K1_INLINE static void secp256k1_fe_mul_inner(uint32_t *r, const uint32_t
       + (uint64_t)a[9] * b[3];
    VERIFY_BITS(d, 63);
    /* [d 0 0 t9 0 0 0 0 0 0 c t1 t0] = [p12 p11 p10 p9 0 0 0 0 0 0 p2 p1 p0] */
-    uint64_t u2 = d & M; d >>= 26; c += u2 * R0;
+    u2 = d & M; d >>= 26; c += u2 * R0;
    VERIFY_BITS(u2, 26);
    VERIFY_BITS(d, 37);
    VERIFY_BITS(c, 63);
    /* [d u2 0 0 t9 0 0 0 0 0 0 c-u2*R0 t1 t0] = [p12 p11 p10 p9 0 0 0 0 0 0 p2 p1 p0] */
-    uint32_t t2 = c & M; c >>= 26; c += u2 * R1;
+    t2 = c & M; c >>= 26; c += u2 * R1;
    VERIFY_BITS(t2, 26);
    VERIFY_BITS(c, 38);
    /* [d u2 0 0 t9 0 0 0 0 0 c-u2*R1 t2-u2*R0 t1 t0] = [p12 p11 p10 p9 0 0 0 0 0 0 p2 p1 p0] */
@ -550,12 +568,12 @@ SECP256K1_INLINE static void secp256k1_fe_mul_inner(uint32_t *r, const uint32_t
       + (uint64_t)a[9] * b[4];
    VERIFY_BITS(d, 63);
    /* [d 0 0 0 t9 0 0 0 0 0 c t2 t1 t0] = [p13 p12 p11 p10 p9 0 0 0 0 0 p3 p2 p1 p0] */
-    uint64_t u3 = d & M; d >>= 26; c += u3 * R0;
+    u3 = d & M; d >>= 26; c += u3 * R0;
    VERIFY_BITS(u3, 26);
    VERIFY_BITS(d, 37);
    /* VERIFY_BITS(c, 64); */
    /* [d u3 0 0 0 t9 0 0 0 0 0 c-u3*R0 t2 t1 t0] = [p13 p12 p11 p10 p9 0 0 0 0 0 p3 p2 p1 p0] */
-    uint32_t t3 = c & M; c >>= 26; c += u3 * R1;
+    t3 = c & M; c >>= 26; c += u3 * R1;
    VERIFY_BITS(t3, 26);
    VERIFY_BITS(c, 39);
    /* [d u3 0 0 0 t9 0 0 0 0 c-u3*R1 t3-u3*R0 t2 t1 t0] = [p13 p12 p11 p10 p9 0 0 0 0 0 p3 p2 p1 p0] */
@ -575,12 +593,12 @@ SECP256K1_INLINE static void secp256k1_fe_mul_inner(uint32_t *r, const uint32_t
       + (uint64_t)a[9] * b[5];
    VERIFY_BITS(d, 62);
    /* [d 0 0 0 0 t9 0 0 0 0 c t3 t2 t1 t0] = [p14 p13 p12 p11 p10 p9 0 0 0 0 p4 p3 p2 p1 p0] */
-    uint64_t u4 = d & M; d >>= 26; c += u4 * R0;
+    u4 = d & M; d >>= 26; c += u4 * R0;
    VERIFY_BITS(u4, 26);
    VERIFY_BITS(d, 36);
    /* VERIFY_BITS(c, 64); */
    /* [d u4 0 0 0 0 t9 0 0 0 0 c-u4*R0 t3 t2 t1 t0] = [p14 p13 p12 p11 p10 p9 0 0 0 0 p4 p3 p2 p1 p0] */
-    uint32_t t4 = c & M; c >>= 26; c += u4 * R1;
+    t4 = c & M; c >>= 26; c += u4 * R1;
    VERIFY_BITS(t4, 26);
    VERIFY_BITS(c, 39);
    /* [d u4 0 0 0 0 t9 0 0 0 c-u4*R1 t4-u4*R0 t3 t2 t1 t0] = [p14 p13 p12 p11 p10 p9 0 0 0 0 p4 p3 p2 p1 p0] */
@ -600,12 +618,12 @@ SECP256K1_INLINE static void secp256k1_fe_mul_inner(uint32_t *r, const uint32_t
       + (uint64_t)a[9] * b[6];
    VERIFY_BITS(d, 62);
    /* [d 0 0 0 0 0 t9 0 0 0 c t4 t3 t2 t1 t0] = [p15 p14 p13 p12 p11 p10 p9 0 0 0 p5 p4 p3 p2 p1 p0] */
-    uint64_t u5 = d & M; d >>= 26; c += u5 * R0;
+    u5 = d & M; d >>= 26; c += u5 * R0;
    VERIFY_BITS(u5, 26);
    VERIFY_BITS(d, 36);
    /* VERIFY_BITS(c, 64); */
    /* [d u5 0 0 0 0 0 t9 0 0 0 c-u5*R0 t4 t3 t2 t1 t0] = [p15 p14 p13 p12 p11 p10 p9 0 0 0 p5 p4 p3 p2 p1 p0] */
-    uint32_t t5 = c & M; c >>= 26; c += u5 * R1;
+    t5 = c & M; c >>= 26; c += u5 * R1;
    VERIFY_BITS(t5, 26);
    VERIFY_BITS(c, 39);
    /* [d u5 0 0 0 0 0 t9 0 0 c-u5*R1 t5-u5*R0 t4 t3 t2 t1 t0] = [p15 p14 p13 p12 p11 p10 p9 0 0 0 p5 p4 p3 p2 p1 p0] */
@ -625,12 +643,12 @@ SECP256K1_INLINE static void secp256k1_fe_mul_inner(uint32_t *r, const uint32_t
       + (uint64_t)a[9] * b[7];
    VERIFY_BITS(d, 61);
    /* [d 0 0 0 0 0 0 t9 0 0 c t5 t4 t3 t2 t1 t0] = [p16 p15 p14 p13 p12 p11 p10 p9 0 0 p6 p5 p4 p3 p2 p1 p0] */
-    uint64_t u6 = d & M; d >>= 26; c += u6 * R0;
+    u6 = d & M; d >>= 26; c += u6 * R0;
    VERIFY_BITS(u6, 26);
    VERIFY_BITS(d, 35);
    /* VERIFY_BITS(c, 64); */
    /* [d u6 0 0 0 0 0 0 t9 0 0 c-u6*R0 t5 t4 t3 t2 t1 t0] = [p16 p15 p14 p13 p12 p11 p10 p9 0 0 p6 p5 p4 p3 p2 p1 p0] */
-    uint32_t t6 = c & M; c >>= 26; c += u6 * R1;
+    t6 = c & M; c >>= 26; c += u6 * R1;
    VERIFY_BITS(t6, 26);
    VERIFY_BITS(c, 39);
    /* [d u6 0 0 0 0 0 0 t9 0 c-u6*R1 t6-u6*R0 t5 t4 t3 t2 t1 t0] = [p16 p15 p14 p13 p12 p11 p10 p9 0 0 p6 p5 p4 p3 p2 p1 p0] */
@ -651,13 +669,13 @@ SECP256K1_INLINE static void secp256k1_fe_mul_inner(uint32_t *r, const uint32_t
       + (uint64_t)a[9] * b[8];
    VERIFY_BITS(d, 58);
    /* [d 0 0 0 0 0 0 0 t9 0 c t6 t5 t4 t3 t2 t1 t0] = [p17 p16 p15 p14 p13 p12 p11 p10 p9 0 p7 p6 p5 p4 p3 p2 p1 p0] */
-    uint64_t u7 = d & M; d >>= 26; c += u7 * R0;
+    u7 = d & M; d >>= 26; c += u7 * R0;
    VERIFY_BITS(u7, 26);
    VERIFY_BITS(d, 32);
    /* VERIFY_BITS(c, 64); */
    VERIFY_CHECK(c <= 0x800001703FFFC2F7ULL);
    /* [d u7 0 0 0 0 0 0 0 t9 0 c-u7*R0 t6 t5 t4 t3 t2 t1 t0] = [p17 p16 p15 p14 p13 p12 p11 p10 p9 0 p7 p6 p5 p4 p3 p2 p1 p0] */
-    uint32_t t7 = c & M; c >>= 26; c += u7 * R1;
+    t7 = c & M; c >>= 26; c += u7 * R1;
    VERIFY_BITS(t7, 26);
    VERIFY_BITS(c, 38);
    /* [d u7 0 0 0 0 0 0 0 t9 c-u7*R1 t7-u7*R0 t6 t5 t4 t3 t2 t1 t0] = [p17 p16 p15 p14 p13 p12 p11 p10 p9 0 p7 p6 p5 p4 p3 p2 p1 p0] */
@ -678,7 +696,7 @@ SECP256K1_INLINE static void secp256k1_fe_mul_inner(uint32_t *r, const uint32_t
    d += (uint64_t)a[9] * b[9];
    VERIFY_BITS(d, 57);
    /* [d 0 0 0 0 0 0 0 0 t9 c t7 t6 t5 t4 t3 t2 t1 t0] = [p18 p17 p16 p15 p14 p13 p12 p11 p10 p9 p8 p7 p6 p5 p4 p3 p2 p1 p0] */
-    uint64_t u8 = d & M; d >>= 26; c += u8 * R0;
+    u8 = d & M; d >>= 26; c += u8 * R0;
    VERIFY_BITS(u8, 26);
    VERIFY_BITS(d, 31);
    /* VERIFY_BITS(c, 64); */
@ -742,6 +760,11 @@ SECP256K1_INLINE static void secp256k1_fe_mul_inner(uint32_t *r, const uint32_t
 }

 SECP256K1_INLINE static void secp256k1_fe_sqr_inner(uint32_t *r, const uint32_t *a) {
+    uint64_t c, d;
+    uint64_t u0, u1, u2, u3, u4, u5, u6, u7, u8;
+    uint32_t t9, t0, t1, t2, t3, t4, t5, t6, t7;
+    const uint32_t M = 0x3FFFFFFUL, R0 = 0x3D10UL, R1 = 0x400UL;
+
    VERIFY_BITS(a[0], 30);
    VERIFY_BITS(a[1], 30);
    VERIFY_BITS(a[2], 30);
@ -753,14 +776,11 @@ SECP256K1_INLINE static void secp256k1_fe_sqr_inner(uint32_t *r, const uint32_t
    VERIFY_BITS(a[8], 30);
    VERIFY_BITS(a[9], 26);

-    const uint32_t M = 0x3FFFFFFUL, R0 = 0x3D10UL, R1 = 0x400UL;
    /** [... a b c] is a shorthand for ... + a<<52 + b<<26 + c<<0 mod n.
     *  px is a shorthand for sum(a[i]*a[x-i], i=0..x).
     *  Note that [x 0 0 0 0 0 0 0 0 0 0] = [x*R1 x*R0].
     */

-    uint64_t c, d;
-
    d  = (uint64_t)(a[0]*2) * a[9]
       + (uint64_t)(a[1]*2) * a[8]
       + (uint64_t)(a[2]*2) * a[7]
@ -768,7 +788,7 @@ SECP256K1_INLINE static void secp256k1_fe_sqr_inner(uint32_t *r, const uint32_t
       + (uint64_t)(a[4]*2) * a[5];
    /* VERIFY_BITS(d, 64); */
    /* [d 0 0 0 0 0 0 0 0 0] = [p9 0 0 0 0 0 0 0 0 0] */
-    uint32_t t9 = d & M; d >>= 26;
+    t9 = d & M; d >>= 26;
    VERIFY_BITS(t9, 26);
    VERIFY_BITS(d, 38);
    /* [d t9 0 0 0 0 0 0 0 0 0] = [p9 0 0 0 0 0 0 0 0 0] */
@ -783,12 +803,12 @@ SECP256K1_INLINE static void secp256k1_fe_sqr_inner(uint32_t *r, const uint32_t
       + (uint64_t)a[5] * a[5];
    VERIFY_BITS(d, 63);
    /* [d t9 0 0 0 0 0 0 0 0 c] = [p10 p9 0 0 0 0 0 0 0 0 p0] */
-    uint64_t u0 = d & M; d >>= 26; c += u0 * R0;
+    u0 = d & M; d >>= 26; c += u0 * R0;
    VERIFY_BITS(u0, 26);
    VERIFY_BITS(d, 37);
    VERIFY_BITS(c, 61);
    /* [d u0 t9 0 0 0 0 0 0 0 0 c-u0*R0] = [p10 p9 0 0 0 0 0 0 0 0 p0] */
-    uint32_t t0 = c & M; c >>= 26; c += u0 * R1;
+    t0 = c & M; c >>= 26; c += u0 * R1;
    VERIFY_BITS(t0, 26);
    VERIFY_BITS(c, 37);
    /* [d u0 t9 0 0 0 0 0 0 0 c-u0*R1 t0-u0*R0] = [p10 p9 0 0 0 0 0 0 0 0 p0] */
@ -803,12 +823,12 @@ SECP256K1_INLINE static void secp256k1_fe_sqr_inner(uint32_t *r, const uint32_t
       + (uint64_t)(a[5]*2) * a[6];
    VERIFY_BITS(d, 63);
    /* [d 0 t9 0 0 0 0 0 0 0 c t0] = [p11 p10 p9 0 0 0 0 0 0 0 p1 p0] */
-    uint64_t u1 = d & M; d >>= 26; c += u1 * R0;
+    u1 = d & M; d >>= 26; c += u1 * R0;
    VERIFY_BITS(u1, 26);
    VERIFY_BITS(d, 37);
    VERIFY_BITS(c, 63);
    /* [d u1 0 t9 0 0 0 0 0 0 0 c-u1*R0 t0] = [p11 p10 p9 0 0 0 0 0 0 0 p1 p0] */
-    uint32_t t1 = c & M; c >>= 26; c += u1 * R1;
+    t1 = c & M; c >>= 26; c += u1 * R1;
    VERIFY_BITS(t1, 26);
    VERIFY_BITS(c, 38);
    /* [d u1 0 t9 0 0 0 0 0 0 c-u1*R1 t1-u1*R0 t0] = [p11 p10 p9 0 0 0 0 0 0 0 p1 p0] */
@ -824,12 +844,12 @@ SECP256K1_INLINE static void secp256k1_fe_sqr_inner(uint32_t *r, const uint32_t
       + (uint64_t)a[6] * a[6];
    VERIFY_BITS(d, 63);
    /* [d 0 0 t9 0 0 0 0 0 0 c t1 t0] = [p12 p11 p10 p9 0 0 0 0 0 0 p2 p1 p0] */
-    uint64_t u2 = d & M; d >>= 26; c += u2 * R0;
+    u2 = d & M; d >>= 26; c += u2 * R0;
    VERIFY_BITS(u2, 26);
    VERIFY_BITS(d, 37);
    VERIFY_BITS(c, 63);
    /* [d u2 0 0 t9 0 0 0 0 0 0 c-u2*R0 t1 t0] = [p12 p11 p10 p9 0 0 0 0 0 0 p2 p1 p0] */
-    uint32_t t2 = c & M; c >>= 26; c += u2 * R1;
+    t2 = c & M; c >>= 26; c += u2 * R1;
    VERIFY_BITS(t2, 26);
    VERIFY_BITS(c, 38);
    /* [d u2 0 0 t9 0 0 0 0 0 c-u2*R1 t2-u2*R0 t1 t0] = [p12 p11 p10 p9 0 0 0 0 0 0 p2 p1 p0] */
@ -844,12 +864,12 @@ SECP256K1_INLINE static void secp256k1_fe_sqr_inner(uint32_t *r, const uint32_t
       + (uint64_t)(a[6]*2) * a[7];
    VERIFY_BITS(d, 63);
    /* [d 0 0 0 t9 0 0 0 0 0 c t2 t1 t0] = [p13 p12 p11 p10 p9 0 0 0 0 0 p3 p2 p1 p0] */
-    uint64_t u3 = d & M; d >>= 26; c += u3 * R0;
+    u3 = d & M; d >>= 26; c += u3 * R0;
    VERIFY_BITS(u3, 26);
    VERIFY_BITS(d, 37);
    /* VERIFY_BITS(c, 64); */
    /* [d u3 0 0 0 t9 0 0 0 0 0 c-u3*R0 t2 t1 t0] = [p13 p12 p11 p10 p9 0 0 0 0 0 p3 p2 p1 p0] */
-    uint32_t t3 = c & M; c >>= 26; c += u3 * R1;
+    t3 = c & M; c >>= 26; c += u3 * R1;
    VERIFY_BITS(t3, 26);
    VERIFY_BITS(c, 39);
    /* [d u3 0 0 0 t9 0 0 0 0 c-u3*R1 t3-u3*R0 t2 t1 t0] = [p13 p12 p11 p10 p9 0 0 0 0 0 p3 p2 p1 p0] */
@ -865,12 +885,12 @@ SECP256K1_INLINE static void secp256k1_fe_sqr_inner(uint32_t *r, const uint32_t
       + (uint64_t)a[7] * a[7];
    VERIFY_BITS(d, 62);
    /* [d 0 0 0 0 t9 0 0 0 0 c t3 t2 t1 t0] = [p14 p13 p12 p11 p10 p9 0 0 0 0 p4 p3 p2 p1 p0] */
-    uint64_t u4 = d & M; d >>= 26; c += u4 * R0;
+    u4 = d & M; d >>= 26; c += u4 * R0;
    VERIFY_BITS(u4, 26);
    VERIFY_BITS(d, 36);
    /* VERIFY_BITS(c, 64); */
    /* [d u4 0 0 0 0 t9 0 0 0 0 c-u4*R0 t3 t2 t1 t0] = [p14 p13 p12 p11 p10 p9 0 0 0 0 p4 p3 p2 p1 p0] */
-    uint32_t t4 = c & M; c >>= 26; c += u4 * R1;
+    t4 = c & M; c >>= 26; c += u4 * R1;
    VERIFY_BITS(t4, 26);
    VERIFY_BITS(c, 39);
    /* [d u4 0 0 0 0 t9 0 0 0 c-u4*R1 t4-u4*R0 t3 t2 t1 t0] = [p14 p13 p12 p11 p10 p9 0 0 0 0 p4 p3 p2 p1 p0] */
@ -885,12 +905,12 @@ SECP256K1_INLINE static void secp256k1_fe_sqr_inner(uint32_t *r, const uint32_t
       + (uint64_t)(a[7]*2) * a[8];
    VERIFY_BITS(d, 62);
    /* [d 0 0 0 0 0 t9 0 0 0 c t4 t3 t2 t1 t0] = [p15 p14 p13 p12 p11 p10 p9 0 0 0 p5 p4 p3 p2 p1 p0] */
-    uint64_t u5 = d & M; d >>= 26; c += u5 * R0;
+    u5 = d & M; d >>= 26; c += u5 * R0;
    VERIFY_BITS(u5, 26);
    VERIFY_BITS(d, 36);
    /* VERIFY_BITS(c, 64); */
    /* [d u5 0 0 0 0 0 t9 0 0 0 c-u5*R0 t4 t3 t2 t1 t0] = [p15 p14 p13 p12 p11 p10 p9 0 0 0 p5 p4 p3 p2 p1 p0] */
-    uint32_t t5 = c & M; c >>= 26; c += u5 * R1;
+    t5 = c & M; c >>= 26; c += u5 * R1;
    VERIFY_BITS(t5, 26);
    VERIFY_BITS(c, 39);
    /* [d u5 0 0 0 0 0 t9 0 0 c-u5*R1 t5-u5*R0 t4 t3 t2 t1 t0] = [p15 p14 p13 p12 p11 p10 p9 0 0 0 p5 p4 p3 p2 p1 p0] */
@ -906,12 +926,12 @@ SECP256K1_INLINE static void secp256k1_fe_sqr_inner(uint32_t *r, const uint32_t
       + (uint64_t)a[8] * a[8];
    VERIFY_BITS(d, 61);
    /* [d 0 0 0 0 0 0 t9 0 0 c t5 t4 t3 t2 t1 t0] = [p16 p15 p14 p13 p12 p11 p10 p9 0 0 p6 p5 p4 p3 p2 p1 p0] */
-    uint64_t u6 = d & M; d >>= 26; c += u6 * R0;
+    u6 = d & M; d >>= 26; c += u6 * R0;
    VERIFY_BITS(u6, 26);
    VERIFY_BITS(d, 35);
    /* VERIFY_BITS(c, 64); */
    /* [d u6 0 0 0 0 0 0 t9 0 0 c-u6*R0 t5 t4 t3 t2 t1 t0] = [p16 p15 p14 p13 p12 p11 p10 p9 0 0 p6 p5 p4 p3 p2 p1 p0] */
-    uint32_t t6 = c & M; c >>= 26; c += u6 * R1;
+    t6 = c & M; c >>= 26; c += u6 * R1;
    VERIFY_BITS(t6, 26);
    VERIFY_BITS(c, 39);
    /* [d u6 0 0 0 0 0 0 t9 0 c-u6*R1 t6-u6*R0 t5 t4 t3 t2 t1 t0] = [p16 p15 p14 p13 p12 p11 p10 p9 0 0 p6 p5 p4 p3 p2 p1 p0] */
@ -927,13 +947,13 @@ SECP256K1_INLINE static void secp256k1_fe_sqr_inner(uint32_t *r, const uint32_t
    d += (uint64_t)(a[8]*2) * a[9];
    VERIFY_BITS(d, 58);
    /* [d 0 0 0 0 0 0 0 t9 0 c t6 t5 t4 t3 t2 t1 t0] = [p17 p16 p15 p14 p13 p12 p11 p10 p9 0 p7 p6 p5 p4 p3 p2 p1 p0] */
-    uint64_t u7 = d & M; d >>= 26; c += u7 * R0;
+    u7 = d & M; d >>= 26; c += u7 * R0;
    VERIFY_BITS(u7, 26);
    VERIFY_BITS(d, 32);
    /* VERIFY_BITS(c, 64); */
    VERIFY_CHECK(c <= 0x800001703FFFC2F7ULL);
    /* [d u7 0 0 0 0 0 0 0 t9 0 c-u7*R0 t6 t5 t4 t3 t2 t1 t0] = [p17 p16 p15 p14 p13 p12 p11 p10 p9 0 p7 p6 p5 p4 p3 p2 p1 p0] */
-    uint32_t t7 = c & M; c >>= 26; c += u7 * R1;
+    t7 = c & M; c >>= 26; c += u7 * R1;
    VERIFY_BITS(t7, 26);
    VERIFY_BITS(c, 38);
    /* [d u7 0 0 0 0 0 0 0 t9 c-u7*R1 t7-u7*R0 t6 t5 t4 t3 t2 t1 t0] = [p17 p16 p15 p14 p13 p12 p11 p10 p9 0 p7 p6 p5 p4 p3 p2 p1 p0] */
@ -950,7 +970,7 @@ SECP256K1_INLINE static void secp256k1_fe_sqr_inner(uint32_t *r, const uint32_t
    d += (uint64_t)a[9] * a[9];
    VERIFY_BITS(d, 57);
    /* [d 0 0 0 0 0 0 0 0 t9 c t7 t6 t5 t4 t3 t2 t1 t0] = [p18 p17 p16 p15 p14 p13 p12 p11 p10 p9 p8 p7 p6 p5 p4 p3 p2 p1 p0] */
-    uint64_t u8 = d & M; d >>= 26; c += u8 * R0;
+    u8 = d & M; d >>= 26; c += u8 * R0;
    VERIFY_BITS(u8, 26);
    VERIFY_BITS(d, 31);
    /* VERIFY_BITS(c, 64); */
@ -1043,8 +1063,10 @@ static void secp256k1_fe_sqr(secp256k1_fe_t *r, const secp256k1_fe_t *a) {
 #endif
 }

-static void secp256k1_fe_cmov(secp256k1_fe_t *r, const secp256k1_fe_t *a, int flag) {
-    uint32_t mask0 = flag + ~((uint32_t)0), mask1 = ~mask0;
+static SECP256K1_INLINE void secp256k1_fe_storage_cmov(secp256k1_fe_storage_t *r, const secp256k1_fe_storage_t *a, int flag) {
+    uint32_t mask0, mask1;
+    mask0 = flag + ~((uint32_t)0);
+    mask1 = ~mask0;
    r->n[0] = (r->n[0] & mask0) | (a->n[0] & mask1);
    r->n[1] = (r->n[1] & mask0) | (a->n[1] & mask1);
    r->n[2] = (r->n[2] & mask0) | (a->n[2] & mask1);
@ -1053,13 +1075,36 @@ static void secp256k1_fe_cmov(secp256k1_fe_t *r, const secp256k1_fe_t *a, int fl
    r->n[5] = (r->n[5] & mask0) | (a->n[5] & mask1);
    r->n[6] = (r->n[6] & mask0) | (a->n[6] & mask1);
    r->n[7] = (r->n[7] & mask0) | (a->n[7] & mask1);
-    r->n[8] = (r->n[8] & mask0) | (a->n[8] & mask1);
-    r->n[9] = (r->n[9] & mask0) | (a->n[9] & mask1);
+}
+
+static void secp256k1_fe_to_storage(secp256k1_fe_storage_t *r, const secp256k1_fe_t *a) { /* Pack the ten 26-bit limbs of *a into the eight 32-bit words of *r. */
 #ifdef VERIFY
-    if (flag) {
-        r->magnitude = a->magnitude;
-        r->normalized = a->normalized;
-    }
+    VERIFY_CHECK(a->normalized); /* limbs are OR-ed together unmasked below, so each must already fit its 26-bit slot (22 bits for n[9]) */
+#endif
+    r->n[0] = a->n[0] | a->n[1] << 26; /* limb 0, low 6 bits of limb 1 */
+    r->n[1] = a->n[1] >> 6 | a->n[2] << 20; /* top 20 bits of limb 1, low 12 of limb 2 */
+    r->n[2] = a->n[2] >> 12 | a->n[3] << 14; /* top 14 bits of limb 2, low 18 of limb 3 */
+    r->n[3] = a->n[3] >> 18 | a->n[4] << 8; /* top 8 bits of limb 3, low 24 of limb 4 */
+    r->n[4] = a->n[4] >> 24 | a->n[5] << 2 | a->n[6] << 28; /* top 2 bits of limb 4, all of limb 5, low 4 of limb 6 */
+    r->n[5] = a->n[6] >> 4 | a->n[7] << 22; /* top 22 bits of limb 6, low 10 of limb 7 */
+    r->n[6] = a->n[7] >> 10 | a->n[8] << 16; /* top 16 bits of limb 7, low 16 of limb 8 */
+    r->n[7] = a->n[8] >> 16 | a->n[9] << 10; /* top 10 bits of limb 8, the 22 bits of limb 9 */
+}
+
+static SECP256K1_INLINE void secp256k1_fe_from_storage(secp256k1_fe_t *r, const secp256k1_fe_storage_t *a) { /* Unpack the eight 32-bit storage words of *a into the ten 26-bit limbs of *r. */
+    r->n[0] = a->n[0] & 0x3FFFFFFUL; /* low 26 bits of word 0 */
+    r->n[1] = a->n[0] >> 26 | ((a->n[1] << 6) & 0x3FFFFFFUL); /* top 6 bits of word 0, low 20 of word 1 */
+    r->n[2] = a->n[1] >> 20 | ((a->n[2] << 12) & 0x3FFFFFFUL); /* top 12 bits of word 1, low 14 of word 2 */
+    r->n[3] = a->n[2] >> 14 | ((a->n[3] << 18) & 0x3FFFFFFUL); /* top 18 bits of word 2, low 8 of word 3 */
+    r->n[4] = a->n[3] >> 8 | ((a->n[4] << 24) & 0x3FFFFFFUL); /* top 24 bits of word 3, low 2 of word 4 */
+    r->n[5] = (a->n[4] >> 2) & 0x3FFFFFFUL; /* bits 2..27 of word 4 */
+    r->n[6] = a->n[4] >> 28 | ((a->n[5] << 4) & 0x3FFFFFFUL); /* top 4 bits of word 4, low 22 of word 5 */
+    r->n[7] = a->n[5] >> 22 | ((a->n[6] << 10) & 0x3FFFFFFUL); /* top 10 bits of word 5, low 16 of word 6 */
+    r->n[8] = a->n[6] >> 16 | ((a->n[7] << 16) & 0x3FFFFFFUL); /* top 16 bits of word 6, low 10 of word 7 */
+    r->n[9] = a->n[7] >> 10; /* remaining top 22 bits of word 7 */
+#ifdef VERIFY
+    r->magnitude = 1; /* VERIFY bookkeeping is set unconditionally; the stored value itself is not checked here */
+    r->normalized = 1;
 #endif
 }

--- a/src/field_5x52.h
+++ b/src/field_5x52.h
@ -18,4 +18,30 @@ typedef struct {
 #endif
 } secp256k1_fe_t;

+/* Unpacks a constant, given as eight 32-bit words d7 (most significant) .. d0, into the initializer for five 52-bit limbs: limb 0 is d0 plus the low 20 bits of d1, limb 4 holds the top 48 bits. The right-shifted parts are not masked, so each argument must fit in 32 bits. */
+#define SECP256K1_FE_CONST_INNER(d7, d6, d5, d4, d3, d2, d1, d0) { \
+    (d0) | ((uint64_t)(d1) & 0xFFFFFUL) << 32, \
+    ((d1) >> 20) | ((uint64_t)(d2)) << 12 | ((uint64_t)(d3) & 0xFFUL) << 44, \
+    ((d3) >> 8) | ((uint64_t)(d4) & 0xFFFFFFFUL) << 24, \
+    ((d4) >> 28) | ((uint64_t)(d5)) << 4 | ((uint64_t)(d6) & 0xFFFFUL) << 36, \
+    ((d6) >> 16) | ((uint64_t)(d7)) << 16 \
+}
+
+#ifdef VERIFY /* VERIFY builds append two extra initializers (1, 1) after the limb array -- presumably the magnitude and normalized fields; confirm against secp256k1_fe_t. */
+#define SECP256K1_FE_CONST(d7, d6, d5, d4, d3, d2, d1, d0) {SECP256K1_FE_CONST_INNER((d7), (d6), (d5), (d4), (d3), (d2), (d1), (d0)), 1, 1}
+#else
+#define SECP256K1_FE_CONST(d7, d6, d5, d4, d3, d2, d1, d0) {SECP256K1_FE_CONST_INNER((d7), (d6), (d5), (d4), (d3), (d2), (d1), (d0))}
+#endif
+
+typedef struct { /* Packed form of a field element: four 64-bit words. */
+    uint64_t n[4];
+} secp256k1_fe_storage_t;
+
+#define SECP256K1_FE_STORAGE_CONST(d7, d6, d5, d4, d3, d2, d1, d0) {{ /* each 64-bit word joins two 32-bit arguments: the even-numbered one in the low half, the odd-numbered one in the high half; d1:d0 lands in n[0] */ \
+    (d0) | ((uint64_t)(d1)) << 32, \
+    (d2) | ((uint64_t)(d3)) << 32, \
+    (d4) | ((uint64_t)(d5)) << 32, \
+    (d6) | ((uint64_t)(d7)) << 32 \
+}}
+
 #endif
--- a/src/field_5x52_impl.h
+++ b/src/field_5x52_impl.h
@ -30,13 +30,11 @@
 *  output.
 */

-static void secp256k1_fe_inner_start(void) {}
-static void secp256k1_fe_inner_stop(void) {}
-
 #ifdef VERIFY
 static void secp256k1_fe_verify(const secp256k1_fe_t *a) {
    const uint64_t *d = a->n;
    int m = a->normalized ? 1 : 2 * a->magnitude, r = 1;
+   /* secp256k1 'p' value defined in "Standards for Efficient Cryptography" (SEC2) 2.7.1. */
    r &= (d[0] <= 0xFFFFFFFFFFFFFULL * m);
    r &= (d[1] <= 0xFFFFFFFFFFFFFULL * m);
    r &= (d[2] <= 0xFFFFFFFFFFFFFULL * m);
@ -62,8 +60,8 @@ static void secp256k1_fe_normalize(secp256k1_fe_t *r) {
    uint64_t t0 = r->n[0], t1 = r->n[1], t2 = r->n[2], t3 = r->n[3], t4 = r->n[4];

    /* Reduce t4 at the start so there will be at most a single carry from the first pass */
-    uint64_t x = t4 >> 48; t4 &= 0x0FFFFFFFFFFFFULL;
    uint64_t m;
+    uint64_t x = t4 >> 48; t4 &= 0x0FFFFFFFFFFFFULL;

    /* The first pass ensures the magnitude is 1, ... */
    t0 += x * 0x1000003D1ULL;
@ -129,8 +127,8 @@ static void secp256k1_fe_normalize_var(secp256k1_fe_t *r) {
    uint64_t t0 = r->n[0], t1 = r->n[1], t2 = r->n[2], t3 = r->n[3], t4 = r->n[4];

    /* Reduce t4 at the start so there will be at most a single carry from the first pass */
-    uint64_t x = t4 >> 48; t4 &= 0x0FFFFFFFFFFFFULL;
    uint64_t m;
+    uint64_t x = t4 >> 48; t4 &= 0x0FFFFFFFFFFFFULL;

    /* The first pass ensures the magnitude is 1, ... */
    t0 += x * 0x1000003D1ULL;
@ -172,12 +170,12 @@ static void secp256k1_fe_normalize_var(secp256k1_fe_t *r) {
 static int secp256k1_fe_normalizes_to_zero(secp256k1_fe_t *r) {
    uint64_t t0 = r->n[0], t1 = r->n[1], t2 = r->n[2], t3 = r->n[3], t4 = r->n[4];

-    /* Reduce t4 at the start so there will be at most a single carry from the first pass */
-    uint64_t x = t4 >> 48; t4 &= 0x0FFFFFFFFFFFFULL;
-
    /* z0 tracks a possible raw value of 0, z1 tracks a possible raw value of P */
    uint64_t z0, z1;

+    /* Reduce t4 at the start so there will be at most a single carry from the first pass */
+    uint64_t x = t4 >> 48; t4 &= 0x0FFFFFFFFFFFFULL;
+
    /* The first pass ensures the magnitude is 1, ... */
    t0 += x * 0x1000003D1ULL;
    t1 += (t0 >> 52); t0 &= 0xFFFFFFFFFFFFFULL; z0  = t0; z1  = t0 ^ 0x1000003D0ULL;
@ -193,22 +191,31 @@ static int secp256k1_fe_normalizes_to_zero(secp256k1_fe_t *r) {
 }

 static int secp256k1_fe_normalizes_to_zero_var(secp256k1_fe_t *r) {
-    uint64_t t0 = r->n[0], t4 = r->n[4];
+    uint64_t t0, t1, t2, t3, t4;
+    uint64_t z0, z1;
+    uint64_t x;
+
+    t0 = r->n[0];
+    t4 = r->n[4];

    /* Reduce t4 at the start so there will be at most a single carry from the first pass */
-    uint64_t x = t4 >> 48;
+    x = t4 >> 48;

    /* The first pass ensures the magnitude is 1, ... */
    t0 += x * 0x1000003D1ULL;

    /* z0 tracks a possible raw value of 0, z1 tracks a possible raw value of P */
-    uint64_t z0 = t0 & 0xFFFFFFFFFFFFFULL, z1 = z0 ^ 0x1000003D0ULL;
+    z0 = t0 & 0xFFFFFFFFFFFFFULL;
+    z1 = z0 ^ 0x1000003D0ULL;

    /* Fast return path should catch the majority of cases */
    if ((z0 != 0ULL) & (z1 != 0xFFFFFFFFFFFFFULL))
        return 0;

-    uint64_t t1 = r->n[1], t2 = r->n[2], t3 = r->n[3];
+    t1 = r->n[1];
+    t2 = r->n[2];
+    t3 = r->n[3];
+
    t4 &= 0x0FFFFFFFFFFFFULL;

    t1 += (t0 >> 52); t0  = z0;
@ -234,11 +241,11 @@ SECP256K1_INLINE static void secp256k1_fe_set_int(secp256k1_fe_t *r, int a) {
 }

 SECP256K1_INLINE static int secp256k1_fe_is_zero(const secp256k1_fe_t *a) {
+    const uint64_t *t = a->n;
 #ifdef VERIFY
    VERIFY_CHECK(a->normalized);
    secp256k1_fe_verify(a);
 #endif
-    const uint64_t *t = a->n;
    return (t[0] | t[1] | t[2] | t[3] | t[4]) == 0;
 }

@ -251,23 +258,25 @@ SECP256K1_INLINE static int secp256k1_fe_is_odd(const secp256k1_fe_t *a) {
 }

 SECP256K1_INLINE static void secp256k1_fe_clear(secp256k1_fe_t *a) {
+    int i;
 #ifdef VERIFY
    a->magnitude = 0;
    a->normalized = 1;
 #endif
-    for (int i=0; i<5; i++) {
+    for (i=0; i<5; i++) {
        a->n[i] = 0;
    }
 }

 static int secp256k1_fe_cmp_var(const secp256k1_fe_t *a, const secp256k1_fe_t *b) {
+    int i;
 #ifdef VERIFY
    VERIFY_CHECK(a->normalized);
    VERIFY_CHECK(b->normalized);
    secp256k1_fe_verify(a);
    secp256k1_fe_verify(b);
 #endif
-    for (int i = 4; i >= 0; i--) {
+    for (i = 4; i >= 0; i--) {
        if (a->n[i] > b->n[i]) return 1;
        if (a->n[i] < b->n[i]) return -1;
    }
@ -275,9 +284,11 @@ static int secp256k1_fe_cmp_var(const secp256k1_fe_t *a, const secp256k1_fe_t *b
 }

 static int secp256k1_fe_set_b32(secp256k1_fe_t *r, const unsigned char *a) {
+    int i;
    r->n[0] = r->n[1] = r->n[2] = r->n[3] = r->n[4] = 0;
-    for (int i=0; i<32; i++) {
-        for (int j=0; j<2; j++) {
+    for (i=0; i<32; i++) {
+        int j;
+        for (j=0; j<2; j++) {
            int limb = (8*i+4*j)/52;
            int shift = (8*i+4*j)%52;
            r->n[limb] |= (uint64_t)((a[31-i] >> (4*j)) & 0xF) << shift;
@ -296,13 +307,15 @@ static int secp256k1_fe_set_b32(secp256k1_fe_t *r, const unsigned char *a) {

 /** Convert a field element to a 32-byte big endian value. Requires the input to be normalized */
 static void secp256k1_fe_get_b32(unsigned char *r, const secp256k1_fe_t *a) {
+    int i;
 #ifdef VERIFY
    VERIFY_CHECK(a->normalized);
    secp256k1_fe_verify(a);
 #endif
-    for (int i=0; i<32; i++) {
+    for (i=0; i<32; i++) {
+        int j;
        int c = 0;
-        for (int j=0; j<2; j++) {
+        for (j=0; j<2; j++) {
            int limb = (8*i+4*j)/52;
            int shift = (8*i+4*j)%52;
            c |= ((a->n[limb] >> shift) & 0xF) << (4 * j);
@ -386,18 +399,35 @@ static void secp256k1_fe_sqr(secp256k1_fe_t *r, const secp256k1_fe_t *a) {
 #endif
 }

-static void secp256k1_fe_cmov(secp256k1_fe_t *r, const secp256k1_fe_t *a, int flag) {
-    uint64_t mask0 = flag + ~((uint64_t)0), mask1 = ~mask0;
+static SECP256K1_INLINE void secp256k1_fe_storage_cmov(secp256k1_fe_storage_t *r, const secp256k1_fe_storage_t *a, int flag) {
+    uint64_t mask0, mask1;
+    mask0 = flag + ~((uint64_t)0);
+    mask1 = ~mask0;
    r->n[0] = (r->n[0] & mask0) | (a->n[0] & mask1);
    r->n[1] = (r->n[1] & mask0) | (a->n[1] & mask1);
    r->n[2] = (r->n[2] & mask0) | (a->n[2] & mask1);
    r->n[3] = (r->n[3] & mask0) | (a->n[3] & mask1);
-    r->n[4] = (r->n[4] & mask0) | (a->n[4] & mask1);
+}
+
+static void secp256k1_fe_to_storage(secp256k1_fe_storage_t *r, const secp256k1_fe_t *a) {
 #ifdef VERIFY
-    if (flag) {
-        r->magnitude = a->magnitude;
-        r->normalized = a->normalized;
-    }
+    VERIFY_CHECK(a->normalized);
+#endif
+    r->n[0] = a->n[0] | a->n[1] << 52;
+    r->n[1] = a->n[1] >> 12 | a->n[2] << 40;
+    r->n[2] = a->n[2] >> 24 | a->n[3] << 28;
+    r->n[3] = a->n[3] >> 36 | a->n[4] << 16;
+}
+
+static SECP256K1_INLINE void secp256k1_fe_from_storage(secp256k1_fe_t *r, const secp256k1_fe_storage_t *a) {
+    r->n[0] = a->n[0] & 0xFFFFFFFFFFFFFULL;
+    r->n[1] = a->n[0] >> 52 | ((a->n[1] << 12) & 0xFFFFFFFFFFFFFULL);
+    r->n[2] = a->n[1] >> 40 | ((a->n[2] << 24) & 0xFFFFFFFFFFFFFULL);
+    r->n[3] = a->n[2] >> 28 | ((a->n[3] << 36) & 0xFFFFFFFFFFFFFULL);
+    r->n[4] = a->n[3] >> 16;
+#ifdef VERIFY
+    r->magnitude = 1;
+    r->normalized = 1;
 #endif
 }

--- a/src/field_5x52_int128_impl.h
+++ b/src/field_5x52_int128_impl.h
@ -16,6 +16,11 @@
 #endif

 SECP256K1_INLINE static void secp256k1_fe_mul_inner(uint64_t *r, const uint64_t *a, const uint64_t * SECP256K1_RESTRICT b) {
+    uint128_t c, d;
+    uint64_t t3, t4, tx, u0;
+    uint64_t a0 = a[0], a1 = a[1], a2 = a[2], a3 = a[3], a4 = a[4];
+    const uint64_t M = 0xFFFFFFFFFFFFFULL, R = 0x1000003D10ULL;
+
    VERIFY_BITS(a[0], 56);
    VERIFY_BITS(a[1], 56);
    VERIFY_BITS(a[2], 56);
@ -28,63 +33,58 @@ SECP256K1_INLINE static void secp256k1_fe_mul_inner(uint64_t *r, const uint64_t
    VERIFY_BITS(b[4], 52);
    VERIFY_CHECK(r != b);

-    const uint64_t M = 0xFFFFFFFFFFFFFULL, R = 0x1000003D10ULL;
    /*  [... a b c] is a shorthand for ... + a<<104 + b<<52 + c<<0 mod n.
     *  px is a shorthand for sum(a[i]*b[x-i], i=0..x).
     *  Note that [x 0 0 0 0 0] = [x*R].
     */

-    uint64_t a0 = a[0], a1 = a[1], a2 = a[2], a3 = a[3], a4 = a[4];
-
-    __int128 c, d;
-
-    d  = (__int128)a0 * b[3]
-       + (__int128)a1 * b[2]
-       + (__int128)a2 * b[1]
-       + (__int128)a3 * b[0];
+    d  = (uint128_t)a0 * b[3]
+       + (uint128_t)a1 * b[2]
+       + (uint128_t)a2 * b[1]
+       + (uint128_t)a3 * b[0];
    VERIFY_BITS(d, 114);
    /* [d 0 0 0] = [p3 0 0 0] */
-    c  = (__int128)a4 * b[4];
+    c  = (uint128_t)a4 * b[4];
    VERIFY_BITS(c, 112);
    /* [c 0 0 0 0 d 0 0 0] = [p8 0 0 0 0 p3 0 0 0] */
    d += (c & M) * R; c >>= 52;
    VERIFY_BITS(d, 115);
    VERIFY_BITS(c, 60);
    /* [c 0 0 0 0 0 d 0 0 0] = [p8 0 0 0 0 p3 0 0 0] */
-    uint64_t t3 = d & M; d >>= 52;
+    t3 = d & M; d >>= 52;
    VERIFY_BITS(t3, 52);
    VERIFY_BITS(d, 63);
    /* [c 0 0 0 0 d t3 0 0 0] = [p8 0 0 0 0 p3 0 0 0] */

-    d += (__int128)a0 * b[4]
-       + (__int128)a1 * b[3]
-       + (__int128)a2 * b[2]
-       + (__int128)a3 * b[1]
-       + (__int128)a4 * b[0];
+    d += (uint128_t)a0 * b[4]
+       + (uint128_t)a1 * b[3]
+       + (uint128_t)a2 * b[2]
+       + (uint128_t)a3 * b[1]
+       + (uint128_t)a4 * b[0];
    VERIFY_BITS(d, 115);
    /* [c 0 0 0 0 d t3 0 0 0] = [p8 0 0 0 p4 p3 0 0 0] */
    d += c * R;
    VERIFY_BITS(d, 116);
    /* [d t3 0 0 0] = [p8 0 0 0 p4 p3 0 0 0] */
-    uint64_t t4 = d & M; d >>= 52;
+    t4 = d & M; d >>= 52;
    VERIFY_BITS(t4, 52);
    VERIFY_BITS(d, 64);
    /* [d t4 t3 0 0 0] = [p8 0 0 0 p4 p3 0 0 0] */
-    uint64_t tx = (t4 >> 48); t4 &= (M >> 4);
+    tx = (t4 >> 48); t4 &= (M >> 4);
    VERIFY_BITS(tx, 4);
    VERIFY_BITS(t4, 48);
    /* [d t4+(tx<<48) t3 0 0 0] = [p8 0 0 0 p4 p3 0 0 0] */

-    c  = (__int128)a0 * b[0];
+    c  = (uint128_t)a0 * b[0];
    VERIFY_BITS(c, 112);
    /* [d t4+(tx<<48) t3 0 0 c] = [p8 0 0 0 p4 p3 0 0 p0] */
-    d += (__int128)a1 * b[4]
-       + (__int128)a2 * b[3]
-       + (__int128)a3 * b[2]
-       + (__int128)a4 * b[1];
+    d += (uint128_t)a1 * b[4]
+       + (uint128_t)a2 * b[3]
+       + (uint128_t)a3 * b[2]
+       + (uint128_t)a4 * b[1];
    VERIFY_BITS(d, 115);
    /* [d t4+(tx<<48) t3 0 0 c] = [p8 0 0 p5 p4 p3 0 0 p0] */
-    uint64_t u0 = d & M; d >>= 52;
+    u0 = d & M; d >>= 52;
    VERIFY_BITS(u0, 52);
    VERIFY_BITS(d, 63);
    /* [d u0 t4+(tx<<48) t3 0 0 c] = [p8 0 0 p5 p4 p3 0 0 p0] */
@ -92,7 +92,7 @@ SECP256K1_INLINE static void secp256k1_fe_mul_inner(uint64_t *r, const uint64_t
    u0 = (u0 << 4) | tx;
    VERIFY_BITS(u0, 56);
    /* [d 0 t4+(u0<<48) t3 0 0 c] = [p8 0 0 p5 p4 p3 0 0 p0] */
-    c += (__int128)u0 * (R >> 4);
+    c += (uint128_t)u0 * (R >> 4);
    VERIFY_BITS(c, 115);
    /* [d 0 t4 t3 0 0 c] = [p8 0 0 p5 p4 p3 0 0 p0] */
    r[0] = c & M; c >>= 52;
@ -100,13 +100,13 @@ SECP256K1_INLINE static void secp256k1_fe_mul_inner(uint64_t *r, const uint64_t
    VERIFY_BITS(c, 61);
    /* [d 0 t4 t3 0 c r0] = [p8 0 0 p5 p4 p3 0 0 p0] */

-    c += (__int128)a0 * b[1]
-       + (__int128)a1 * b[0];
+    c += (uint128_t)a0 * b[1]
+       + (uint128_t)a1 * b[0];
    VERIFY_BITS(c, 114);
    /* [d 0 t4 t3 0 c r0] = [p8 0 0 p5 p4 p3 0 p1 p0] */
-    d += (__int128)a2 * b[4]
-       + (__int128)a3 * b[3]
-       + (__int128)a4 * b[2];
+    d += (uint128_t)a2 * b[4]
+       + (uint128_t)a3 * b[3]
+       + (uint128_t)a4 * b[2];
    VERIFY_BITS(d, 114);
    /* [d 0 t4 t3 0 c r0] = [p8 0 p6 p5 p4 p3 0 p1 p0] */
    c += (d & M) * R; d >>= 52;
@ -118,13 +118,13 @@ SECP256K1_INLINE static void secp256k1_fe_mul_inner(uint64_t *r, const uint64_t
    VERIFY_BITS(c, 63);
    /* [d 0 0 t4 t3 c r1 r0] = [p8 0 p6 p5 p4 p3 0 p1 p0] */

-    c += (__int128)a0 * b[2]
-       + (__int128)a1 * b[1]
-       + (__int128)a2 * b[0];
+    c += (uint128_t)a0 * b[2]
+       + (uint128_t)a1 * b[1]
+       + (uint128_t)a2 * b[0];
    VERIFY_BITS(c, 114);
    /* [d 0 0 t4 t3 c r1 r0] = [p8 0 p6 p5 p4 p3 p2 p1 p0] */
-    d += (__int128)a3 * b[4]
-       + (__int128)a4 * b[3];
+    d += (uint128_t)a3 * b[4]
+       + (uint128_t)a4 * b[3];
    VERIFY_BITS(d, 114);
    /* [d 0 0 t4 t3 c t1 r0] = [p8 p7 p6 p5 p4 p3 p2 p1 p0] */
    c += (d & M) * R; d >>= 52;
@ -153,64 +153,64 @@ SECP256K1_INLINE static void secp256k1_fe_mul_inner(uint64_t *r, const uint64_t
 }

 SECP256K1_INLINE static void secp256k1_fe_sqr_inner(uint64_t *r, const uint64_t *a) {
+    uint128_t c, d;
+    uint64_t a0 = a[0], a1 = a[1], a2 = a[2], a3 = a[3], a4 = a[4];
+    uint64_t t3, t4, tx, u0; /* unsigned: each is assigned from (d & M) or a shift of such a value */
+    const uint64_t M = 0xFFFFFFFFFFFFFULL, R = 0x1000003D10ULL;
+
    VERIFY_BITS(a[0], 56);
    VERIFY_BITS(a[1], 56);
    VERIFY_BITS(a[2], 56);
    VERIFY_BITS(a[3], 56);
    VERIFY_BITS(a[4], 52);

-    const uint64_t M = 0xFFFFFFFFFFFFFULL, R = 0x1000003D10ULL;
    /**  [... a b c] is a shorthand for ... + a<<104 + b<<52 + c<<0 mod n.
     *  px is a shorthand for sum(a[i]*a[x-i], i=0..x).
     *  Note that [x 0 0 0 0 0] = [x*R].
     */

-    __int128 c, d;
-
-    uint64_t a0 = a[0], a1 = a[1], a2 = a[2], a3 = a[3], a4 = a[4];
-
-    d  = (__int128)(a0*2) * a3
-       + (__int128)(a1*2) * a2;
+    d  = (uint128_t)(a0*2) * a3
+       + (uint128_t)(a1*2) * a2;
    VERIFY_BITS(d, 114);
    /* [d 0 0 0] = [p3 0 0 0] */
-    c  = (__int128)a4 * a4;
+    c  = (uint128_t)a4 * a4;
    VERIFY_BITS(c, 112);
    /* [c 0 0 0 0 d 0 0 0] = [p8 0 0 0 0 p3 0 0 0] */
    d += (c & M) * R; c >>= 52;
    VERIFY_BITS(d, 115);
    VERIFY_BITS(c, 60);
    /* [c 0 0 0 0 0 d 0 0 0] = [p8 0 0 0 0 p3 0 0 0] */
-    uint64_t t3 = d & M; d >>= 52;
+    t3 = d & M; d >>= 52;
    VERIFY_BITS(t3, 52);
    VERIFY_BITS(d, 63);
    /* [c 0 0 0 0 d t3 0 0 0] = [p8 0 0 0 0 p3 0 0 0] */

    a4 *= 2;
-    d += (__int128)a0 * a4
-       + (__int128)(a1*2) * a3
-       + (__int128)a2 * a2;
+    d += (uint128_t)a0 * a4
+       + (uint128_t)(a1*2) * a3
+       + (uint128_t)a2 * a2;
    VERIFY_BITS(d, 115);
    /* [c 0 0 0 0 d t3 0 0 0] = [p8 0 0 0 p4 p3 0 0 0] */
    d += c * R;
    VERIFY_BITS(d, 116);
    /* [d t3 0 0 0] = [p8 0 0 0 p4 p3 0 0 0] */
-    uint64_t t4 = d & M; d >>= 52;
+    t4 = d & M; d >>= 52;
    VERIFY_BITS(t4, 52);
    VERIFY_BITS(d, 64);
    /* [d t4 t3 0 0 0] = [p8 0 0 0 p4 p3 0 0 0] */
-    uint64_t tx = (t4 >> 48); t4 &= (M >> 4);
+    tx = (t4 >> 48); t4 &= (M >> 4);
    VERIFY_BITS(tx, 4);
    VERIFY_BITS(t4, 48);
    /* [d t4+(tx<<48) t3 0 0 0] = [p8 0 0 0 p4 p3 0 0 0] */

-    c  = (__int128)a0 * a0;
+    c  = (uint128_t)a0 * a0;
    VERIFY_BITS(c, 112);
    /* [d t4+(tx<<48) t3 0 0 c] = [p8 0 0 0 p4 p3 0 0 p0] */
-    d += (__int128)a1 * a4
-       + (__int128)(a2*2) * a3;
+    d += (uint128_t)a1 * a4
+       + (uint128_t)(a2*2) * a3;
    VERIFY_BITS(d, 114);
    /* [d t4+(tx<<48) t3 0 0 c] = [p8 0 0 p5 p4 p3 0 0 p0] */
-    uint64_t u0 = d & M; d >>= 52;
+    u0 = d & M; d >>= 52;
    VERIFY_BITS(u0, 52);
    VERIFY_BITS(d, 62);
    /* [d u0 t4+(tx<<48) t3 0 0 c] = [p8 0 0 p5 p4 p3 0 0 p0] */
@ -218,7 +218,7 @@ SECP256K1_INLINE static void secp256k1_fe_sqr_inner(uint64_t *r, const uint64_t
    u0 = (u0 << 4) | tx;
    VERIFY_BITS(u0, 56);
    /* [d 0 t4+(u0<<48) t3 0 0 c] = [p8 0 0 p5 p4 p3 0 0 p0] */
-    c += (__int128)u0 * (R >> 4);
+    c += (uint128_t)u0 * (R >> 4);
    VERIFY_BITS(c, 113);
    /* [d 0 t4 t3 0 0 c] = [p8 0 0 p5 p4 p3 0 0 p0] */
    r[0] = c & M; c >>= 52;
@ -227,11 +227,11 @@ SECP256K1_INLINE static void secp256k1_fe_sqr_inner(uint64_t *r, const uint64_t
    /* [d 0 t4 t3 0 c r0] = [p8 0 0 p5 p4 p3 0 0 p0] */

    a0 *= 2;
-    c += (__int128)a0 * a1;
+    c += (uint128_t)a0 * a1;
    VERIFY_BITS(c, 114);
    /* [d 0 t4 t3 0 c r0] = [p8 0 0 p5 p4 p3 0 p1 p0] */
-    d += (__int128)a2 * a4
-       + (__int128)a3 * a3;
+    d += (uint128_t)a2 * a4
+       + (uint128_t)a3 * a3;
    VERIFY_BITS(d, 114);
    /* [d 0 t4 t3 0 c r0] = [p8 0 p6 p5 p4 p3 0 p1 p0] */
    c += (d & M) * R; d >>= 52;
@ -243,11 +243,11 @@ SECP256K1_INLINE static void secp256k1_fe_sqr_inner(uint64_t *r, const uint64_t
    VERIFY_BITS(c, 63);
    /* [d 0 0 t4 t3 c r1 r0] = [p8 0 p6 p5 p4 p3 0 p1 p0] */

-    c += (__int128)a0 * a2
-       + (__int128)a1 * a1;
+    c += (uint128_t)a0 * a2
+       + (uint128_t)a1 * a1;
    VERIFY_BITS(c, 114);
    /* [d 0 0 t4 t3 c r1 r0] = [p8 0 p6 p5 p4 p3 p2 p1 p0] */
-    d += (__int128)a3 * a4;
+    d += (uint128_t)a3 * a4;
    VERIFY_BITS(d, 114);
    /* [d 0 0 t4 t3 c r1 r0] = [p8 p7 p6 p5 p4 p3 p2 p1 p0] */
    c += (d & M) * R; d >>= 52;
--- a/src/field_impl.h
+++ b/src/field_impl.h
@ -21,49 +21,6 @@
 #error "Please select field implementation"
 #endif

-static void secp256k1_fe_get_hex(char *r, int *rlen, const secp256k1_fe_t *a) {
-    if (*rlen < 65) {
-        *rlen = 65;
-        return;
-    }
-    *rlen = 65;
-    unsigned char tmp[32];
-    secp256k1_fe_t b = *a;
-    secp256k1_fe_normalize(&b);
-    secp256k1_fe_get_b32(tmp, &b);
-    for (int i=0; i<32; i++) {
-        static const char *c = "0123456789ABCDEF";
-        r[2*i]   = c[(tmp[i] >> 4) & 0xF];
-        r[2*i+1] = c[(tmp[i]) & 0xF];
-    }
-    r[64] = 0x00;
-}
-
-static int secp256k1_fe_set_hex(secp256k1_fe_t *r, const char *a, int alen) {
-    unsigned char tmp[32] = {};
-    static const int cvt[256] = {0, 0, 0, 0, 0, 0, 0,0,0,0,0,0,0,0,0,0,
-                                 0, 0, 0, 0, 0, 0, 0,0,0,0,0,0,0,0,0,0,
-                                 0, 0, 0, 0, 0, 0, 0,0,0,0,0,0,0,0,0,0,
-                                 0, 1, 2, 3, 4, 5, 6,7,8,9,0,0,0,0,0,0,
-                                 0,10,11,12,13,14,15,0,0,0,0,0,0,0,0,0,
-                                 0, 0, 0, 0, 0, 0, 0,0,0,0,0,0,0,0,0,0,
-                                 0,10,11,12,13,14,15,0,0,0,0,0,0,0,0,0,
-                                 0, 0, 0, 0, 0, 0, 0,0,0,0,0,0,0,0,0,0,
-                                 0, 0, 0, 0, 0, 0, 0,0,0,0,0,0,0,0,0,0,
-                                 0, 0, 0, 0, 0, 0, 0,0,0,0,0,0,0,0,0,0,
-                                 0, 0, 0, 0, 0, 0, 0,0,0,0,0,0,0,0,0,0,
-                                 0, 0, 0, 0, 0, 0, 0,0,0,0,0,0,0,0,0,0,
-                                 0, 0, 0, 0, 0, 0, 0,0,0,0,0,0,0,0,0,0,
-                                 0, 0, 0, 0, 0, 0, 0,0,0,0,0,0,0,0,0,0,
-                                 0, 0, 0, 0, 0, 0, 0,0,0,0,0,0,0,0,0,0,
-                                 0, 0, 0, 0, 0, 0, 0,0,0,0,0,0,0,0,0,0};
-    for (int i=0; i<32; i++) {
-        if (alen > i*2)
-            tmp[32 - alen/2 + i] = (cvt[(unsigned char)a[2*i]] << 4) + cvt[(unsigned char)a[2*i+1]];
-    }
-    return secp256k1_fe_set_b32(r, tmp);
-}
-
 SECP256K1_INLINE static int secp256k1_fe_equal_var(const secp256k1_fe_t *a, const secp256k1_fe_t *b) {
    secp256k1_fe_t na;
    secp256k1_fe_negate(&na, a, 1);
@ -72,62 +29,62 @@ SECP256K1_INLINE static int secp256k1_fe_equal_var(const secp256k1_fe_t *a, cons
 }

 static int secp256k1_fe_sqrt_var(secp256k1_fe_t *r, const secp256k1_fe_t *a) {
+    secp256k1_fe_t x2, x3, x6, x9, x11, x22, x44, x88, x176, x220, x223, t1;
+    int j;

    /** The binary representation of (p + 1)/4 has 3 blocks of 1s, with lengths in
     *  { 2, 22, 223 }. Use an addition chain to calculate 2^n - 1 for each block:
     *  1, [2], 3, 6, 9, 11, [22], 44, 88, 176, 220, [223]
     */

-    secp256k1_fe_t x2;
    secp256k1_fe_sqr(&x2, a);
    secp256k1_fe_mul(&x2, &x2, a);

-    secp256k1_fe_t x3;
    secp256k1_fe_sqr(&x3, &x2);
    secp256k1_fe_mul(&x3, &x3, a);

-    secp256k1_fe_t x6 = x3;
-    for (int j=0; j<3; j++) secp256k1_fe_sqr(&x6, &x6);
+    x6 = x3;
+    for (j=0; j<3; j++) secp256k1_fe_sqr(&x6, &x6);
    secp256k1_fe_mul(&x6, &x6, &x3);

-    secp256k1_fe_t x9 = x6;
-    for (int j=0; j<3; j++) secp256k1_fe_sqr(&x9, &x9);
+    x9 = x6;
+    for (j=0; j<3; j++) secp256k1_fe_sqr(&x9, &x9);
    secp256k1_fe_mul(&x9, &x9, &x3);

-    secp256k1_fe_t x11 = x9;
-    for (int j=0; j<2; j++) secp256k1_fe_sqr(&x11, &x11);
+    x11 = x9;
+    for (j=0; j<2; j++) secp256k1_fe_sqr(&x11, &x11);
    secp256k1_fe_mul(&x11, &x11, &x2);

-    secp256k1_fe_t x22 = x11;
-    for (int j=0; j<11; j++) secp256k1_fe_sqr(&x22, &x22);
+    x22 = x11;
+    for (j=0; j<11; j++) secp256k1_fe_sqr(&x22, &x22);
    secp256k1_fe_mul(&x22, &x22, &x11);

-    secp256k1_fe_t x44 = x22;
-    for (int j=0; j<22; j++) secp256k1_fe_sqr(&x44, &x44);
+    x44 = x22;
+    for (j=0; j<22; j++) secp256k1_fe_sqr(&x44, &x44);
    secp256k1_fe_mul(&x44, &x44, &x22);

-    secp256k1_fe_t x88 = x44;
-    for (int j=0; j<44; j++) secp256k1_fe_sqr(&x88, &x88);
+    x88 = x44;
+    for (j=0; j<44; j++) secp256k1_fe_sqr(&x88, &x88);
    secp256k1_fe_mul(&x88, &x88, &x44);

-    secp256k1_fe_t x176 = x88;
-    for (int j=0; j<88; j++) secp256k1_fe_sqr(&x176, &x176);
+    x176 = x88;
+    for (j=0; j<88; j++) secp256k1_fe_sqr(&x176, &x176);
    secp256k1_fe_mul(&x176, &x176, &x88);

-    secp256k1_fe_t x220 = x176;
-    for (int j=0; j<44; j++) secp256k1_fe_sqr(&x220, &x220);
+    x220 = x176;
+    for (j=0; j<44; j++) secp256k1_fe_sqr(&x220, &x220);
    secp256k1_fe_mul(&x220, &x220, &x44);

-    secp256k1_fe_t x223 = x220;
-    for (int j=0; j<3; j++) secp256k1_fe_sqr(&x223, &x223);
+    x223 = x220;
+    for (j=0; j<3; j++) secp256k1_fe_sqr(&x223, &x223);
    secp256k1_fe_mul(&x223, &x223, &x3);

    /* The final result is then assembled using a sliding window over the blocks. */

-    secp256k1_fe_t t1 = x223;
-    for (int j=0; j<23; j++) secp256k1_fe_sqr(&t1, &t1);
+    t1 = x223;
+    for (j=0; j<23; j++) secp256k1_fe_sqr(&t1, &t1);
    secp256k1_fe_mul(&t1, &t1, &x22);
-    for (int j=0; j<6; j++) secp256k1_fe_sqr(&t1, &t1);
+    for (j=0; j<6; j++) secp256k1_fe_sqr(&t1, &t1);
    secp256k1_fe_mul(&t1, &t1, &x2);
    secp256k1_fe_sqr(&t1, &t1);
    secp256k1_fe_sqr(r, &t1);
@ -139,66 +96,66 @@ static int secp256k1_fe_sqrt_var(secp256k1_fe_t *r, const secp256k1_fe_t *a) {
 }

 static void secp256k1_fe_inv(secp256k1_fe_t *r, const secp256k1_fe_t *a) {
+    secp256k1_fe_t x2, x3, x6, x9, x11, x22, x44, x88, x176, x220, x223, t1;
+    int j;

    /** The binary representation of (p - 2) has 5 blocks of 1s, with lengths in
     *  { 1, 2, 22, 223 }. Use an addition chain to calculate 2^n - 1 for each block:
     *  [1], [2], 3, 6, 9, 11, [22], 44, 88, 176, 220, [223]
     */

-    secp256k1_fe_t x2;
    secp256k1_fe_sqr(&x2, a);
    secp256k1_fe_mul(&x2, &x2, a);

-    secp256k1_fe_t x3;
    secp256k1_fe_sqr(&x3, &x2);
    secp256k1_fe_mul(&x3, &x3, a);

-    secp256k1_fe_t x6 = x3;
-    for (int j=0; j<3; j++) secp256k1_fe_sqr(&x6, &x6);
+    x6 = x3;
+    for (j=0; j<3; j++) secp256k1_fe_sqr(&x6, &x6);
    secp256k1_fe_mul(&x6, &x6, &x3);

-    secp256k1_fe_t x9 = x6;
-    for (int j=0; j<3; j++) secp256k1_fe_sqr(&x9, &x9);
+    x9 = x6;
+    for (j=0; j<3; j++) secp256k1_fe_sqr(&x9, &x9);
    secp256k1_fe_mul(&x9, &x9, &x3);

-    secp256k1_fe_t x11 = x9;
-    for (int j=0; j<2; j++) secp256k1_fe_sqr(&x11, &x11);
+    x11 = x9;
+    for (j=0; j<2; j++) secp256k1_fe_sqr(&x11, &x11);
    secp256k1_fe_mul(&x11, &x11, &x2);

-    secp256k1_fe_t x22 = x11;
-    for (int j=0; j<11; j++) secp256k1_fe_sqr(&x22, &x22);
+    x22 = x11;
+    for (j=0; j<11; j++) secp256k1_fe_sqr(&x22, &x22);
    secp256k1_fe_mul(&x22, &x22, &x11);

-    secp256k1_fe_t x44 = x22;
-    for (int j=0; j<22; j++) secp256k1_fe_sqr(&x44, &x44);
+    x44 = x22;
+    for (j=0; j<22; j++) secp256k1_fe_sqr(&x44, &x44);
    secp256k1_fe_mul(&x44, &x44, &x22);

-    secp256k1_fe_t x88 = x44;
-    for (int j=0; j<44; j++) secp256k1_fe_sqr(&x88, &x88);
+    x88 = x44;
+    for (j=0; j<44; j++) secp256k1_fe_sqr(&x88, &x88);
    secp256k1_fe_mul(&x88, &x88, &x44);

-    secp256k1_fe_t x176 = x88;
-    for (int j=0; j<88; j++) secp256k1_fe_sqr(&x176, &x176);
+    x176 = x88;
+    for (j=0; j<88; j++) secp256k1_fe_sqr(&x176, &x176);
    secp256k1_fe_mul(&x176, &x176, &x88);

-    secp256k1_fe_t x220 = x176;
-    for (int j=0; j<44; j++) secp256k1_fe_sqr(&x220, &x220);
+    x220 = x176;
+    for (j=0; j<44; j++) secp256k1_fe_sqr(&x220, &x220);
    secp256k1_fe_mul(&x220, &x220, &x44);

-    secp256k1_fe_t x223 = x220;
-    for (int j=0; j<3; j++) secp256k1_fe_sqr(&x223, &x223);
+    x223 = x220;
+    for (j=0; j<3; j++) secp256k1_fe_sqr(&x223, &x223);
    secp256k1_fe_mul(&x223, &x223, &x3);

    /* The final result is then assembled using a sliding window over the blocks. */

-    secp256k1_fe_t t1 = x223;
-    for (int j=0; j<23; j++) secp256k1_fe_sqr(&t1, &t1);
+    t1 = x223;
+    for (j=0; j<23; j++) secp256k1_fe_sqr(&t1, &t1);
    secp256k1_fe_mul(&t1, &t1, &x22);
-    for (int j=0; j<5; j++) secp256k1_fe_sqr(&t1, &t1);
+    for (j=0; j<5; j++) secp256k1_fe_sqr(&t1, &t1);
    secp256k1_fe_mul(&t1, &t1, a);
-    for (int j=0; j<3; j++) secp256k1_fe_sqr(&t1, &t1);
+    for (j=0; j<3; j++) secp256k1_fe_sqr(&t1, &t1);
    secp256k1_fe_mul(&t1, &t1, &x2);
-    for (int j=0; j<2; j++) secp256k1_fe_sqr(&t1, &t1);
+    for (j=0; j<2; j++) secp256k1_fe_sqr(&t1, &t1);
    secp256k1_fe_mul(r, a, &t1);
 }

@ -206,13 +163,21 @@ static void secp256k1_fe_inv_var(secp256k1_fe_t *r, const secp256k1_fe_t *a) {
 #if defined(USE_FIELD_INV_BUILTIN)
    secp256k1_fe_inv(r, a);
 #elif defined(USE_FIELD_INV_NUM)
+    secp256k1_num_t n, m;
+    /* secp256k1 field prime, value p defined in "Standards for Efficient Cryptography" (SEC2) 2.7.1. */
+    static const unsigned char prime[32] = {
+        0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
+        0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
+        0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
+        0xFF,0xFF,0xFF,0xFE,0xFF,0xFF,0xFC,0x2F
+    };
    unsigned char b[32];
    secp256k1_fe_t c = *a;
    secp256k1_fe_normalize_var(&c);
    secp256k1_fe_get_b32(b, &c);
-    secp256k1_num_t n;
    secp256k1_num_set_bin(&n, b, 32);
-    secp256k1_num_mod_inverse(&n, &n, &secp256k1_fe_consts->p);
+    secp256k1_num_set_bin(&m, prime, 32);
+    secp256k1_num_mod_inverse(&n, &n, &m);
    secp256k1_num_get_bin(b, 32, &n);
    VERIFY_CHECK(secp256k1_fe_set_b32(r, b));
 #else
@ -220,7 +185,9 @@ static void secp256k1_fe_inv_var(secp256k1_fe_t *r, const secp256k1_fe_t *a) {
 #endif
 }

-static void secp256k1_fe_inv_all_var(size_t len, secp256k1_fe_t r[len], const secp256k1_fe_t a[len]) {
+static void secp256k1_fe_inv_all_var(size_t len, secp256k1_fe_t *r, const secp256k1_fe_t *a) {
+    secp256k1_fe_t u;
+    size_t i;
    if (len < 1)
        return;

@ -228,12 +195,12 @@ static void secp256k1_fe_inv_all_var(size_t len, secp256k1_fe_t r[len], const se

    r[0] = a[0];

-    size_t i = 0;
+    i = 0;
    while (++i < len) {
        secp256k1_fe_mul(&r[i], &r[i - 1], &a[i]);
    }

-    secp256k1_fe_t u; secp256k1_fe_inv_var(&u, &r[--i]);
+    secp256k1_fe_inv_var(&u, &r[--i]);

    while (i > 0) {
        int j = i--;
@ -244,32 +211,4 @@ static void secp256k1_fe_inv_all_var(size_t len, secp256k1_fe_t r[len], const se
    r[0] = u;
 }

-static void secp256k1_fe_start(void) {
-#ifndef USE_NUM_NONE
-    static const unsigned char secp256k1_fe_consts_p[] = {
-        0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
-        0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
-        0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
-        0xFF,0xFF,0xFF,0xFE,0xFF,0xFF,0xFC,0x2F
-    };
-#endif
-    if (secp256k1_fe_consts == NULL) {
-        secp256k1_fe_inner_start();
-        secp256k1_fe_consts_t *ret = (secp256k1_fe_consts_t*)checked_malloc(sizeof(secp256k1_fe_consts_t));
-#ifndef USE_NUM_NONE
-        secp256k1_num_set_bin(&ret->p, secp256k1_fe_consts_p, sizeof(secp256k1_fe_consts_p));
-#endif
-        secp256k1_fe_consts = ret;
-    }
-}
-
-static void secp256k1_fe_stop(void) {
-    if (secp256k1_fe_consts != NULL) {
-        secp256k1_fe_consts_t *c = (secp256k1_fe_consts_t*)secp256k1_fe_consts;
-        free((void*)c);
-        secp256k1_fe_consts = NULL;
-        secp256k1_fe_inner_stop();
-    }
-}
-
 #endif
--- a/src/group.h
+++ b/src/group.h
@ -17,6 +17,9 @@ typedef struct {
    int infinity; /* whether this represents the point at infinity */
 } secp256k1_ge_t;

+#define SECP256K1_GE_CONST(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p) {SECP256K1_FE_CONST((a),(b),(c),(d),(e),(f),(g),(h)), SECP256K1_FE_CONST((i),(j),(k),(l),(m),(n),(o),(p)), 0}
+#define SECP256K1_GE_CONST_INFINITY {SECP256K1_FE_CONST(0, 0, 0, 0, 0, 0, 0, 0), SECP256K1_FE_CONST(0, 0, 0, 0, 0, 0, 0, 0), 1}
+
 /** A group element of the secp256k1 curve, in jacobian coordinates. */
 typedef struct {
    secp256k1_fe_t x; /* actual X: x/z^2 */
@ -25,23 +28,15 @@ typedef struct {
    int infinity; /* whether this represents the point at infinity */
 } secp256k1_gej_t;

-/** Global constants related to the group */
-typedef struct {
-    secp256k1_ge_t g; /* the generator point */
-
-#ifdef USE_ENDOMORPHISM
-    /* constants related to secp256k1's efficiently computable endomorphism */
-    secp256k1_fe_t beta;
-#endif
-} secp256k1_ge_consts_t;
-
-static const secp256k1_ge_consts_t *secp256k1_ge_consts = NULL;
+#define SECP256K1_GEJ_CONST(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p) {SECP256K1_FE_CONST((a),(b),(c),(d),(e),(f),(g),(h)), SECP256K1_FE_CONST((i),(j),(k),(l),(m),(n),(o),(p)), SECP256K1_FE_CONST(0, 0, 0, 0, 0, 0, 0, 1), 0}
+#define SECP256K1_GEJ_CONST_INFINITY {SECP256K1_FE_CONST(0, 0, 0, 0, 0, 0, 0, 0), SECP256K1_FE_CONST(0, 0, 0, 0, 0, 0, 0, 0), SECP256K1_FE_CONST(0, 0, 0, 0, 0, 0, 0, 0), 1}

-/** Initialize the group module. */
-static void secp256k1_ge_start(void);
+typedef struct {
+    secp256k1_fe_storage_t x;
+    secp256k1_fe_storage_t y;
+} secp256k1_ge_storage_t;

-/** De-initialize the group module. */
-static void secp256k1_ge_stop(void);
+#define SECP256K1_GE_STORAGE_CONST(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p) {SECP256K1_FE_STORAGE_CONST((a),(b),(c),(d),(e),(f),(g),(h)), SECP256K1_FE_STORAGE_CONST((i),(j),(k),(l),(m),(n),(o),(p))}

 /** Set a group element equal to the point at infinity */
 static void secp256k1_ge_set_infinity(secp256k1_ge_t *r);
@ -61,14 +56,11 @@ static int secp256k1_ge_is_valid_var(const secp256k1_ge_t *a);

 static void secp256k1_ge_neg(secp256k1_ge_t *r, const secp256k1_ge_t *a);

-/** Get a hex representation of a point. *rlen will be overwritten with the real length. */
-static void secp256k1_ge_get_hex(char *r, int *rlen, const secp256k1_ge_t *a);
-
 /** Set a group element equal to another which is given in jacobian coordinates */
 static void secp256k1_ge_set_gej(secp256k1_ge_t *r, secp256k1_gej_t *a);

 /** Set a batch of group elements equal to the inputs given in jacobian coordinates */
-static void secp256k1_ge_set_all_gej_var(size_t len, secp256k1_ge_t r[len], const secp256k1_gej_t a[len]);
+static void secp256k1_ge_set_all_gej_var(size_t len, secp256k1_ge_t *r, const secp256k1_gej_t *a);


 /** Set a group element (jacobian) equal to the point at infinity. */
@ -103,9 +95,6 @@ static void secp256k1_gej_add_ge(secp256k1_gej_t *r, const secp256k1_gej_t *a, c
    guarantee, and b is allowed to be infinity. */
 static void secp256k1_gej_add_ge_var(secp256k1_gej_t *r, const secp256k1_gej_t *a, const secp256k1_ge_t *b);

-/** Get a hex representation of a point. *rlen will be overwritten with the real length. */
-static void secp256k1_gej_get_hex(char *r, int *rlen, const secp256k1_gej_t *a);
-
 #ifdef USE_ENDOMORPHISM
 /** Set r to be equal to lambda times a, where lambda is chosen in a way such that this is very fast. */
 static void secp256k1_gej_mul_lambda(secp256k1_gej_t *r, const secp256k1_gej_t *a);
@ -117,4 +106,13 @@ static void secp256k1_gej_clear(secp256k1_gej_t *r);
 /** Clear a secp256k1_ge_t to prevent leaking sensitive information. */
 static void secp256k1_ge_clear(secp256k1_ge_t *r);

+/** Convert a group element a to the storage type, writing the result to r. */
+static void secp256k1_ge_to_storage(secp256k1_ge_storage_t *r, const secp256k1_ge_t *a);
+
+/** Convert a group element back from the storage type a, writing the result to r. */
+static void secp256k1_ge_from_storage(secp256k1_ge_t *r, const secp256k1_ge_storage_t *a);
+
+/** If flag is true, set *r equal to *a; otherwise leave it. Constant-time. */
+static void secp256k1_ge_storage_cmov(secp256k1_ge_storage_t *r, const secp256k1_ge_storage_t *a, int flag);
+
 #endif
--- a/src/group_impl.h
+++ b/src/group_impl.h
@ -13,6 +13,16 @@
 #include "field.h"
 #include "group.h"

+/** Generator for secp256k1, value 'g' defined in
+ *  "Standards for Efficient Cryptography" (SEC2) 2.7.1.
+ */
+static const secp256k1_ge_t secp256k1_ge_const_g = SECP256K1_GE_CONST(
+    0x79BE667EUL, 0xF9DCBBACUL, 0x55A06295UL, 0xCE870B07UL,
+    0x029BFCDBUL, 0x2DCE28D9UL, 0x59F2815BUL, 0x16F81798UL,
+    0x483ADA77UL, 0x26A3C465UL, 0x5DA4FBFCUL, 0x0E1108A8UL,
+    0xFD17B448UL, 0xA6855419UL, 0x9C47D08FUL, 0xFB10D4B8UL
+);
+
 static void secp256k1_ge_set_infinity(secp256k1_ge_t *r) {
    r->infinity = 1;
 }
@ -33,32 +43,12 @@ static void secp256k1_ge_neg(secp256k1_ge_t *r, const secp256k1_ge_t *a) {
    secp256k1_fe_negate(&r->y, &r->y, 1);
 }

-static void secp256k1_ge_get_hex(char *r, int *rlen, const secp256k1_ge_t *a) {
-    char cx[65]; int lx=65;
-    char cy[65]; int ly=65;
-    secp256k1_fe_get_hex(cx, &lx, &a->x);
-    secp256k1_fe_get_hex(cy, &ly, &a->y);
-    lx = strlen(cx);
-    ly = strlen(cy);
-    int len = lx + ly + 3 + 1;
-    if (*rlen < len) {
-        *rlen = len;
-        return;
-    }
-    *rlen = len;
-    r[0] = '(';
-    memcpy(r+1, cx, lx);
-    r[1+lx] = ',';
-    memcpy(r+2+lx, cy, ly);
-    r[2+lx+ly] = ')';
-    r[3+lx+ly] = 0;
-}
-
 static void secp256k1_ge_set_gej(secp256k1_ge_t *r, secp256k1_gej_t *a) {
+    secp256k1_fe_t z2, z3;
    r->infinity = a->infinity;
    secp256k1_fe_inv(&a->z, &a->z);
-    secp256k1_fe_t z2; secp256k1_fe_sqr(&z2, &a->z);
-    secp256k1_fe_t z3; secp256k1_fe_mul(&z3, &a->z, &z2);
+    secp256k1_fe_sqr(&z2, &a->z);
+    secp256k1_fe_mul(&z3, &a->z, &z2);
    secp256k1_fe_mul(&a->x, &a->x, &z2);
    secp256k1_fe_mul(&a->y, &a->y, &z3);
    secp256k1_fe_set_int(&a->z, 1);
@ -67,13 +57,14 @@ static void secp256k1_ge_set_gej(secp256k1_ge_t *r, secp256k1_gej_t *a) {
 }

 static void secp256k1_ge_set_gej_var(secp256k1_ge_t *r, secp256k1_gej_t *a) {
+    secp256k1_fe_t z2, z3;
    r->infinity = a->infinity;
    if (a->infinity) {
        return;
    }
    secp256k1_fe_inv_var(&a->z, &a->z);
-    secp256k1_fe_t z2; secp256k1_fe_sqr(&z2, &a->z);
-    secp256k1_fe_t z3; secp256k1_fe_mul(&z3, &a->z, &z2);
+    secp256k1_fe_sqr(&z2, &a->z);
+    secp256k1_fe_mul(&z3, &a->z, &z2);
    secp256k1_fe_mul(&a->x, &a->x, &z2);
    secp256k1_fe_mul(&a->y, &a->y, &z3);
    secp256k1_fe_set_int(&a->z, 1);
@ -81,26 +72,30 @@ static void secp256k1_ge_set_gej_var(secp256k1_ge_t *r, secp256k1_gej_t *a) {
    r->y = a->y;
 }

-static void secp256k1_ge_set_all_gej_var(size_t len, secp256k1_ge_t r[len], const secp256k1_gej_t a[len]) {
+static void secp256k1_ge_set_all_gej_var(size_t len, secp256k1_ge_t *r, const secp256k1_gej_t *a) {
+    secp256k1_fe_t *az;
+    secp256k1_fe_t *azi;
+    size_t i;
    size_t count = 0;
-    secp256k1_fe_t *az = checked_malloc(sizeof(secp256k1_fe_t) * len);
-    for (size_t i=0; i<len; i++) {
+    az = checked_malloc(sizeof(secp256k1_fe_t) * len);
+    for (i = 0; i < len; i++) {
        if (!a[i].infinity) {
            az[count++] = a[i].z;
        }
    }

-    secp256k1_fe_t *azi = checked_malloc(sizeof(secp256k1_fe_t) * count);
+    azi = checked_malloc(sizeof(secp256k1_fe_t) * count);
    secp256k1_fe_inv_all_var(count, azi, az);
    free(az);

    count = 0;
-    for (size_t i=0; i<len; i++) {
+    for (i = 0; i < len; i++) {
        r[i].infinity = a[i].infinity;
        if (!a[i].infinity) {
+            secp256k1_fe_t zi2, zi3;
            secp256k1_fe_t *zi = &azi[count++];
-            secp256k1_fe_t zi2; secp256k1_fe_sqr(&zi2, zi);
-            secp256k1_fe_t zi3; secp256k1_fe_mul(&zi3, &zi2, zi);
+            secp256k1_fe_sqr(&zi2, zi);
+            secp256k1_fe_mul(&zi3, &zi2, zi);
            secp256k1_fe_mul(&r[i].x, &a[i].x, &zi2);
            secp256k1_fe_mul(&r[i].y, &a[i].y, &zi3);
        }
@ -136,11 +131,12 @@ static void secp256k1_ge_clear(secp256k1_ge_t *r) {
 }

 static int secp256k1_ge_set_xo_var(secp256k1_ge_t *r, const secp256k1_fe_t *x, int odd) {
+    secp256k1_fe_t x2, x3, c;
    r->x = *x;
-    secp256k1_fe_t x2; secp256k1_fe_sqr(&x2, x);
-    secp256k1_fe_t x3; secp256k1_fe_mul(&x3, x, &x2);
+    secp256k1_fe_sqr(&x2, x);
+    secp256k1_fe_mul(&x3, x, &x2);
    r->infinity = 0;
-    secp256k1_fe_t c; secp256k1_fe_set_int(&c, 7);
+    secp256k1_fe_set_int(&c, 7);
    secp256k1_fe_add(&c, &x3);
    if (!secp256k1_fe_sqrt_var(&r->y, &c))
        return 0;
@ -158,9 +154,10 @@ static void secp256k1_gej_set_ge(secp256k1_gej_t *r, const secp256k1_ge_t *a) {
 }

 static int secp256k1_gej_eq_x_var(const secp256k1_fe_t *x, const secp256k1_gej_t *a) {
+    secp256k1_fe_t r, r2;
    VERIFY_CHECK(!a->infinity);
-    secp256k1_fe_t r; secp256k1_fe_sqr(&r, &a->z); secp256k1_fe_mul(&r, &r, x);
-    secp256k1_fe_t r2 = a->x; secp256k1_fe_normalize_weak(&r2);
+    secp256k1_fe_sqr(&r, &a->z); secp256k1_fe_mul(&r, &r, x);
+    r2 = a->x; secp256k1_fe_normalize_weak(&r2);
    return secp256k1_fe_equal_var(&r, &r2);
 }

@ -178,6 +175,7 @@ static int secp256k1_gej_is_infinity(const secp256k1_gej_t *a) {
 }

 static int secp256k1_gej_is_valid_var(const secp256k1_gej_t *a) {
+    secp256k1_fe_t y2, x3, z2, z6;
    if (a->infinity)
        return 0;
    /** y^2 = x^3 + 7
@ -185,10 +183,10 @@ static int secp256k1_gej_is_valid_var(const secp256k1_gej_t *a) {
     *  Y^2 / Z^6 = X^3 / Z^6 + 7
     *  Y^2 = X^3 + 7*Z^6
     */
-    secp256k1_fe_t y2; secp256k1_fe_sqr(&y2, &a->y);
-    secp256k1_fe_t x3; secp256k1_fe_sqr(&x3, &a->x); secp256k1_fe_mul(&x3, &x3, &a->x);
-    secp256k1_fe_t z2; secp256k1_fe_sqr(&z2, &a->z);
-    secp256k1_fe_t z6; secp256k1_fe_sqr(&z6, &z2); secp256k1_fe_mul(&z6, &z6, &z2);
+    secp256k1_fe_sqr(&y2, &a->y);
+    secp256k1_fe_sqr(&x3, &a->x); secp256k1_fe_mul(&x3, &x3, &a->x);
+    secp256k1_fe_sqr(&z2, &a->z);
+    secp256k1_fe_sqr(&z6, &z2); secp256k1_fe_mul(&z6, &z6, &z2);
    secp256k1_fe_mul_int(&z6, 7);
    secp256k1_fe_add(&x3, &z6);
    secp256k1_fe_normalize_weak(&x3);
@ -196,27 +194,30 @@ static int secp256k1_gej_is_valid_var(const secp256k1_gej_t *a) {
 }

 static int secp256k1_ge_is_valid_var(const secp256k1_ge_t *a) {
+    secp256k1_fe_t y2, x3, c;
    if (a->infinity)
        return 0;
    /* y^2 = x^3 + 7 */
-    secp256k1_fe_t y2; secp256k1_fe_sqr(&y2, &a->y);
-    secp256k1_fe_t x3; secp256k1_fe_sqr(&x3, &a->x); secp256k1_fe_mul(&x3, &x3, &a->x);
-    secp256k1_fe_t c; secp256k1_fe_set_int(&c, 7);
+    secp256k1_fe_sqr(&y2, &a->y);
+    secp256k1_fe_sqr(&x3, &a->x); secp256k1_fe_mul(&x3, &x3, &a->x);
+    secp256k1_fe_set_int(&c, 7);
    secp256k1_fe_add(&x3, &c);
    secp256k1_fe_normalize_weak(&x3);
    return secp256k1_fe_equal_var(&y2, &x3);
 }

 static void secp256k1_gej_double_var(secp256k1_gej_t *r, const secp256k1_gej_t *a) {
-    // For secp256k1, 2Q is infinity if and only if Q is infinity. This is because if 2Q = infinity,
-    // Q must equal -Q, or that Q.y == -(Q.y), or Q.y is 0. For a point on y^2 = x^3 + 7 to have
-    // y=0, x^3 must be -7 mod p. However, -7 has no cube root mod p.
+    /* Operations: 3 mul, 4 sqr, 0 normalize, 12 mul_int/add/negate */
+    secp256k1_fe_t t1,t2,t3,t4;
+    /** For secp256k1, 2Q is infinity if and only if Q is infinity. This is because if 2Q = infinity,
+     *  Q must equal -Q, which means Q.y == -(Q.y), i.e. Q.y is 0. For a point on y^2 = x^3 + 7 to
+     *  have y=0, x^3 must be -7 mod p. However, -7 has no cube root mod p.
+     */
    r->infinity = a->infinity;
    if (r->infinity) {
        return;
    }

-    secp256k1_fe_t t1,t2,t3,t4;
    secp256k1_fe_mul(&r->z, &a->z, &a->y);
    secp256k1_fe_mul_int(&r->z, 2);       /* Z' = 2*Y*Z (2) */
    secp256k1_fe_sqr(&t1, &a->x);
@ -240,6 +241,8 @@ static void secp256k1_gej_double_var(secp256k1_gej_t *r, const secp256k1_gej_t *
 }

 static void secp256k1_gej_add_var(secp256k1_gej_t *r, const secp256k1_gej_t *a, const secp256k1_gej_t *b) {
+    /* Operations: 12 mul, 4 sqr, 2 normalize, 12 mul_int/add/negate */
+    secp256k1_fe_t z22, z12, u1, u2, s1, s2, h, i, i2, h2, h3, t;
    if (a->infinity) {
        *r = *b;
        return;
@ -249,14 +252,14 @@ static void secp256k1_gej_add_var(secp256k1_gej_t *r, const secp256k1_gej_t *a,
        return;
    }
    r->infinity = 0;
-    secp256k1_fe_t z22; secp256k1_fe_sqr(&z22, &b->z);
-    secp256k1_fe_t z12; secp256k1_fe_sqr(&z12, &a->z);
-    secp256k1_fe_t u1; secp256k1_fe_mul(&u1, &a->x, &z22);
-    secp256k1_fe_t u2; secp256k1_fe_mul(&u2, &b->x, &z12);
-    secp256k1_fe_t s1; secp256k1_fe_mul(&s1, &a->y, &z22); secp256k1_fe_mul(&s1, &s1, &b->z);
-    secp256k1_fe_t s2; secp256k1_fe_mul(&s2, &b->y, &z12); secp256k1_fe_mul(&s2, &s2, &a->z);
-    secp256k1_fe_t h; secp256k1_fe_negate(&h, &u1, 1); secp256k1_fe_add(&h, &u2);
-    secp256k1_fe_t i; secp256k1_fe_negate(&i, &s1, 1); secp256k1_fe_add(&i, &s2);
+    secp256k1_fe_sqr(&z22, &b->z);
+    secp256k1_fe_sqr(&z12, &a->z);
+    secp256k1_fe_mul(&u1, &a->x, &z22);
+    secp256k1_fe_mul(&u2, &b->x, &z12);
+    secp256k1_fe_mul(&s1, &a->y, &z22); secp256k1_fe_mul(&s1, &s1, &b->z);
+    secp256k1_fe_mul(&s2, &b->y, &z12); secp256k1_fe_mul(&s2, &s2, &a->z);
+    secp256k1_fe_negate(&h, &u1, 1); secp256k1_fe_add(&h, &u2);
+    secp256k1_fe_negate(&i, &s1, 1); secp256k1_fe_add(&i, &s2);
    if (secp256k1_fe_normalizes_to_zero_var(&h)) {
        if (secp256k1_fe_normalizes_to_zero_var(&i)) {
            secp256k1_gej_double_var(r, a);
@ -265,11 +268,11 @@ static void secp256k1_gej_add_var(secp256k1_gej_t *r, const secp256k1_gej_t *a,
        }
        return;
    }
-    secp256k1_fe_t i2; secp256k1_fe_sqr(&i2, &i);
-    secp256k1_fe_t h2; secp256k1_fe_sqr(&h2, &h);
-    secp256k1_fe_t h3; secp256k1_fe_mul(&h3, &h, &h2);
+    secp256k1_fe_sqr(&i2, &i);
+    secp256k1_fe_sqr(&h2, &h);
+    secp256k1_fe_mul(&h3, &h, &h2);
    secp256k1_fe_mul(&r->z, &a->z, &b->z); secp256k1_fe_mul(&r->z, &r->z, &h);
-    secp256k1_fe_t t; secp256k1_fe_mul(&t, &u1, &h2);
+    secp256k1_fe_mul(&t, &u1, &h2);
    r->x = t; secp256k1_fe_mul_int(&r->x, 2); secp256k1_fe_add(&r->x, &h3); secp256k1_fe_negate(&r->x, &r->x, 3); secp256k1_fe_add(&r->x, &i2);
    secp256k1_fe_negate(&r->y, &r->x, 5); secp256k1_fe_add(&r->y, &t); secp256k1_fe_mul(&r->y, &r->y, &i);
    secp256k1_fe_mul(&h3, &h3, &s1); secp256k1_fe_negate(&h3, &h3, 1);
@ -277,6 +280,8 @@ static void secp256k1_gej_add_var(secp256k1_gej_t *r, const secp256k1_gej_t *a,
 }

 static void secp256k1_gej_add_ge_var(secp256k1_gej_t *r, const secp256k1_gej_t *a, const secp256k1_ge_t *b) {
+    /* Operations: 8 mul, 3 sqr, 4 normalize, 12 mul_int/add/negate */
+    secp256k1_fe_t z12, u1, u2, s1, s2, h, i, i2, h2, h3, t;
    if (a->infinity) {
        r->infinity = b->infinity;
        r->x = b->x;
@ -289,13 +294,13 @@ static void secp256k1_gej_add_ge_var(secp256k1_gej_t *r, const secp256k1_gej_t *
        return;
    }
    r->infinity = 0;
-    secp256k1_fe_t z12; secp256k1_fe_sqr(&z12, &a->z);
-    secp256k1_fe_t u1 = a->x; secp256k1_fe_normalize_weak(&u1);
-    secp256k1_fe_t u2; secp256k1_fe_mul(&u2, &b->x, &z12);
-    secp256k1_fe_t s1 = a->y; secp256k1_fe_normalize_weak(&s1);
-    secp256k1_fe_t s2; secp256k1_fe_mul(&s2, &b->y, &z12); secp256k1_fe_mul(&s2, &s2, &a->z);
-    secp256k1_fe_t h; secp256k1_fe_negate(&h, &u1, 1); secp256k1_fe_add(&h, &u2);
-    secp256k1_fe_t i; secp256k1_fe_negate(&i, &s1, 1); secp256k1_fe_add(&i, &s2);
+    secp256k1_fe_sqr(&z12, &a->z);
+    u1 = a->x; secp256k1_fe_normalize_weak(&u1);
+    secp256k1_fe_mul(&u2, &b->x, &z12);
+    s1 = a->y; secp256k1_fe_normalize_weak(&s1);
+    secp256k1_fe_mul(&s2, &b->y, &z12); secp256k1_fe_mul(&s2, &s2, &a->z);
+    secp256k1_fe_negate(&h, &u1, 1); secp256k1_fe_add(&h, &u2);
+    secp256k1_fe_negate(&i, &s1, 1); secp256k1_fe_add(&i, &s2);
    if (secp256k1_fe_normalizes_to_zero_var(&h)) {
        if (secp256k1_fe_normalizes_to_zero_var(&i)) {
            secp256k1_gej_double_var(r, a);
@ -304,11 +309,11 @@ static void secp256k1_gej_add_ge_var(secp256k1_gej_t *r, const secp256k1_gej_t *
        }
        return;
    }
-    secp256k1_fe_t i2; secp256k1_fe_sqr(&i2, &i);
-    secp256k1_fe_t h2; secp256k1_fe_sqr(&h2, &h);
-    secp256k1_fe_t h3; secp256k1_fe_mul(&h3, &h, &h2);
+    secp256k1_fe_sqr(&i2, &i);
+    secp256k1_fe_sqr(&h2, &h);
+    secp256k1_fe_mul(&h3, &h, &h2);
    r->z = a->z; secp256k1_fe_mul(&r->z, &r->z, &h);
-    secp256k1_fe_t t; secp256k1_fe_mul(&t, &u1, &h2);
+    secp256k1_fe_mul(&t, &u1, &h2);
    r->x = t; secp256k1_fe_mul_int(&r->x, 2); secp256k1_fe_add(&r->x, &h3); secp256k1_fe_negate(&r->x, &r->x, 3); secp256k1_fe_add(&r->x, &i2);
    secp256k1_fe_negate(&r->y, &r->x, 5); secp256k1_fe_add(&r->y, &t); secp256k1_fe_mul(&r->y, &r->y, &i);
    secp256k1_fe_mul(&h3, &h3, &s1); secp256k1_fe_negate(&h3, &h3, 1);
@ -316,6 +321,9 @@ static void secp256k1_gej_add_ge_var(secp256k1_gej_t *r, const secp256k1_gej_t *
 }

 static void secp256k1_gej_add_ge(secp256k1_gej_t *r, const secp256k1_gej_t *a, const secp256k1_ge_t *b) {
+    /* Operations: 7 mul, 5 sqr, 5 normalize, 19 mul_int/add/negate */
+    secp256k1_fe_t zz, u1, u2, s1, s2, z, t, m, n, q, rr;
+    int infinity;
    VERIFY_CHECK(!b->infinity);
    VERIFY_CHECK(a->infinity == 0 || a->infinity == 1);

@ -341,24 +349,24 @@ static void secp256k1_gej_add_ge(secp256k1_gej_t *r, const secp256k1_gej_t *a, c
     *  (Note that the paper uses xi = Xi / Zi and yi = Yi / Zi instead.)
     */

-    secp256k1_fe_t zz; secp256k1_fe_sqr(&zz, &a->z);                /* z = Z1^2 */
-    secp256k1_fe_t u1 = a->x; secp256k1_fe_normalize_weak(&u1);     /* u1 = U1 = X1*Z2^2 (1) */
-    secp256k1_fe_t u2; secp256k1_fe_mul(&u2, &b->x, &zz);           /* u2 = U2 = X2*Z1^2 (1) */
-    secp256k1_fe_t s1 = a->y; secp256k1_fe_normalize_weak(&s1);     /* s1 = S1 = Y1*Z2^3 (1) */
-    secp256k1_fe_t s2; secp256k1_fe_mul(&s2, &b->y, &zz);           /* s2 = Y2*Z2^2 (1) */
-    secp256k1_fe_mul(&s2, &s2, &a->z);                              /* s2 = S2 = Y2*Z1^3 (1) */
-    secp256k1_fe_t z = a->z;                                        /* z = Z = Z1*Z2 (8) */
-    secp256k1_fe_t t = u1; secp256k1_fe_add(&t, &u2);               /* t = T = U1+U2 (2) */
-    secp256k1_fe_t m = s1; secp256k1_fe_add(&m, &s2);               /* m = M = S1+S2 (2) */
-    secp256k1_fe_t n; secp256k1_fe_sqr(&n, &m);                     /* n = M^2 (1) */
-    secp256k1_fe_t q; secp256k1_fe_mul(&q, &n, &t);                 /* q = Q = T*M^2 (1) */
-    secp256k1_fe_sqr(&n, &n);                                       /* n = M^4 (1) */
-    secp256k1_fe_t rr; secp256k1_fe_sqr(&rr, &t);                   /* rr = T^2 (1) */
+    secp256k1_fe_sqr(&zz, &a->z);                       /* zz = Z1^2 */
+    u1 = a->x; secp256k1_fe_normalize_weak(&u1);        /* u1 = U1 = X1*Z2^2 (1) */
+    secp256k1_fe_mul(&u2, &b->x, &zz);                  /* u2 = U2 = X2*Z1^2 (1) */
+    s1 = a->y; secp256k1_fe_normalize_weak(&s1);        /* s1 = S1 = Y1*Z2^3 (1) */
+    secp256k1_fe_mul(&s2, &b->y, &zz);                  /* s2 = Y2*Z1^2 (1) */
+    secp256k1_fe_mul(&s2, &s2, &a->z);                  /* s2 = S2 = Y2*Z1^3 (1) */
+    z = a->z;                                           /* z = Z = Z1*Z2 (8) */
+    t = u1; secp256k1_fe_add(&t, &u2);                  /* t = T = U1+U2 (2) */
+    m = s1; secp256k1_fe_add(&m, &s2);                  /* m = M = S1+S2 (2) */
+    secp256k1_fe_sqr(&n, &m);                           /* n = M^2 (1) */
+    secp256k1_fe_mul(&q, &n, &t);                       /* q = Q = T*M^2 (1) */
+    secp256k1_fe_sqr(&n, &n);                           /* n = M^4 (1) */
+    secp256k1_fe_sqr(&rr, &t);                          /* rr = T^2 (1) */
    secp256k1_fe_mul(&t, &u1, &u2); secp256k1_fe_negate(&t, &t, 1); /* t = -U1*U2 (2) */
    secp256k1_fe_add(&rr, &t);                                      /* rr = R = T^2-U1*U2 (3) */
    secp256k1_fe_sqr(&t, &rr);                                      /* t = R^2 (1) */
    secp256k1_fe_mul(&r->z, &m, &z);                                /* r->z = M*Z (1) */
-    int infinity = secp256k1_fe_normalizes_to_zero(&r->z) * (1 - a->infinity);
+    infinity = secp256k1_fe_normalizes_to_zero(&r->z) * (1 - a->infinity);
    secp256k1_fe_mul_int(&r->z, 2 * (1 - a->infinity)); /* r->z = Z3 = 2*M*Z (2) */
    r->x = t;                                           /* r->x = R^2 (1) */
    secp256k1_fe_negate(&q, &q, 1);                     /* q = -Q (2) */
@ -386,63 +394,37 @@ static void secp256k1_gej_add_ge(secp256k1_gej_t *r, const secp256k1_gej_t *a, c
    r->infinity = infinity;
 }

+static void secp256k1_ge_to_storage(secp256k1_ge_storage_t *r, const secp256k1_ge_t *a) {
+    secp256k1_fe_t x, y;
+    VERIFY_CHECK(!a->infinity);
+    x = a->x;
+    secp256k1_fe_normalize(&x);
+    y = a->y;
+    secp256k1_fe_normalize(&y);
+    secp256k1_fe_to_storage(&r->x, &x);
+    secp256k1_fe_to_storage(&r->y, &y);
+}

+static void secp256k1_ge_from_storage(secp256k1_ge_t *r, const secp256k1_ge_storage_t *a) {
+    secp256k1_fe_from_storage(&r->x, &a->x);
+    secp256k1_fe_from_storage(&r->y, &a->y);
+    r->infinity = 0;
+}

-static void secp256k1_gej_get_hex(char *r, int *rlen, const secp256k1_gej_t *a) {
-    secp256k1_gej_t c = *a;
-    secp256k1_ge_t t; secp256k1_ge_set_gej(&t, &c);
-    secp256k1_ge_get_hex(r, rlen, &t);
+static SECP256K1_INLINE void secp256k1_ge_storage_cmov(secp256k1_ge_storage_t *r, const secp256k1_ge_storage_t *a, int flag) {
+    secp256k1_fe_storage_cmov(&r->x, &a->x, flag);
+    secp256k1_fe_storage_cmov(&r->y, &a->y, flag);
 }

 #ifdef USE_ENDOMORPHISM
 static void secp256k1_gej_mul_lambda(secp256k1_gej_t *r, const secp256k1_gej_t *a) {
-    const secp256k1_fe_t *beta = &secp256k1_ge_consts->beta;
+    static const secp256k1_fe_t beta = SECP256K1_FE_CONST(
+        0x7ae96a2bul, 0x657c0710ul, 0x6e64479eul, 0xac3434e9ul,
+        0x9cf04975ul, 0x12f58995ul, 0xc1396c28ul, 0x719501eeul
+    );
    *r = *a;
-    secp256k1_fe_mul(&r->x, &r->x, beta);
+    secp256k1_fe_mul(&r->x, &r->x, &beta);
 }
 #endif

-static void secp256k1_ge_start(void) {
-    static const unsigned char secp256k1_ge_consts_g_x[] = {
-        0x79,0xBE,0x66,0x7E,0xF9,0xDC,0xBB,0xAC,
-        0x55,0xA0,0x62,0x95,0xCE,0x87,0x0B,0x07,
-        0x02,0x9B,0xFC,0xDB,0x2D,0xCE,0x28,0xD9,
-        0x59,0xF2,0x81,0x5B,0x16,0xF8,0x17,0x98
-    };
-    static const unsigned char secp256k1_ge_consts_g_y[] = {
-        0x48,0x3A,0xDA,0x77,0x26,0xA3,0xC4,0x65,
-        0x5D,0xA4,0xFB,0xFC,0x0E,0x11,0x08,0xA8,
-        0xFD,0x17,0xB4,0x48,0xA6,0x85,0x54,0x19,
-        0x9C,0x47,0xD0,0x8F,0xFB,0x10,0xD4,0xB8
-    };
-#ifdef USE_ENDOMORPHISM
-    /* properties of secp256k1's efficiently computable endomorphism */
-    static const unsigned char secp256k1_ge_consts_beta[] = {
-        0x7a,0xe9,0x6a,0x2b,0x65,0x7c,0x07,0x10,
-        0x6e,0x64,0x47,0x9e,0xac,0x34,0x34,0xe9,
-        0x9c,0xf0,0x49,0x75,0x12,0xf5,0x89,0x95,
-        0xc1,0x39,0x6c,0x28,0x71,0x95,0x01,0xee
-    };
-#endif
-    if (secp256k1_ge_consts == NULL) {
-        secp256k1_ge_consts_t *ret = (secp256k1_ge_consts_t*)checked_malloc(sizeof(secp256k1_ge_consts_t));
-#ifdef USE_ENDOMORPHISM
-        VERIFY_CHECK(secp256k1_fe_set_b32(&ret->beta, secp256k1_ge_consts_beta));
-#endif
-        secp256k1_fe_t g_x, g_y;
-        VERIFY_CHECK(secp256k1_fe_set_b32(&g_x, secp256k1_ge_consts_g_x));
-        VERIFY_CHECK(secp256k1_fe_set_b32(&g_y, secp256k1_ge_consts_g_y));
-        secp256k1_ge_set_xy(&ret->g, &g_x, &g_y);
-        secp256k1_ge_consts = ret;
-    }
-}
-
-static void secp256k1_ge_stop(void) {
-    if (secp256k1_ge_consts != NULL) {
-        secp256k1_ge_consts_t *c = (secp256k1_ge_consts_t*)secp256k1_ge_consts;
-        free((void*)c);
-        secp256k1_ge_consts = NULL;
-    }
-}
-
 #endif
--- a/src/hash.h
+++ b/src/hash.h
@ -12,7 +12,7 @@

 typedef struct {
    uint32_t s[32];
-    unsigned char buf[64];
+    uint32_t buf[16]; /* In big endian */
    size_t bytes;
 } secp256k1_sha256_t;

@ -34,7 +34,7 @@ typedef struct {
    int retry;
 } secp256k1_rfc6979_hmac_sha256_t;

-static void secp256k1_rfc6979_hmac_sha256_initialize(secp256k1_rfc6979_hmac_sha256_t *rng, const unsigned char *key, size_t keylen, const unsigned char *msg, size_t msglen);
+static void secp256k1_rfc6979_hmac_sha256_initialize(secp256k1_rfc6979_hmac_sha256_t *rng, const unsigned char *key, size_t keylen, const unsigned char *msg, size_t msglen, const unsigned char *rnd, size_t rndlen);
 static void secp256k1_rfc6979_hmac_sha256_generate(secp256k1_rfc6979_hmac_sha256_t *rng, unsigned char *out, size_t outlen);
 static void secp256k1_rfc6979_hmac_sha256_finalize(secp256k1_rfc6979_hmac_sha256_t *rng);

--- a/src/hash_impl.h
+++ b/src/hash_impl.h
@ -11,6 +11,7 @@

 #include <stdlib.h>
 #include <stdint.h>
+#include <string.h>

 #define Ch(x,y,z) ((z) ^ ((x) & ((y) ^ (z))))
 #define Maj(x,y,z) (((x) & (y)) | ((z) & ((x) | (y))))
@ -26,8 +27,11 @@
    (h) = t1 + t2; \
 } while(0)

-#define ReadBE32(p) (((uint32_t)((p)[0])) << 24 | ((uint32_t)((p)[1])) << 16 | ((uint32_t)((p)[2])) << 8 | ((uint32_t)((p)[3])))
-#define WriteBE32(p, v) do { (p)[0] = (v) >> 24; (p)[1] = (v) >> 16; (p)[2] = (v) >> 8; (p)[3] = (v); } while(0)
+#ifdef WORDS_BIGENDIAN
+#define BE32(x) (x)
+#else
+#define BE32(p) ((((p) & 0xFF) << 24) | (((p) & 0xFF00) << 8) | (((p) & 0xFF0000) >> 8) | (((p) & 0xFF000000) >> 24))
+#endif

 static void secp256k1_sha256_initialize(secp256k1_sha256_t *hash) {
    hash->s[0] = 0x6a09e667ul;
@ -41,27 +45,27 @@ static void secp256k1_sha256_initialize(secp256k1_sha256_t *hash) {
    hash->bytes = 0;
 }

-/** Perform one SHA-256 transformation, processing a 64-byte chunk. */
-static void secp256k1_sha256_transform(uint32_t* s, const unsigned char* chunk) {
+/** Perform one SHA-256 transformation, processing 16 big endian 32-bit words. */
+static void secp256k1_sha256_transform(uint32_t* s, const uint32_t* chunk) {
    uint32_t a = s[0], b = s[1], c = s[2], d = s[3], e = s[4], f = s[5], g = s[6], h = s[7];
    uint32_t w0, w1, w2, w3, w4, w5, w6, w7, w8, w9, w10, w11, w12, w13, w14, w15;

-    Round(a, b, c, d, e, f, g, h, 0x428a2f98, w0 = ReadBE32(chunk + 0));
-    Round(h, a, b, c, d, e, f, g, 0x71374491, w1 = ReadBE32(chunk + 4));
-    Round(g, h, a, b, c, d, e, f, 0xb5c0fbcf, w2 = ReadBE32(chunk + 8));
-    Round(f, g, h, a, b, c, d, e, 0xe9b5dba5, w3 = ReadBE32(chunk + 12));
-    Round(e, f, g, h, a, b, c, d, 0x3956c25b, w4 = ReadBE32(chunk + 16));
-    Round(d, e, f, g, h, a, b, c, 0x59f111f1, w5 = ReadBE32(chunk + 20));
-    Round(c, d, e, f, g, h, a, b, 0x923f82a4, w6 = ReadBE32(chunk + 24));
-    Round(b, c, d, e, f, g, h, a, 0xab1c5ed5, w7 = ReadBE32(chunk + 28));
-    Round(a, b, c, d, e, f, g, h, 0xd807aa98, w8 = ReadBE32(chunk + 32));
-    Round(h, a, b, c, d, e, f, g, 0x12835b01, w9 = ReadBE32(chunk + 36));
-    Round(g, h, a, b, c, d, e, f, 0x243185be, w10 = ReadBE32(chunk + 40));
-    Round(f, g, h, a, b, c, d, e, 0x550c7dc3, w11 = ReadBE32(chunk + 44));
-    Round(e, f, g, h, a, b, c, d, 0x72be5d74, w12 = ReadBE32(chunk + 48));
-    Round(d, e, f, g, h, a, b, c, 0x80deb1fe, w13 = ReadBE32(chunk + 52));
-    Round(c, d, e, f, g, h, a, b, 0x9bdc06a7, w14 = ReadBE32(chunk + 56));
-    Round(b, c, d, e, f, g, h, a, 0xc19bf174, w15 = ReadBE32(chunk + 60));
+    Round(a, b, c, d, e, f, g, h, 0x428a2f98, w0 = BE32(chunk[0]));
+    Round(h, a, b, c, d, e, f, g, 0x71374491, w1 = BE32(chunk[1]));
+    Round(g, h, a, b, c, d, e, f, 0xb5c0fbcf, w2 = BE32(chunk[2]));
+    Round(f, g, h, a, b, c, d, e, 0xe9b5dba5, w3 = BE32(chunk[3]));
+    Round(e, f, g, h, a, b, c, d, 0x3956c25b, w4 = BE32(chunk[4]));
+    Round(d, e, f, g, h, a, b, c, 0x59f111f1, w5 = BE32(chunk[5]));
+    Round(c, d, e, f, g, h, a, b, 0x923f82a4, w6 = BE32(chunk[6]));
+    Round(b, c, d, e, f, g, h, a, 0xab1c5ed5, w7 = BE32(chunk[7]));
+    Round(a, b, c, d, e, f, g, h, 0xd807aa98, w8 = BE32(chunk[8]));
+    Round(h, a, b, c, d, e, f, g, 0x12835b01, w9 = BE32(chunk[9]));
+    Round(g, h, a, b, c, d, e, f, 0x243185be, w10 = BE32(chunk[10]));
+    Round(f, g, h, a, b, c, d, e, 0x550c7dc3, w11 = BE32(chunk[11]));
+    Round(e, f, g, h, a, b, c, d, 0x72be5d74, w12 = BE32(chunk[12]));
+    Round(d, e, f, g, h, a, b, c, 0x80deb1fe, w13 = BE32(chunk[13]));
+    Round(c, d, e, f, g, h, a, b, 0x9bdc06a7, w14 = BE32(chunk[14]));
+    Round(b, c, d, e, f, g, h, a, 0xc19bf174, w15 = BE32(chunk[15]));

    Round(a, b, c, d, e, f, g, h, 0xe49b69c1, w0 += sigma1(w14) + w9 + sigma0(w1));
    Round(h, a, b, c, d, e, f, g, 0xefbe4786, w1 += sigma1(w15) + w10 + sigma0(w2));
@ -125,55 +129,40 @@ static void secp256k1_sha256_transform(uint32_t* s, const unsigned char* chunk)
 }

 static void secp256k1_sha256_write(secp256k1_sha256_t *hash, const unsigned char *data, size_t len) {
-    const unsigned char* end = data + len;
-    size_t bufsize = hash->bytes % 64;
-    if (bufsize && bufsize + len >= 64) {
-        // Fill the buffer, and process it.
-        memcpy(hash->buf + bufsize, data, 64 - bufsize);
-        hash->bytes += 64 - bufsize;
+    size_t bufsize = hash->bytes & 0x3F;
+    hash->bytes += len;
+    while (bufsize + len >= 64) {
+        /* Fill the buffer, and process it. */
+        memcpy(((unsigned char*)hash->buf) + bufsize, data, 64 - bufsize);
        data += 64 - bufsize;
+        len -= 64 - bufsize;
        secp256k1_sha256_transform(hash->s, hash->buf);
        bufsize = 0;
    }
-    while (end >= data + 64) {
-        // Process full chunks directly from the source.
-        secp256k1_sha256_transform(hash->s, data);
-        hash->bytes += 64;
-        data += 64;
-    }
-    if (end > data) {
-        // Fill the buffer with what remains.
-        memcpy(hash->buf + bufsize, data, end - data);
-        hash->bytes += end - data;
+    if (len) {
+        /* Fill the buffer with what remains. */
+        memcpy(((unsigned char*)hash->buf) + bufsize, data, len);
    }
 }

 static void secp256k1_sha256_finalize(secp256k1_sha256_t *hash, unsigned char *out32) {
    static const unsigned char pad[64] = {0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
-    unsigned char sizedesc[8];
-    WriteBE32(sizedesc, hash->bytes >> 29);
-    WriteBE32(sizedesc + 4, hash->bytes << 3);
+    uint32_t sizedesc[2];
+    uint32_t out[8];
+    int i = 0;
+    sizedesc[0] = BE32(hash->bytes >> 29);
+    sizedesc[1] = BE32(hash->bytes << 3);
    secp256k1_sha256_write(hash, pad, 1 + ((119 - (hash->bytes % 64)) % 64));
-    secp256k1_sha256_write(hash, sizedesc, 8);
-    WriteBE32(out32, hash->s[0]);
-    hash->s[0] = 0;
-    WriteBE32(out32 + 4, hash->s[1]);
-    hash->s[1] = 0;
-    WriteBE32(out32 + 8, hash->s[2]);
-    hash->s[2] = 0;
-    WriteBE32(out32 + 12, hash->s[3]);
-    hash->s[3] = 0;
-    WriteBE32(out32 + 16, hash->s[4]);
-    hash->s[4] = 0;
-    WriteBE32(out32 + 20, hash->s[5]);
-    hash->s[5] = 0;
-    WriteBE32(out32 + 24, hash->s[6]);
-    hash->s[6] = 0;
-    WriteBE32(out32 + 28, hash->s[7]);
-    hash->s[7] = 0;
+    secp256k1_sha256_write(hash, (const unsigned char*)sizedesc, 8);
+    for (i = 0; i < 8; i++) {
+        out[i] = BE32(hash->s[i]);
+        hash->s[i] = 0;
+    }
+    memcpy(out32, (const unsigned char*)out, 32);
 }

 static void secp256k1_hmac_sha256_initialize(secp256k1_hmac_sha256_t *hash, const unsigned char *key, size_t keylen) {
+    int n;
    unsigned char rkey[64];
    if (keylen <= 64) {
        memcpy(rkey, key, keylen);
@ -187,12 +176,12 @@ static void secp256k1_hmac_sha256_initialize(secp256k1_hmac_sha256_t *hash, cons
    }

    secp256k1_sha256_initialize(&hash->outer);
-    for (int n = 0; n < 64; n++)
+    for (n = 0; n < 64; n++)
        rkey[n] ^= 0x5c;
    secp256k1_sha256_write(&hash->outer, rkey, 64);

    secp256k1_sha256_initialize(&hash->inner);
-    for (int n = 0; n < 64; n++)
+    for (n = 0; n < 64; n++)
        rkey[n] ^= 0x5c ^ 0x36;
    secp256k1_sha256_write(&hash->inner, rkey, 64);
    memset(rkey, 0, 64);
@ -211,19 +200,22 @@ static void secp256k1_hmac_sha256_finalize(secp256k1_hmac_sha256_t *hash, unsign
 }


-static void secp256k1_rfc6979_hmac_sha256_initialize(secp256k1_rfc6979_hmac_sha256_t *rng, const unsigned char *key, size_t keylen, const unsigned char *msg, size_t msglen) {
+static void secp256k1_rfc6979_hmac_sha256_initialize(secp256k1_rfc6979_hmac_sha256_t *rng, const unsigned char *key, size_t keylen, const unsigned char *msg, size_t msglen, const unsigned char *rnd, size_t rndlen) {
+    secp256k1_hmac_sha256_t hmac;
    static const unsigned char zero[1] = {0x00};
    static const unsigned char one[1] = {0x01};

    memset(rng->v, 0x01, 32);
    memset(rng->k, 0x00, 32);

-    secp256k1_hmac_sha256_t hmac;
    secp256k1_hmac_sha256_initialize(&hmac, rng->k, 32);
    secp256k1_hmac_sha256_write(&hmac, rng->v, 32);
    secp256k1_hmac_sha256_write(&hmac, zero, 1);
    secp256k1_hmac_sha256_write(&hmac, key, keylen);
    secp256k1_hmac_sha256_write(&hmac, msg, msglen);
+    if (rnd && rndlen) {
+        secp256k1_hmac_sha256_write(&hmac, rnd, rndlen);
+    }
    secp256k1_hmac_sha256_finalize(&hmac, rng->k);
    secp256k1_hmac_sha256_initialize(&hmac, rng->k, 32);
    secp256k1_hmac_sha256_write(&hmac, rng->v, 32);
@ -234,6 +226,9 @@ static void secp256k1_rfc6979_hmac_sha256_initialize(secp256k1_rfc6979_hmac_sha2
    secp256k1_hmac_sha256_write(&hmac, one, 1);
    secp256k1_hmac_sha256_write(&hmac, key, keylen);
    secp256k1_hmac_sha256_write(&hmac, msg, msglen);
+    if (rnd && rndlen) {
+        secp256k1_hmac_sha256_write(&hmac, rnd, rndlen);
+    }
    secp256k1_hmac_sha256_finalize(&hmac, rng->k);
    secp256k1_hmac_sha256_initialize(&hmac, rng->k, 32);
    secp256k1_hmac_sha256_write(&hmac, rng->v, 32);
@ -256,10 +251,10 @@ static void secp256k1_rfc6979_hmac_sha256_generate(secp256k1_rfc6979_hmac_sha256

    while (outlen > 0) {
        secp256k1_hmac_sha256_t hmac;
+        int now = outlen;
        secp256k1_hmac_sha256_initialize(&hmac, rng->k, 32);
        secp256k1_hmac_sha256_write(&hmac, rng->v, 32);
        secp256k1_hmac_sha256_finalize(&hmac, rng->v);
-        int now = outlen;
        if (now > 32) {
            now = 32;
        }
--- a/src/num_gmp_impl.h
+++ b/src/num_gmp_impl.h
@ -29,10 +29,10 @@ static void secp256k1_num_copy(secp256k1_num_t *r, const secp256k1_num_t *a) {
 static void secp256k1_num_get_bin(unsigned char *r, unsigned int rlen, const secp256k1_num_t *a) {
    unsigned char tmp[65];
    int len = 0;
+    int shift = 0;
    if (a->limbs>1 || a->data[0] != 0) {
        len = mpn_get_str(tmp, 256, (mp_limb_t*)a->data, a->limbs);
    }
-    int shift = 0;
    while (shift < len && tmp[shift] == 0) shift++;
    VERIFY_CHECK(len-shift <= (int)rlen);
    memset(r, 0, rlen - len + shift);
@ -43,9 +43,10 @@ static void secp256k1_num_get_bin(unsigned char *r, unsigned int rlen, const sec
 }

 static void secp256k1_num_set_bin(secp256k1_num_t *r, const unsigned char *a, unsigned int alen) {
+    int len;
    VERIFY_CHECK(alen > 0);
    VERIFY_CHECK(alen <= 64);
-    int len = mpn_set_str(r->data, a, alen, 256);
+    len = mpn_set_str(r->data, a, alen, 256);
    if (len == 0) {
        r->data[0] = 0;
        len = 1;
@ -91,6 +92,12 @@ static void secp256k1_num_mod(secp256k1_num_t *r, const secp256k1_num_t *m) {
 }

 static void secp256k1_num_mod_inverse(secp256k1_num_t *r, const secp256k1_num_t *a, const secp256k1_num_t *m) {
+    int i;
+    mp_limb_t g[NUM_LIMBS+1];
+    mp_limb_t u[NUM_LIMBS+1];
+    mp_limb_t v[NUM_LIMBS+1];
+    mp_size_t sn;
+    mp_size_t gn;
    secp256k1_num_sanity(a);
    secp256k1_num_sanity(m);

@ -106,15 +113,12 @@ static void secp256k1_num_mod_inverse(secp256k1_num_t *r, const secp256k1_num_t
     */
    VERIFY_CHECK(m->limbs <= NUM_LIMBS);
    VERIFY_CHECK(m->data[m->limbs-1] != 0);
-    mp_limb_t g[NUM_LIMBS+1];
-    mp_limb_t u[NUM_LIMBS+1];
-    mp_limb_t v[NUM_LIMBS+1];
-    for (int i=0; i < m->limbs; i++) {
+    for (i = 0; i < m->limbs; i++) {
        u[i] = (i < a->limbs) ? a->data[i] : 0;
        v[i] = m->data[i];
    }
-    mp_size_t sn = NUM_LIMBS+1;
-    mp_size_t gn = mpn_gcdext(g, r->data, &sn, u, m->limbs, v, m->limbs);
+    sn = NUM_LIMBS+1;
+    gn = mpn_gcdext(g, r->data, &sn, u, m->limbs, v, m->limbs);
    VERIFY_CHECK(gn == 1);
    VERIFY_CHECK(g[0] == 1);
    r->neg = a->neg ^ m->neg;
@ -183,10 +187,10 @@ static void secp256k1_num_sub(secp256k1_num_t *r, const secp256k1_num_t *a, cons
 }

 static void secp256k1_num_mul(secp256k1_num_t *r, const secp256k1_num_t *a, const secp256k1_num_t *b) {
+    mp_limb_t tmp[2*NUM_LIMBS+1];
    secp256k1_num_sanity(a);
    secp256k1_num_sanity(b);

-    mp_limb_t tmp[2*NUM_LIMBS+1];
    VERIFY_CHECK(a->limbs + b->limbs <= 2*NUM_LIMBS+1);
    if ((a->limbs==1 && a->data[0]==0) || (b->limbs==1 && b->data[0]==0)) {
        r->limbs = 1;
@ -207,13 +211,14 @@ static void secp256k1_num_mul(secp256k1_num_t *r, const secp256k1_num_t *a, cons
 }

 static void secp256k1_num_shift(secp256k1_num_t *r, int bits) {
+    int i;
    if (bits % GMP_NUMB_BITS) {
-        // Shift within limbs.
+        /* Shift within limbs. */
        mpn_rshift(r->data, r->data, r->limbs, bits % GMP_NUMB_BITS);
    }
    if (bits >= GMP_NUMB_BITS) {
-        // Shift full limbs.
-        for (int i = 0; i < r->limbs; i++) {
+        /* Shift full limbs. */
+        for (i = 0; i < r->limbs; i++) {
            int index = i + (bits / GMP_NUMB_BITS);
            if (index < r->limbs && index < 2*NUM_LIMBS) {
                r->data[i] = r->data[index];
--- a/src/scalar.h
+++ b/src/scalar.h
@ -21,9 +21,6 @@
 #error "Please select scalar implementation"
 #endif

-static void secp256k1_scalar_start(void);
-static void secp256k1_scalar_stop(void);
-
 /** Clear a scalar to prevent the leak of sensitive data. */
 static void secp256k1_scalar_clear(secp256k1_scalar_t *r);

@ -83,9 +80,9 @@ static void secp256k1_scalar_order_get_num(secp256k1_num_t *r);
 /** Compare two scalars. */
 static int secp256k1_scalar_eq(const secp256k1_scalar_t *a, const secp256k1_scalar_t *b);

-static void secp256k1_scalar_split_128(secp256k1_scalar_t *r1, secp256k1_scalar_t *r2, const secp256k1_scalar_t *a);
-
 #ifdef USE_ENDOMORPHISM
+/** Find r1 and r2 such that r1+r2*2^128 = a. */
+static void secp256k1_scalar_split_128(secp256k1_scalar_t *r1, secp256k1_scalar_t *r2, const secp256k1_scalar_t *a);
 /** Find r1 and r2 such that r1+r2*lambda = a, and r1 and r2 are maximum 128 bits long (see secp256k1_gej_mul_lambda). */
 static void secp256k1_scalar_split_lambda_var(secp256k1_scalar_t *r1, secp256k1_scalar_t *r2, const secp256k1_scalar_t *a);
 #endif
--- a/src/scalar_4x64.h
+++ b/src/scalar_4x64.h
@ -14,4 +14,6 @@ typedef struct {
    uint64_t d[4];
 } secp256k1_scalar_t;

+#define SECP256K1_SCALAR_CONST(d7, d6, d5, d4, d3, d2, d1, d0) {{((uint64_t)(d1)) << 32 | (d0), ((uint64_t)(d3)) << 32 | (d2), ((uint64_t)(d5)) << 32 | (d4), ((uint64_t)(d7)) << 32 | (d6)}}
+
 #endif
--- a/src/scalar_4x64_impl.h
+++ b/src/scalar_4x64_impl.h
@ -7,8 +7,6 @@
 #ifndef _SECP256K1_SCALAR_REPR_IMPL_H_
 #define _SECP256K1_SCALAR_REPR_IMPL_H_

-typedef unsigned __int128 uint128_t;
-
 /* Limbs of the secp256k1 order. */
 #define SECP256K1_N_0 ((uint64_t)0xBFD25E8CD0364141ULL)
 #define SECP256K1_N_1 ((uint64_t)0xBAAEDCE6AF48A03BULL)
@ -69,8 +67,9 @@ SECP256K1_INLINE static int secp256k1_scalar_check_overflow(const secp256k1_scal
 }

 SECP256K1_INLINE static int secp256k1_scalar_reduce(secp256k1_scalar_t *r, unsigned int overflow) {
+    uint128_t t;
    VERIFY_CHECK(overflow <= 1);
-    uint128_t t = (uint128_t)r->d[0] + overflow * SECP256K1_N_C_0;
+    t = (uint128_t)r->d[0] + overflow * SECP256K1_N_C_0;
    r->d[0] = t & 0xFFFFFFFFFFFFFFFFULL; t >>= 64;
    t += (uint128_t)r->d[1] + overflow * SECP256K1_N_C_1;
    r->d[1] = t & 0xFFFFFFFFFFFFFFFFULL; t >>= 64;
@ -82,6 +81,7 @@ SECP256K1_INLINE static int secp256k1_scalar_reduce(secp256k1_scalar_t *r, unsig
 }

 static int secp256k1_scalar_add(secp256k1_scalar_t *r, const secp256k1_scalar_t *a, const secp256k1_scalar_t *b) {
+    int overflow;
    uint128_t t = (uint128_t)a->d[0] + b->d[0];
    r->d[0] = t & 0xFFFFFFFFFFFFFFFFULL; t >>= 64;
    t += (uint128_t)a->d[1] + b->d[1];
@ -90,15 +90,16 @@ static int secp256k1_scalar_add(secp256k1_scalar_t *r, const secp256k1_scalar_t
    r->d[2] = t & 0xFFFFFFFFFFFFFFFFULL; t >>= 64;
    t += (uint128_t)a->d[3] + b->d[3];
    r->d[3] = t & 0xFFFFFFFFFFFFFFFFULL; t >>= 64;
-    int overflow = t + secp256k1_scalar_check_overflow(r);
+    overflow = t + secp256k1_scalar_check_overflow(r);
    VERIFY_CHECK(overflow == 0 || overflow == 1);
    secp256k1_scalar_reduce(r, overflow);
    return overflow;
 }

 static void secp256k1_scalar_add_bit(secp256k1_scalar_t *r, unsigned int bit) {
+    uint128_t t;
    VERIFY_CHECK(bit < 256);
-    uint128_t t = (uint128_t)r->d[0] + (((uint64_t)((bit >> 6) == 0)) << (bit & 0x3F));
+    t = (uint128_t)r->d[0] + (((uint64_t)((bit >> 6) == 0)) << (bit & 0x3F));
    r->d[0] = t & 0xFFFFFFFFFFFFFFFFULL; t >>= 64;
    t += (uint128_t)r->d[1] + (((uint64_t)((bit >> 6) == 1)) << (bit & 0x3F));
    r->d[1] = t & 0xFFFFFFFFFFFFFFFFULL; t >>= 64;
@ -113,11 +114,12 @@ static void secp256k1_scalar_add_bit(secp256k1_scalar_t *r, unsigned int bit) {
 }

 static void secp256k1_scalar_set_b32(secp256k1_scalar_t *r, const unsigned char *b32, int *overflow) {
+    int over;
    r->d[0] = (uint64_t)b32[31] | (uint64_t)b32[30] << 8 | (uint64_t)b32[29] << 16 | (uint64_t)b32[28] << 24 | (uint64_t)b32[27] << 32 | (uint64_t)b32[26] << 40 | (uint64_t)b32[25] << 48 | (uint64_t)b32[24] << 56;
    r->d[1] = (uint64_t)b32[23] | (uint64_t)b32[22] << 8 | (uint64_t)b32[21] << 16 | (uint64_t)b32[20] << 24 | (uint64_t)b32[19] << 32 | (uint64_t)b32[18] << 40 | (uint64_t)b32[17] << 48 | (uint64_t)b32[16] << 56;
    r->d[2] = (uint64_t)b32[15] | (uint64_t)b32[14] << 8 | (uint64_t)b32[13] << 16 | (uint64_t)b32[12] << 24 | (uint64_t)b32[11] << 32 | (uint64_t)b32[10] << 40 | (uint64_t)b32[9] << 48 | (uint64_t)b32[8] << 56;
    r->d[3] = (uint64_t)b32[7] | (uint64_t)b32[6] << 8 | (uint64_t)b32[5] << 16 | (uint64_t)b32[4] << 24 | (uint64_t)b32[3] << 32 | (uint64_t)b32[2] << 40 | (uint64_t)b32[1] << 48 | (uint64_t)b32[0] << 56;
-    int over = secp256k1_scalar_reduce(r, secp256k1_scalar_check_overflow(r));
+    over = secp256k1_scalar_reduce(r, secp256k1_scalar_check_overflow(r));
    if (overflow) {
        *overflow = over;
    }
@ -195,16 +197,16 @@ static int secp256k1_scalar_is_high(const secp256k1_scalar_t *a) {

 /** Add 2*a*b to the number defined by (c0,c1,c2). c2 must never overflow. */
 #define muladd2(a,b) { \
-    uint64_t tl, th; \
+    uint64_t tl, th, th2, tl2; \
    { \
        uint128_t t = (uint128_t)a * b; \
        th = t >> 64;               /* at most 0xFFFFFFFFFFFFFFFE */ \
        tl = t; \
    } \
-    uint64_t th2 = th + th;         /* at most 0xFFFFFFFFFFFFFFFE (in case th was 0x7FFFFFFFFFFFFFFF) */ \
+    th2 = th + th;                  /* at most 0xFFFFFFFFFFFFFFFE (in case th was 0x7FFFFFFFFFFFFFFF) */ \
    c2 += (th2 < th) ? 1 : 0;       /* never overflows by contract (verified the next line) */ \
    VERIFY_CHECK((th2 >= th) || (c2 != 0)); \
-    uint64_t tl2 = tl + tl;         /* at most 0xFFFFFFFFFFFFFFFE (in case the lowest 63 bits of tl were 0x7FFFFFFFFFFFFFFF) */ \
+    tl2 = tl + tl;                  /* at most 0xFFFFFFFFFFFFFFFE (in case the lowest 63 bits of tl were 0x7FFFFFFFFFFFFFFF) */ \
    th2 += (tl2 < tl) ? 1 : 0;      /* at most 0xFFFFFFFFFFFFFFFF */ \
    c0 += tl2;                      /* overflow is handled on the next line */ \
    th2 += (c0 < tl2) ? 1 : 0;      /* second overflow is handled on the next line */ \
@ -217,8 +219,9 @@ static int secp256k1_scalar_is_high(const secp256k1_scalar_t *a) {

 /** Add a to the number defined by (c0,c1,c2). c2 must never overflow. */
 #define sumadd(a) { \
+    unsigned int over; \
    c0 += (a);                  /* overflow is handled on the next line */ \
-    unsigned int over = (c0 < (a)) ? 1 : 0; \
+    over = (c0 < (a)) ? 1 : 0; \
    c1 += over;                 /* overflow is handled on the next line */ \
    c2 += (c1 < over) ? 1 : 0;  /* never overflows by contract */ \
 }
@ -248,63 +251,301 @@ static int secp256k1_scalar_is_high(const secp256k1_scalar_t *a) {
 }

 static void secp256k1_scalar_reduce_512(secp256k1_scalar_t *r, const uint64_t *l) {
-    uint64_t n0 = l[4], n1 = l[5], n2 = l[6], n3 = l[7];
+#ifdef USE_ASM_X86_64
+    /* Reduce 512 bits into 385. */
+    uint64_t m0, m1, m2, m3, m4, m5, m6;
+    uint64_t p0, p1, p2, p3, p4;
+    uint64_t c;
+
+    __asm__ __volatile__(
+    /* Preload. */
+    "movq 32(%%rsi), %%r11\n"
+    "movq 40(%%rsi), %%r12\n"
+    "movq 48(%%rsi), %%r13\n"
+    "movq 56(%%rsi), %%r14\n"
+    /* Initialize r8,r9,r10 */
+    "movq 0(%%rsi), %%r8\n"
+    "movq $0, %%r9\n"
+    "movq $0, %%r10\n"
+    /* (r8,r9) += n0 * c0 */
+    "movq %8, %%rax\n"
+    "mulq %%r11\n"
+    "addq %%rax, %%r8\n"
+    "adcq %%rdx, %%r9\n"
+    /* extract m0 */
+    "movq %%r8, %q0\n"
+    "movq $0, %%r8\n"
+    /* (r9,r10) += l1 */
+    "addq 8(%%rsi), %%r9\n"
+    "adcq $0, %%r10\n"
+    /* (r9,r10,r8) += n1 * c0 */
+    "movq %8, %%rax\n"
+    "mulq %%r12\n"
+    "addq %%rax, %%r9\n"
+    "adcq %%rdx, %%r10\n"
+    "adcq $0, %%r8\n"
+    /* (r9,r10,r8) += n0 * c1 */
+    "movq %9, %%rax\n"
+    "mulq %%r11\n"
+    "addq %%rax, %%r9\n"
+    "adcq %%rdx, %%r10\n"
+    "adcq $0, %%r8\n"
+    /* extract m1 */
+    "movq %%r9, %q1\n"
+    "movq $0, %%r9\n"
+    /* (r10,r8,r9) += l2 */
+    "addq 16(%%rsi), %%r10\n"
+    "adcq $0, %%r8\n"
+    "adcq $0, %%r9\n"
+    /* (r10,r8,r9) += n2 * c0 */
+    "movq %8, %%rax\n"
+    "mulq %%r13\n"
+    "addq %%rax, %%r10\n"
+    "adcq %%rdx, %%r8\n"
+    "adcq $0, %%r9\n"
+    /* (r10,r8,r9) += n1 * c1 */
+    "movq %9, %%rax\n"
+    "mulq %%r12\n"
+    "addq %%rax, %%r10\n"
+    "adcq %%rdx, %%r8\n"
+    "adcq $0, %%r9\n"
+    /* (r10,r8,r9) += n0 */
+    "addq %%r11, %%r10\n"
+    "adcq $0, %%r8\n"
+    "adcq $0, %%r9\n"
+    /* extract m2 */
+    "movq %%r10, %q2\n"
+    "movq $0, %%r10\n"
+    /* (r8,r9,r10) += l3 */
+    "addq 24(%%rsi), %%r8\n"
+    "adcq $0, %%r9\n"
+    "adcq $0, %%r10\n"
+    /* (r8,r9,r10) += n3 * c0 */
+    "movq %8, %%rax\n"
+    "mulq %%r14\n"
+    "addq %%rax, %%r8\n"
+    "adcq %%rdx, %%r9\n"
+    "adcq $0, %%r10\n"
+    /* (r8,r9,r10) += n2 * c1 */
+    "movq %9, %%rax\n"
+    "mulq %%r13\n"
+    "addq %%rax, %%r8\n"
+    "adcq %%rdx, %%r9\n"
+    "adcq $0, %%r10\n"
+    /* (r8,r9,r10) += n1 */
+    "addq %%r12, %%r8\n"
+    "adcq $0, %%r9\n"
+    "adcq $0, %%r10\n"
+    /* extract m3 */
+    "movq %%r8, %q3\n"
+    "movq $0, %%r8\n"
+    /* (r9,r10,r8) += n3 * c1 */
+    "movq %9, %%rax\n"
+    "mulq %%r14\n"
+    "addq %%rax, %%r9\n"
+    "adcq %%rdx, %%r10\n"
+    "adcq $0, %%r8\n"
+    /* (r9,r10,r8) += n2 */
+    "addq %%r13, %%r9\n"
+    "adcq $0, %%r10\n"
+    "adcq $0, %%r8\n"
+    /* extract m4 */
+    "movq %%r9, %q4\n"
+    /* (r10,r8) += n3 */
+    "addq %%r14, %%r10\n"
+    "adcq $0, %%r8\n"
+    /* extract m5 */
+    "movq %%r10, %q5\n"
+    /* extract m6 */
+    "movq %%r8, %q6\n"
+    : "=&g"(m0), "=&g"(m1), "=&g"(m2), "=g"(m3), "=g"(m4), "=g"(m5), "=g"(m6) /* m0..m2 are written while l (%rsi) is still being read, so they must be early-clobber */
+    : "S"(l), "n"(SECP256K1_N_C_0), "n"(SECP256K1_N_C_1)
+    : "rax", "rdx", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "cc");

-    /* 160 bit accumulator. */
-    uint64_t c0, c1;
-    uint32_t c2;
+    /* Reduce 385 bits into 258. */
+    __asm__ __volatile__(
+    /* Preload */
+    "movq %q9, %%r11\n"
+    "movq %q10, %%r12\n"
+    "movq %q11, %%r13\n"
+    /* Initialize (r8,r9,r10) */
+    "movq %q5, %%r8\n"
+    "movq $0, %%r9\n"
+    "movq $0, %%r10\n"
+    /* (r8,r9) += m4 * c0 */
+    "movq %12, %%rax\n"
+    "mulq %%r11\n"
+    "addq %%rax, %%r8\n"
+    "adcq %%rdx, %%r9\n"
+    /* extract p0 */
+    "movq %%r8, %q0\n"
+    "movq $0, %%r8\n"
+    /* (r9,r10) += m1 */
+    "addq %q6, %%r9\n"
+    "adcq $0, %%r10\n"
+    /* (r9,r10,r8) += m5 * c0 */
+    "movq %12, %%rax\n"
+    "mulq %%r12\n"
+    "addq %%rax, %%r9\n"
+    "adcq %%rdx, %%r10\n"
+    "adcq $0, %%r8\n"
+    /* (r9,r10,r8) += m4 * c1 */
+    "movq %13, %%rax\n"
+    "mulq %%r11\n"
+    "addq %%rax, %%r9\n"
+    "adcq %%rdx, %%r10\n"
+    "adcq $0, %%r8\n"
+    /* extract p1 */
+    "movq %%r9, %q1\n"
+    "movq $0, %%r9\n"
+    /* (r10,r8,r9) += m2 */
+    "addq %q7, %%r10\n"
+    "adcq $0, %%r8\n"
+    "adcq $0, %%r9\n"
+    /* (r10,r8,r9) += m6 * c0 */
+    "movq %12, %%rax\n"
+    "mulq %%r13\n"
+    "addq %%rax, %%r10\n"
+    "adcq %%rdx, %%r8\n"
+    "adcq $0, %%r9\n"
+    /* (r10,r8,r9) += m5 * c1 */
+    "movq %13, %%rax\n"
+    "mulq %%r12\n"
+    "addq %%rax, %%r10\n"
+    "adcq %%rdx, %%r8\n"
+    "adcq $0, %%r9\n"
+    /* (r10,r8,r9) += m4 */
+    "addq %%r11, %%r10\n"
+    "adcq $0, %%r8\n"
+    "adcq $0, %%r9\n"
+    /* extract p2 */
+    "movq %%r10, %q2\n"
+    /* (r8,r9) += m3 */
+    "addq %q8, %%r8\n"
+    "adcq $0, %%r9\n"
+    /* (r8,r9) += m6 * c1 */
+    "movq %13, %%rax\n"
+    "mulq %%r13\n"
+    "addq %%rax, %%r8\n"
+    "adcq %%rdx, %%r9\n"
+    /* (r8,r9) += m5 */
+    "addq %%r12, %%r8\n"
+    "adcq $0, %%r9\n"
+    /* extract p3 */
+    "movq %%r8, %q3\n"
+    /* (r9) += m6 */
+    "addq %%r13, %%r9\n"
+    /* extract p4 */
+    "movq %%r9, %q4\n"
+    : "=&g"(p0), "=&g"(p1), "=&g"(p2), "=g"(p3), "=g"(p4)
+    : "g"(m0), "g"(m1), "g"(m2), "g"(m3), "g"(m4), "g"(m5), "g"(m6), "n"(SECP256K1_N_C_0), "n"(SECP256K1_N_C_1)
+    : "rax", "rdx", "r8", "r9", "r10", "r11", "r12", "r13", "cc");
+
+    /* Reduce 258 bits into 256. */
+    __asm__ __volatile__(
+    /* Preload */
+    "movq %q5, %%r10\n"
+    /* (rax,rdx) = p4 * c0 */
+    "movq %7, %%rax\n"
+    "mulq %%r10\n"
+    /* (rax,rdx) += p0 */
+    "addq %q1, %%rax\n"
+    "adcq $0, %%rdx\n"
+    /* extract r0 */
+    "movq %%rax, 0(%q6)\n"
+    /* Move to (r8,r9) */
+    "movq %%rdx, %%r8\n"
+    "movq $0, %%r9\n"
+    /* (r8,r9) += p1 */
+    "addq %q2, %%r8\n"
+    "adcq $0, %%r9\n"
+    /* (r8,r9) += p4 * c1 */
+    "movq %8, %%rax\n"
+    "mulq %%r10\n"
+    "addq %%rax, %%r8\n"
+    "adcq %%rdx, %%r9\n"
+    /* Extract r1 */
+    "movq %%r8, 8(%q6)\n"
+    "movq $0, %%r8\n"
+    /* (r9,r8) += p4 */
+    "addq %%r10, %%r9\n"
+    "adcq $0, %%r8\n"
+    /* (r9,r8) += p2 */
+    "addq %q3, %%r9\n"
+    "adcq $0, %%r8\n"
+    /* Extract r2 */
+    "movq %%r9, 16(%q6)\n"
+    "movq $0, %%r9\n"
+    /* (r8,r9) += p3 */
+    "addq %q4, %%r8\n"
+    "adcq $0, %%r9\n"
+    /* Extract r3 */
+    "movq %%r8, 24(%q6)\n"
+    /* Extract c */
+    "movq %%r9, %q0\n"
+    : "=g"(c)
+    : "g"(p0), "g"(p1), "g"(p2), "g"(p3), "g"(p4), "D"(r), "n"(SECP256K1_N_C_0), "n"(SECP256K1_N_C_1)
+    : "rax", "rdx", "r8", "r9", "r10", "cc", "memory");
+#else
+    uint128_t c;
+    uint64_t c0, c1, c2;
+    uint64_t n0 = l[4], n1 = l[5], n2 = l[6], n3 = l[7];
+    uint64_t m0, m1, m2, m3, m4, m5;
+    uint32_t m6;
+    uint64_t p0, p1, p2, p3;
+    uint32_t p4;

    /* Reduce 512 bits into 385. */
    /* m[0..6] = l[0..3] + n[0..3] * SECP256K1_N_C. */
    c0 = l[0]; c1 = 0; c2 = 0;
    muladd_fast(n0, SECP256K1_N_C_0);
-    uint64_t m0; extract_fast(m0);
+    extract_fast(m0);
    sumadd_fast(l[1]);
    muladd(n1, SECP256K1_N_C_0);
    muladd(n0, SECP256K1_N_C_1);
-    uint64_t m1; extract(m1);
+    extract(m1);
    sumadd(l[2]);
    muladd(n2, SECP256K1_N_C_0);
    muladd(n1, SECP256K1_N_C_1);
    sumadd(n0);
-    uint64_t m2; extract(m2);
+    extract(m2);
    sumadd(l[3]);
    muladd(n3, SECP256K1_N_C_0);
    muladd(n2, SECP256K1_N_C_1);
    sumadd(n1);
-    uint64_t m3; extract(m3);
+    extract(m3);
    muladd(n3, SECP256K1_N_C_1);
    sumadd(n2);
-    uint64_t m4; extract(m4);
+    extract(m4);
    sumadd_fast(n3);
-    uint64_t m5; extract_fast(m5);
+    extract_fast(m5);
    VERIFY_CHECK(c0 <= 1);
-    uint32_t m6 = c0;
+    m6 = c0;

    /* Reduce 385 bits into 258. */
    /* p[0..4] = m[0..3] + m[4..6] * SECP256K1_N_C. */
    c0 = m0; c1 = 0; c2 = 0;
    muladd_fast(m4, SECP256K1_N_C_0);
-    uint64_t p0; extract_fast(p0);
+    extract_fast(p0);
    sumadd_fast(m1);
    muladd(m5, SECP256K1_N_C_0);
    muladd(m4, SECP256K1_N_C_1);
-    uint64_t p1; extract(p1);
+    extract(p1);
    sumadd(m2);
    muladd(m6, SECP256K1_N_C_0);
    muladd(m5, SECP256K1_N_C_1);
    sumadd(m4);
-    uint64_t p2; extract(p2);
+    extract(p2);
    sumadd_fast(m3);
    muladd_fast(m6, SECP256K1_N_C_1);
    sumadd_fast(m5);
-    uint64_t p3; extract_fast(p3);
-    uint32_t p4 = c0 + m6;
+    extract_fast(p3);
+    p4 = c0 + m6;
    VERIFY_CHECK(p4 <= 2);

    /* Reduce 258 bits into 256. */
    /* r[0..3] = p[0..3] + p[4] * SECP256K1_N_C. */
-    uint128_t c = p0 + (uint128_t)SECP256K1_N_C_0 * p4;
+    c = p0 + (uint128_t)SECP256K1_N_C_0 * p4;
    r->d[0] = c & 0xFFFFFFFFFFFFFFFFULL; c >>= 64;
    c += p1 + (uint128_t)SECP256K1_N_C_1 * p4;
    r->d[1] = c & 0xFFFFFFFFFFFFFFFFULL; c >>= 64;
@ -312,12 +553,146 @@ static void secp256k1_scalar_reduce_512(secp256k1_scalar_t *r, const uint64_t *l
    r->d[2] = c & 0xFFFFFFFFFFFFFFFFULL; c >>= 64;
    c += p3;
    r->d[3] = c & 0xFFFFFFFFFFFFFFFFULL; c >>= 64;
+#endif

    /* Final reduction of r. */
    secp256k1_scalar_reduce(r, c + secp256k1_scalar_check_overflow(r));
 }

 static void secp256k1_scalar_mul_512(uint64_t l[8], const secp256k1_scalar_t *a, const secp256k1_scalar_t *b) {
+#ifdef USE_ASM_X86_64
+    const uint64_t *pb = b->d;
+    __asm__ __volatile__(
+    /* Preload */
+    "movq 0(%%rdi), %%r15\n"
+    "movq 8(%%rdi), %%rbx\n"
+    "movq 16(%%rdi), %%rcx\n"
+    "movq 0(%%rdx), %%r11\n"
+    "movq 8(%%rdx), %%r12\n"
+    "movq 16(%%rdx), %%r13\n"
+    "movq 24(%%rdx), %%r14\n"
+    /* (rax,rdx) = a0 * b0 */
+    "movq %%r15, %%rax\n"
+    "mulq %%r11\n"
+    /* Extract l0 */
+    "movq %%rax, 0(%%rsi)\n"
+    /* (r8,r9,r10) = (rdx) */
+    "movq %%rdx, %%r8\n"
+    "xorq %%r9, %%r9\n"
+    "xorq %%r10, %%r10\n"
+    /* (r8,r9,r10) += a0 * b1 */
+    "movq %%r15, %%rax\n"
+    "mulq %%r12\n"
+    "addq %%rax, %%r8\n"
+    "adcq %%rdx, %%r9\n"
+    "adcq $0, %%r10\n"
+    /* (r8,r9,r10) += a1 * b0 */
+    "movq %%rbx, %%rax\n"
+    "mulq %%r11\n"
+    "addq %%rax, %%r8\n"
+    "adcq %%rdx, %%r9\n"
+    "adcq $0, %%r10\n"
+    /* Extract l1 */
+    "movq %%r8, 8(%%rsi)\n"
+    "xorq %%r8, %%r8\n"
+    /* (r9,r10,r8) += a0 * b2 */
+    "movq %%r15, %%rax\n"
+    "mulq %%r13\n"
+    "addq %%rax, %%r9\n"
+    "adcq %%rdx, %%r10\n"
+    "adcq $0, %%r8\n"
+    /* (r9,r10,r8) += a1 * b1 */
+    "movq %%rbx, %%rax\n"
+    "mulq %%r12\n"
+    "addq %%rax, %%r9\n"
+    "adcq %%rdx, %%r10\n"
+    "adcq $0, %%r8\n"
+    /* (r9,r10,r8) += a2 * b0 */
+    "movq %%rcx, %%rax\n"
+    "mulq %%r11\n"
+    "addq %%rax, %%r9\n"
+    "adcq %%rdx, %%r10\n"
+    "adcq $0, %%r8\n"
+    /* Extract l2 */
+    "movq %%r9, 16(%%rsi)\n"
+    "xorq %%r9, %%r9\n"
+    /* (r10,r8,r9) += a0 * b3 */
+    "movq %%r15, %%rax\n"
+    "mulq %%r14\n"
+    "addq %%rax, %%r10\n"
+    "adcq %%rdx, %%r8\n"
+    "adcq $0, %%r9\n"
+    /* Preload a3 */
+    "movq 24(%%rdi), %%r15\n"
+    /* (r10,r8,r9) += a1 * b2 */
+    "movq %%rbx, %%rax\n"
+    "mulq %%r13\n"
+    "addq %%rax, %%r10\n"
+    "adcq %%rdx, %%r8\n"
+    "adcq $0, %%r9\n"
+    /* (r10,r8,r9) += a2 * b1 */
+    "movq %%rcx, %%rax\n"
+    "mulq %%r12\n"
+    "addq %%rax, %%r10\n"
+    "adcq %%rdx, %%r8\n"
+    "adcq $0, %%r9\n"
+    /* (r10,r8,r9) += a3 * b0 */
+    "movq %%r15, %%rax\n"
+    "mulq %%r11\n"
+    "addq %%rax, %%r10\n"
+    "adcq %%rdx, %%r8\n"
+    "adcq $0, %%r9\n"
+    /* Extract l3 */
+    "movq %%r10, 24(%%rsi)\n"
+    "xorq %%r10, %%r10\n"
+    /* (r8,r9,r10) += a1 * b3 */
+    "movq %%rbx, %%rax\n"
+    "mulq %%r14\n"
+    "addq %%rax, %%r8\n"
+    "adcq %%rdx, %%r9\n"
+    "adcq $0, %%r10\n"
+    /* (r8,r9,r10) += a2 * b2 */
+    "movq %%rcx, %%rax\n"
+    "mulq %%r13\n"
+    "addq %%rax, %%r8\n"
+    "adcq %%rdx, %%r9\n"
+    "adcq $0, %%r10\n"
+    /* (r8,r9,r10) += a3 * b1 */
+    "movq %%r15, %%rax\n"
+    "mulq %%r12\n"
+    "addq %%rax, %%r8\n"
+    "adcq %%rdx, %%r9\n"
+    "adcq $0, %%r10\n"
+    /* Extract l4 */
+    "movq %%r8, 32(%%rsi)\n"
+    "xorq %%r8, %%r8\n"
+    /* (r9,r10,r8) += a2 * b3 */
+    "movq %%rcx, %%rax\n"
+    "mulq %%r14\n"
+    "addq %%rax, %%r9\n"
+    "adcq %%rdx, %%r10\n"
+    "adcq $0, %%r8\n"
+    /* (r9,r10,r8) += a3 * b2 */
+    "movq %%r15, %%rax\n"
+    "mulq %%r13\n"
+    "addq %%rax, %%r9\n"
+    "adcq %%rdx, %%r10\n"
+    "adcq $0, %%r8\n"
+    /* Extract l5 */
+    "movq %%r9, 40(%%rsi)\n"
+    /* (r10,r8) += a3 * b3 */
+    "movq %%r15, %%rax\n"
+    "mulq %%r14\n"
+    "addq %%rax, %%r10\n"
+    "adcq %%rdx, %%r8\n"
+    /* Extract l6 */
+    "movq %%r10, 48(%%rsi)\n"
+    /* Extract l7 */
+    "movq %%r8, 56(%%rsi)\n"
+    : "+d"(pb)
+    : "S"(l), "D"(a->d)
+    : "rax", "rbx", "rcx", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "cc", "memory");
+#else
    /* 160 bit accumulator. */
    uint64_t c0 = 0, c1 = 0;
    uint32_t c2 = 0;
@ -348,9 +723,119 @@ static void secp256k1_scalar_mul_512(uint64_t l[8], const secp256k1_scalar_t *a,
    extract_fast(l[6]);
    VERIFY_CHECK(c1 <= 0);
    l[7] = c0;
+#endif
 }

 static void secp256k1_scalar_sqr_512(uint64_t l[8], const secp256k1_scalar_t *a) {
+#ifdef USE_ASM_X86_64
+    __asm__ __volatile__(
+    /* Preload */
+    "movq 0(%%rdi), %%r11\n"
+    "movq 8(%%rdi), %%r12\n"
+    "movq 16(%%rdi), %%r13\n"
+    "movq 24(%%rdi), %%r14\n"
+    /* (rax,rdx) = a0 * a0 */
+    "movq %%r11, %%rax\n"
+    "mulq %%r11\n"
+    /* Extract l0 */
+    "movq %%rax, 0(%%rsi)\n"
+    /* (r8,r9,r10) = (rdx,0) */
+    "movq %%rdx, %%r8\n"
+    "xorq %%r9, %%r9\n"
+    "xorq %%r10, %%r10\n"
+    /* (r8,r9,r10) += 2 * a0 * a1 */
+    "movq %%r11, %%rax\n"
+    "mulq %%r12\n"
+    "addq %%rax, %%r8\n"
+    "adcq %%rdx, %%r9\n"
+    "adcq $0, %%r10\n"
+    "addq %%rax, %%r8\n"
+    "adcq %%rdx, %%r9\n"
+    "adcq $0, %%r10\n"
+    /* Extract l1 */
+    "movq %%r8, 8(%%rsi)\n"
+    "xorq %%r8, %%r8\n"
+    /* (r9,r10,r8) += 2 * a0 * a2 */
+    "movq %%r11, %%rax\n"
+    "mulq %%r13\n"
+    "addq %%rax, %%r9\n"
+    "adcq %%rdx, %%r10\n"
+    "adcq $0, %%r8\n"
+    "addq %%rax, %%r9\n"
+    "adcq %%rdx, %%r10\n"
+    "adcq $0, %%r8\n"
+    /* (r9,r10,r8) += a1 * a1 */
+    "movq %%r12, %%rax\n"
+    "mulq %%r12\n"
+    "addq %%rax, %%r9\n"
+    "adcq %%rdx, %%r10\n"
+    "adcq $0, %%r8\n"
+    /* Extract l2 */
+    "movq %%r9, 16(%%rsi)\n"
+    "xorq %%r9, %%r9\n"
+    /* (r10,r8,r9) += 2 * a0 * a3 */
+    "movq %%r11, %%rax\n"
+    "mulq %%r14\n"
+    "addq %%rax, %%r10\n"
+    "adcq %%rdx, %%r8\n"
+    "adcq $0, %%r9\n"
+    "addq %%rax, %%r10\n"
+    "adcq %%rdx, %%r8\n"
+    "adcq $0, %%r9\n"
+    /* (r10,r8,r9) += 2 * a1 * a2 */
+    "movq %%r12, %%rax\n"
+    "mulq %%r13\n"
+    "addq %%rax, %%r10\n"
+    "adcq %%rdx, %%r8\n"
+    "adcq $0, %%r9\n"
+    "addq %%rax, %%r10\n"
+    "adcq %%rdx, %%r8\n"
+    "adcq $0, %%r9\n"
+    /* Extract l3 */
+    "movq %%r10, 24(%%rsi)\n"
+    "xorq %%r10, %%r10\n"
+    /* (r8,r9,r10) += 2 * a1 * a3 */
+    "movq %%r12, %%rax\n"
+    "mulq %%r14\n"
+    "addq %%rax, %%r8\n"
+    "adcq %%rdx, %%r9\n"
+    "adcq $0, %%r10\n"
+    "addq %%rax, %%r8\n"
+    "adcq %%rdx, %%r9\n"
+    "adcq $0, %%r10\n"
+    /* (r8,r9,r10) += a2 * a2 */
+    "movq %%r13, %%rax\n"
+    "mulq %%r13\n"
+    "addq %%rax, %%r8\n"
+    "adcq %%rdx, %%r9\n"
+    "adcq $0, %%r10\n"
+    /* Extract l4 */
+    "movq %%r8, 32(%%rsi)\n"
+    "xorq %%r8, %%r8\n"
+    /* (r9,r10,r8) += 2 * a2 * a3 */
+    "movq %%r13, %%rax\n"
+    "mulq %%r14\n"
+    "addq %%rax, %%r9\n"
+    "adcq %%rdx, %%r10\n"
+    "adcq $0, %%r8\n"
+    "addq %%rax, %%r9\n"
+    "adcq %%rdx, %%r10\n"
+    "adcq $0, %%r8\n"
+    /* Extract l5 */
+    "movq %%r9, 40(%%rsi)\n"
+    /* (r10,r8) += a3 * a3 */
+    "movq %%r14, %%rax\n"
+    "mulq %%r14\n"
+    "addq %%rax, %%r10\n"
+    "adcq %%rdx, %%r8\n"
+    /* Extract l6 */
+    "movq %%r10, 48(%%rsi)\n"
+    /* Extract l7 */
+    "movq %%r8, 56(%%rsi)\n"
+    :
+    : "S"(l), "D"(a->d)
+    : "rax", "rdx", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "cc", "memory");
+#else
    /* 160 bit accumulator. */
    uint64_t c0 = 0, c1 = 0;
    uint32_t c2 = 0;
@ -375,6 +860,7 @@ static void secp256k1_scalar_sqr_512(uint64_t l[8], const secp256k1_scalar_t *a)
    extract_fast(l[6]);
    VERIFY_CHECK(c1 == 0);
    l[7] = c0;
+#endif
 }

 #undef sumadd
@ -413,12 +899,15 @@ SECP256K1_INLINE static int secp256k1_scalar_eq(const secp256k1_scalar_t *a, con
 }

 SECP256K1_INLINE static void secp256k1_scalar_mul_shift_var(secp256k1_scalar_t *r, const secp256k1_scalar_t *a, const secp256k1_scalar_t *b, unsigned int shift) {
-    VERIFY_CHECK(shift >= 256);
    uint64_t l[8];
+    unsigned int shiftlimbs;
+    unsigned int shiftlow;
+    unsigned int shifthigh;
+    VERIFY_CHECK(shift >= 256);
    secp256k1_scalar_mul_512(l, a, b);
-    unsigned int shiftlimbs = shift >> 6;
-    unsigned int shiftlow = shift & 0x3F;
-    unsigned int shifthigh = 64 - shiftlow;
+    shiftlimbs = shift >> 6;
+    shiftlow = shift & 0x3F;
+    shifthigh = 64 - shiftlow;
    r->d[0] = shift < 512 ? (l[0 + shiftlimbs] >> shiftlow | (shift < 448 && shiftlow ? (l[1 + shiftlimbs] << shifthigh) : 0)) : 0;
    r->d[1] = shift < 448 ? (l[1 + shiftlimbs] >> shiftlow | (shift < 384 && shiftlow ? (l[2 + shiftlimbs] << shifthigh) : 0)) : 0;
    r->d[2] = shift < 384 ? (l[2 + shiftlimbs] >> shiftlow | (shift < 320 && shiftlow ? (l[3 + shiftlimbs] << shifthigh) : 0)) : 0;
--- a/src/scalar_8x32.h
+++ b/src/scalar_8x32.h
@ -14,4 +14,6 @@ typedef struct {
    uint32_t d[8];
 } secp256k1_scalar_t;

+#define SECP256K1_SCALAR_CONST(d7, d6, d5, d4, d3, d2, d1, d0) {{(d0), (d1), (d2), (d3), (d4), (d5), (d6), (d7)}}
+
 #endif
--- a/src/scalar_8x32_impl.h
+++ b/src/scalar_8x32_impl.h
@ -91,8 +91,9 @@ SECP256K1_INLINE static int secp256k1_scalar_check_overflow(const secp256k1_scal
 }

 SECP256K1_INLINE static int secp256k1_scalar_reduce(secp256k1_scalar_t *r, uint32_t overflow) {
+    uint64_t t;
    VERIFY_CHECK(overflow <= 1);
-    uint64_t t = (uint64_t)r->d[0] + overflow * SECP256K1_N_C_0;
+    t = (uint64_t)r->d[0] + overflow * SECP256K1_N_C_0;
    r->d[0] = t & 0xFFFFFFFFUL; t >>= 32;
    t += (uint64_t)r->d[1] + overflow * SECP256K1_N_C_1;
    r->d[1] = t & 0xFFFFFFFFUL; t >>= 32;
@ -112,6 +113,7 @@ SECP256K1_INLINE static int secp256k1_scalar_reduce(secp256k1_scalar_t *r, uint3
 }

 static int secp256k1_scalar_add(secp256k1_scalar_t *r, const secp256k1_scalar_t *a, const secp256k1_scalar_t *b) {
+    int overflow;
    uint64_t t = (uint64_t)a->d[0] + b->d[0];
    r->d[0] = t & 0xFFFFFFFFULL; t >>= 32;
    t += (uint64_t)a->d[1] + b->d[1];
@ -128,15 +130,16 @@ static int secp256k1_scalar_add(secp256k1_scalar_t *r, const secp256k1_scalar_t
    r->d[6] = t & 0xFFFFFFFFULL; t >>= 32;
    t += (uint64_t)a->d[7] + b->d[7];
    r->d[7] = t & 0xFFFFFFFFULL; t >>= 32;
-    int overflow = t + secp256k1_scalar_check_overflow(r);
+    overflow = t + secp256k1_scalar_check_overflow(r);
    VERIFY_CHECK(overflow == 0 || overflow == 1);
    secp256k1_scalar_reduce(r, overflow);
    return overflow;
 }

 static void secp256k1_scalar_add_bit(secp256k1_scalar_t *r, unsigned int bit) {
+    uint64_t t;
    VERIFY_CHECK(bit < 256);
-    uint64_t t = (uint64_t)r->d[0] + (((uint32_t)((bit >> 5) == 0)) << (bit & 0x1F));
+    t = (uint64_t)r->d[0] + (((uint32_t)((bit >> 5) == 0)) << (bit & 0x1F));
    r->d[0] = t & 0xFFFFFFFFULL; t >>= 32;
    t += (uint64_t)r->d[1] + (((uint32_t)((bit >> 5) == 1)) << (bit & 0x1F));
    r->d[1] = t & 0xFFFFFFFFULL; t >>= 32;
@ -159,6 +162,7 @@ static void secp256k1_scalar_add_bit(secp256k1_scalar_t *r, unsigned int bit) {
 }

 static void secp256k1_scalar_set_b32(secp256k1_scalar_t *r, const unsigned char *b32, int *overflow) {
+    int over;
    r->d[0] = (uint32_t)b32[31] | (uint32_t)b32[30] << 8 | (uint32_t)b32[29] << 16 | (uint32_t)b32[28] << 24;
    r->d[1] = (uint32_t)b32[27] | (uint32_t)b32[26] << 8 | (uint32_t)b32[25] << 16 | (uint32_t)b32[24] << 24;
    r->d[2] = (uint32_t)b32[23] | (uint32_t)b32[22] << 8 | (uint32_t)b32[21] << 16 | (uint32_t)b32[20] << 24;
@ -167,7 +171,7 @@ static void secp256k1_scalar_set_b32(secp256k1_scalar_t *r, const unsigned char
    r->d[5] = (uint32_t)b32[11] | (uint32_t)b32[10] << 8 | (uint32_t)b32[9] << 16 | (uint32_t)b32[8] << 24;
    r->d[6] = (uint32_t)b32[7] | (uint32_t)b32[6] << 8 | (uint32_t)b32[5] << 16 | (uint32_t)b32[4] << 24;
    r->d[7] = (uint32_t)b32[3] | (uint32_t)b32[2] << 8 | (uint32_t)b32[1] << 16 | (uint32_t)b32[0] << 24;
-    int over = secp256k1_scalar_reduce(r, secp256k1_scalar_check_overflow(r));
+    over = secp256k1_scalar_reduce(r, secp256k1_scalar_check_overflow(r));
    if (overflow) {
        *overflow = over;
    }
@ -263,16 +267,16 @@ static int secp256k1_scalar_is_high(const secp256k1_scalar_t *a) {

 /** Add 2*a*b to the number defined by (c0,c1,c2). c2 must never overflow. */
 #define muladd2(a,b) { \
-    uint32_t tl, th; \
+    uint32_t tl, th, th2, tl2; \
    { \
        uint64_t t = (uint64_t)a * b; \
        th = t >> 32;               /* at most 0xFFFFFFFE */ \
        tl = t; \
    } \
-    uint32_t th2 = th + th;         /* at most 0xFFFFFFFE (in case th was 0x7FFFFFFF) */ \
+    th2 = th + th;                  /* at most 0xFFFFFFFE (in case th was 0x7FFFFFFF) */ \
    c2 += (th2 < th) ? 1 : 0;       /* never overflows by contract (verified the next line) */ \
    VERIFY_CHECK((th2 >= th) || (c2 != 0)); \
-    uint32_t tl2 = tl + tl;         /* at most 0xFFFFFFFE (in case the lowest 63 bits of tl were 0x7FFFFFFF) */ \
+    tl2 = tl + tl;                  /* at most 0xFFFFFFFE (in case the lowest 31 bits of tl were 0x7FFFFFFF) */ \
    th2 += (tl2 < tl) ? 1 : 0;      /* at most 0xFFFFFFFF */ \
    c0 += tl2;                      /* overflow is handled on the next line */ \
    th2 += (c0 < tl2) ? 1 : 0;      /* second overflow is handled on the next line */ \
@ -285,8 +289,9 @@ static int secp256k1_scalar_is_high(const secp256k1_scalar_t *a) {

 /** Add a to the number defined by (c0,c1,c2). c2 must never overflow. */
 #define sumadd(a) { \
+    unsigned int over; \
    c0 += (a);                  /* overflow is handled on the next line */ \
-    unsigned int over = (c0 < (a)) ? 1 : 0; \
+    over = (c0 < (a)) ? 1 : 0; \
    c1 += over;                 /* overflow is handled on the next line */ \
    c2 += (c1 < over) ? 1 : 0;  /* never overflows by contract */ \
 }
@ -316,7 +321,10 @@ static int secp256k1_scalar_is_high(const secp256k1_scalar_t *a) {
 }

 static void secp256k1_scalar_reduce_512(secp256k1_scalar_t *r, const uint32_t *l) {
+    uint64_t c;
    uint32_t n0 = l[8], n1 = l[9], n2 = l[10], n3 = l[11], n4 = l[12], n5 = l[13], n6 = l[14], n7 = l[15];
+    uint32_t m0, m1, m2, m3, m4, m5, m6, m7, m8, m9, m10, m11, m12;
+    uint32_t p0, p1, p2, p3, p4, p5, p6, p7, p8;

    /* 96 bit accumulator. */
    uint32_t c0, c1, c2;
@ -325,115 +333,115 @@ static void secp256k1_scalar_reduce_512(secp256k1_scalar_t *r, const uint32_t *l
    /* m[0..12] = l[0..7] + n[0..7] * SECP256K1_N_C. */
    c0 = l[0]; c1 = 0; c2 = 0;
    muladd_fast(n0, SECP256K1_N_C_0);
-    uint32_t m0; extract_fast(m0);
+    extract_fast(m0);
    sumadd_fast(l[1]);
    muladd(n1, SECP256K1_N_C_0);
    muladd(n0, SECP256K1_N_C_1);
-    uint32_t m1; extract(m1);
+    extract(m1);
    sumadd(l[2]);
    muladd(n2, SECP256K1_N_C_0);
    muladd(n1, SECP256K1_N_C_1);
    muladd(n0, SECP256K1_N_C_2);
-    uint32_t m2; extract(m2);
+    extract(m2);
    sumadd(l[3]);
    muladd(n3, SECP256K1_N_C_0);
    muladd(n2, SECP256K1_N_C_1);
    muladd(n1, SECP256K1_N_C_2);
    muladd(n0, SECP256K1_N_C_3);
-    uint32_t m3; extract(m3);
+    extract(m3);
    sumadd(l[4]);
    muladd(n4, SECP256K1_N_C_0);
    muladd(n3, SECP256K1_N_C_1);
    muladd(n2, SECP256K1_N_C_2);
    muladd(n1, SECP256K1_N_C_3);
    sumadd(n0);
-    uint32_t m4; extract(m4);
+    extract(m4);
    sumadd(l[5]);
    muladd(n5, SECP256K1_N_C_0);
    muladd(n4, SECP256K1_N_C_1);
    muladd(n3, SECP256K1_N_C_2);
    muladd(n2, SECP256K1_N_C_3);
    sumadd(n1);
-    uint32_t m5; extract(m5);
+    extract(m5);
    sumadd(l[6]);
    muladd(n6, SECP256K1_N_C_0);
    muladd(n5, SECP256K1_N_C_1);
    muladd(n4, SECP256K1_N_C_2);
    muladd(n3, SECP256K1_N_C_3);
    sumadd(n2);
-    uint32_t m6; extract(m6);
+    extract(m6);
    sumadd(l[7]);
    muladd(n7, SECP256K1_N_C_0);
    muladd(n6, SECP256K1_N_C_1);
    muladd(n5, SECP256K1_N_C_2);
    muladd(n4, SECP256K1_N_C_3);
    sumadd(n3);
-    uint32_t m7; extract(m7);
+    extract(m7);
    muladd(n7, SECP256K1_N_C_1);
    muladd(n6, SECP256K1_N_C_2);
    muladd(n5, SECP256K1_N_C_3);
    sumadd(n4);
-    uint32_t m8; extract(m8);
+    extract(m8);
    muladd(n7, SECP256K1_N_C_2);
    muladd(n6, SECP256K1_N_C_3);
    sumadd(n5);
-    uint32_t m9; extract(m9);
+    extract(m9);
    muladd(n7, SECP256K1_N_C_3);
    sumadd(n6);
-    uint32_t m10; extract(m10);
+    extract(m10);
    sumadd_fast(n7);
-    uint32_t m11; extract_fast(m11);
+    extract_fast(m11);
    VERIFY_CHECK(c0 <= 1);
-    uint32_t m12 = c0;
+    m12 = c0;

    /* Reduce 385 bits into 258. */
    /* p[0..8] = m[0..7] + m[8..12] * SECP256K1_N_C. */
    c0 = m0; c1 = 0; c2 = 0;
    muladd_fast(m8, SECP256K1_N_C_0);
-    uint32_t p0; extract_fast(p0);
+    extract_fast(p0);
    sumadd_fast(m1);
    muladd(m9, SECP256K1_N_C_0);
    muladd(m8, SECP256K1_N_C_1);
-    uint32_t p1; extract(p1);
+    extract(p1);
    sumadd(m2);
    muladd(m10, SECP256K1_N_C_0);
    muladd(m9, SECP256K1_N_C_1);
    muladd(m8, SECP256K1_N_C_2);
-    uint32_t p2; extract(p2);
+    extract(p2);
    sumadd(m3);
    muladd(m11, SECP256K1_N_C_0);
    muladd(m10, SECP256K1_N_C_1);
    muladd(m9, SECP256K1_N_C_2);
    muladd(m8, SECP256K1_N_C_3);
-    uint32_t p3; extract(p3);
+    extract(p3);
    sumadd(m4);
    muladd(m12, SECP256K1_N_C_0);
    muladd(m11, SECP256K1_N_C_1);
    muladd(m10, SECP256K1_N_C_2);
    muladd(m9, SECP256K1_N_C_3);
    sumadd(m8);
-    uint32_t p4; extract(p4);
+    extract(p4);
    sumadd(m5);
    muladd(m12, SECP256K1_N_C_1);
    muladd(m11, SECP256K1_N_C_2);
    muladd(m10, SECP256K1_N_C_3);
    sumadd(m9);
-    uint32_t p5; extract(p5);
+    extract(p5);
    sumadd(m6);
    muladd(m12, SECP256K1_N_C_2);
    muladd(m11, SECP256K1_N_C_3);
    sumadd(m10);
-    uint32_t p6; extract(p6);
+    extract(p6);
    sumadd_fast(m7);
    muladd_fast(m12, SECP256K1_N_C_3);
    sumadd_fast(m11);
-    uint32_t p7; extract_fast(p7);
-    uint32_t p8 = c0 + m12;
+    extract_fast(p7);
+    p8 = c0 + m12;
    VERIFY_CHECK(p8 <= 2);

    /* Reduce 258 bits into 256. */
    /* r[0..7] = p[0..7] + p[8] * SECP256K1_N_C. */
-    uint64_t c = p0 + (uint64_t)SECP256K1_N_C_0 * p8;
+    c = p0 + (uint64_t)SECP256K1_N_C_0 * p8;
    r->d[0] = c & 0xFFFFFFFFUL; c >>= 32;
    c += p1 + (uint64_t)SECP256K1_N_C_1 * p8;
    r->d[1] = c & 0xFFFFFFFFUL; c >>= 32;
@ -454,7 +462,7 @@ static void secp256k1_scalar_reduce_512(secp256k1_scalar_t *r, const uint32_t *l
    secp256k1_scalar_reduce(r, c + secp256k1_scalar_check_overflow(r));
 }

-static void secp256k1_scalar_mul_512(uint32_t l[16], const secp256k1_scalar_t *a, const secp256k1_scalar_t *b) {
+static void secp256k1_scalar_mul_512(uint32_t *l, const secp256k1_scalar_t *a, const secp256k1_scalar_t *b) {
    /* 96 bit accumulator. */
    uint32_t c0 = 0, c1 = 0, c2 = 0;

@ -542,7 +550,7 @@ static void secp256k1_scalar_mul_512(uint32_t l[16], const secp256k1_scalar_t *a
    l[15] = c0;
 }

-static void secp256k1_scalar_sqr_512(uint32_t l[16], const secp256k1_scalar_t *a) {
+static void secp256k1_scalar_sqr_512(uint32_t *l, const secp256k1_scalar_t *a) {
    /* 96 bit accumulator. */
    uint32_t c0 = 0, c1 = 0, c2 = 0;

@ -622,6 +630,7 @@ static void secp256k1_scalar_sqr(secp256k1_scalar_t *r, const secp256k1_scalar_t
    secp256k1_scalar_reduce_512(r, l);
 }

+#ifdef USE_ENDOMORPHISM
 static void secp256k1_scalar_split_128(secp256k1_scalar_t *r1, secp256k1_scalar_t *r2, const secp256k1_scalar_t *a) {
    r1->d[0] = a->d[0];
    r1->d[1] = a->d[1];
@ -640,18 +649,22 @@ static void secp256k1_scalar_split_128(secp256k1_scalar_t *r1, secp256k1_scalar_
    r2->d[6] = 0;
    r2->d[7] = 0;
 }
+#endif

 SECP256K1_INLINE static int secp256k1_scalar_eq(const secp256k1_scalar_t *a, const secp256k1_scalar_t *b) {
    return ((a->d[0] ^ b->d[0]) | (a->d[1] ^ b->d[1]) | (a->d[2] ^ b->d[2]) | (a->d[3] ^ b->d[3]) | (a->d[4] ^ b->d[4]) | (a->d[5] ^ b->d[5]) | (a->d[6] ^ b->d[6]) | (a->d[7] ^ b->d[7])) == 0;
 }

 SECP256K1_INLINE static void secp256k1_scalar_mul_shift_var(secp256k1_scalar_t *r, const secp256k1_scalar_t *a, const secp256k1_scalar_t *b, unsigned int shift) {
-    VERIFY_CHECK(shift >= 256);
    uint32_t l[16];
+    unsigned int shiftlimbs;
+    unsigned int shiftlow;
+    unsigned int shifthigh;
+    VERIFY_CHECK(shift >= 256);
    secp256k1_scalar_mul_512(l, a, b);
-    unsigned int shiftlimbs = shift >> 5;
-    unsigned int shiftlow = shift & 0x1F;
-    unsigned int shifthigh = 32 - shiftlow;
+    shiftlimbs = shift >> 5;
+    shiftlow = shift & 0x1F;
+    shifthigh = 32 - shiftlow;
    r->d[0] = shift < 512 ? (l[0 + shiftlimbs] >> shiftlow | (shift < 480 && shiftlow ? (l[1 + shiftlimbs] << shifthigh) : 0)) : 0;
    r->d[1] = shift < 480 ? (l[1 + shiftlimbs] >> shiftlow | (shift < 448 && shiftlow ? (l[2 + shiftlimbs] << shifthigh) : 0)) : 0;
    r->d[2] = shift < 448 ? (l[2 + shiftlimbs] >> shiftlow | (shift < 416 && shiftlow ? (l[3 + shiftlimbs] << shifthigh) : 0)) : 0;
--- a/src/scalar_impl.h
+++ b/src/scalar_impl.h
@ -24,121 +24,6 @@
 #error "Please select scalar implementation"
 #endif

-typedef struct {
-#ifndef USE_NUM_NONE
-    secp256k1_num_t order;
-#endif
-#ifdef USE_ENDOMORPHISM
-    secp256k1_scalar_t minus_lambda, minus_b1, minus_b2, g1, g2;
-#endif
-} secp256k1_scalar_consts_t;
-
-static const secp256k1_scalar_consts_t *secp256k1_scalar_consts = NULL;
-
-static void secp256k1_scalar_start(void) {
-    if (secp256k1_scalar_consts != NULL)
-        return;
-
-    /* Allocate. */
-    secp256k1_scalar_consts_t *ret = (secp256k1_scalar_consts_t*)checked_malloc(sizeof(secp256k1_scalar_consts_t));
-
-#ifndef USE_NUM_NONE
-    static const unsigned char secp256k1_scalar_consts_order[] = {
-        0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
-        0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFE,
-        0xBA,0xAE,0xDC,0xE6,0xAF,0x48,0xA0,0x3B,
-        0xBF,0xD2,0x5E,0x8C,0xD0,0x36,0x41,0x41
-    };
-    secp256k1_num_set_bin(&ret->order, secp256k1_scalar_consts_order, sizeof(secp256k1_scalar_consts_order));
-#endif
-#ifdef USE_ENDOMORPHISM
-    /**
-     * Lambda is a scalar which has the property for secp256k1 that point multiplication by
-     * it is efficiently computable (see secp256k1_gej_mul_lambda). */
-    static const unsigned char secp256k1_scalar_consts_lambda[32] = {
-         0x53,0x63,0xad,0x4c,0xc0,0x5c,0x30,0xe0,
-         0xa5,0x26,0x1c,0x02,0x88,0x12,0x64,0x5a,
-         0x12,0x2e,0x22,0xea,0x20,0x81,0x66,0x78,
-         0xdf,0x02,0x96,0x7c,0x1b,0x23,0xbd,0x72
-    };
-    /**
-     * "Guide to Elliptic Curve Cryptography" (Hankerson, Menezes, Vanstone) gives an algorithm
-     * (algorithm 3.74) to find k1 and k2 given k, such that k1 + k2 * lambda == k mod n, and k1
-     * and k2 have a small size.
-     * It relies on constants a1, b1, a2, b2. These constants for the value of lambda above are:
-     *
-     * - a1 =      {0x30,0x86,0xd2,0x21,0xa7,0xd4,0x6b,0xcd,0xe8,0x6c,0x90,0xe4,0x92,0x84,0xeb,0x15}
-     * - b1 =     -{0xe4,0x43,0x7e,0xd6,0x01,0x0e,0x88,0x28,0x6f,0x54,0x7f,0xa9,0x0a,0xbf,0xe4,0xc3}
-     * - a2 = {0x01,0x14,0xca,0x50,0xf7,0xa8,0xe2,0xf3,0xf6,0x57,0xc1,0x10,0x8d,0x9d,0x44,0xcf,0xd8}
-     * - b2 =      {0x30,0x86,0xd2,0x21,0xa7,0xd4,0x6b,0xcd,0xe8,0x6c,0x90,0xe4,0x92,0x84,0xeb,0x15}
-     *
-     * The algorithm then computes c1 = round(b1 * k / n) and c2 = round(b2 * k / n), and gives
-     * k1 = k - (c1*a1 + c2*a2) and k2 = -(c1*b1 + c2*b2). Instead, we use modular arithmetic, and
-     * compute k1 as k - k2 * lambda, avoiding the need for constants a1 and a2.
-     *
-     * g1, g2 are precomputed constants used to replace division with a rounded multiplication
-     * when decomposing the scalar for an endomorphism-based point multiplication.
-     *
-     * The possibility of using precomputed estimates is mentioned in "Guide to Elliptic Curve
-     * Cryptography" (Hankerson, Menezes, Vanstone) in section 3.5.
-     *
-     * The derivation is described in the paper "Efficient Software Implementation of Public-Key
-     * Cryptography on Sensor Networks Using the MSP430X Microcontroller" (Gouvea, Oliveira, Lopez),
-     * Section 4.3 (here we use a somewhat higher-precision estimate):
-     * d = a1*b2 - b1*a2
-     * g1 = round((2^272)*b2/d)
-     * g2 = round((2^272)*b1/d)
-     *
-     * (Note that 'd' is also equal to the curve order here because [a1,b1] and [a2,b2] are found
-     * as outputs of the Extended Euclidean Algorithm on inputs 'order' and 'lambda').
-     */
-    static const unsigned char secp256k1_scalar_consts_minus_b1[32] = {
-        0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-        0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-        0xe4,0x43,0x7e,0xd6,0x01,0x0e,0x88,0x28,
-        0x6f,0x54,0x7f,0xa9,0x0a,0xbf,0xe4,0xc3
-    };
-    static const unsigned char secp256k1_scalar_consts_b2[32] = {
-        0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-        0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-        0x30,0x86,0xd2,0x21,0xa7,0xd4,0x6b,0xcd,
-        0xe8,0x6c,0x90,0xe4,0x92,0x84,0xeb,0x15
-    };
-    static const unsigned char secp256k1_scalar_consts_g1[32] = {
-        0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-        0x00,0x00,0x00,0x00,0x00,0x00,0x30,0x86,
-        0xd2,0x21,0xa7,0xd4,0x6b,0xcd,0xe8,0x6c,
-        0x90,0xe4,0x92,0x84,0xeb,0x15,0x3d,0xab
-    };
-    static const unsigned char secp256k1_scalar_consts_g2[32] = {
-        0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-        0x00,0x00,0x00,0x00,0x00,0x00,0xe4,0x43,
-        0x7e,0xd6,0x01,0x0e,0x88,0x28,0x6f,0x54,
-        0x7f,0xa9,0x0a,0xbf,0xe4,0xc4,0x22,0x12
-    };
-
-    secp256k1_scalar_set_b32(&ret->minus_lambda, secp256k1_scalar_consts_lambda, NULL);
-    secp256k1_scalar_negate(&ret->minus_lambda, &ret->minus_lambda);
-    secp256k1_scalar_set_b32(&ret->minus_b1, secp256k1_scalar_consts_minus_b1, NULL);
-    secp256k1_scalar_set_b32(&ret->minus_b2, secp256k1_scalar_consts_b2, NULL);
-    secp256k1_scalar_negate(&ret->minus_b2, &ret->minus_b2);
-    secp256k1_scalar_set_b32(&ret->g1, secp256k1_scalar_consts_g1, NULL);
-    secp256k1_scalar_set_b32(&ret->g2, secp256k1_scalar_consts_g2, NULL);
-#endif
-
-    /* Set the global pointer. */
-    secp256k1_scalar_consts = ret;
-}
-
-static void secp256k1_scalar_stop(void) {
-    if (secp256k1_scalar_consts == NULL)
-        return;
-
-    secp256k1_scalar_consts_t *c = (secp256k1_scalar_consts_t*)secp256k1_scalar_consts;
-    secp256k1_scalar_consts = NULL;
-    free(c);
-}
-
 #ifndef USE_NUM_NONE
 static void secp256k1_scalar_get_num(secp256k1_num_t *r, const secp256k1_scalar_t *a) {
    unsigned char c[32];
@ -146,12 +31,21 @@ static void secp256k1_scalar_get_num(secp256k1_num_t *r, const secp256k1_scalar_
    secp256k1_num_set_bin(r, c, 32);
 }

+/** secp256k1 curve order, see secp256k1_ecdsa_const_order_as_fe in ecdsa_impl.h */
 static void secp256k1_scalar_order_get_num(secp256k1_num_t *r) {
-    *r = secp256k1_scalar_consts->order;
+    static const unsigned char order[32] = {
+        0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
+        0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFE,
+        0xBA,0xAE,0xDC,0xE6,0xAF,0x48,0xA0,0x3B,
+        0xBF,0xD2,0x5E,0x8C,0xD0,0x36,0x41,0x41
+    };
+    secp256k1_num_set_bin(r, order, 32);
 }
 #endif

 static void secp256k1_scalar_inverse(secp256k1_scalar_t *r, const secp256k1_scalar_t *x) {
+    secp256k1_scalar_t *t;
+    int i;
    /* First compute x ^ (2^N - 1) for some values of N. */
    secp256k1_scalar_t x2, x3, x4, x6, x7, x8, x15, x30, x60, x120, x127;

@ -175,129 +69,129 @@ static void secp256k1_scalar_inverse(secp256k1_scalar_t *r, const secp256k1_scal
    secp256k1_scalar_mul(&x8, &x8,  x);

    secp256k1_scalar_sqr(&x15, &x8);
-    for (int i=0; i<6; i++)
+    for (i = 0; i < 6; i++)
        secp256k1_scalar_sqr(&x15, &x15);
    secp256k1_scalar_mul(&x15, &x15, &x7);

    secp256k1_scalar_sqr(&x30, &x15);
-    for (int i=0; i<14; i++)
+    for (i = 0; i < 14; i++)
        secp256k1_scalar_sqr(&x30, &x30);
    secp256k1_scalar_mul(&x30, &x30, &x15);

    secp256k1_scalar_sqr(&x60, &x30);
-    for (int i=0; i<29; i++)
+    for (i = 0; i < 29; i++)
        secp256k1_scalar_sqr(&x60, &x60);
    secp256k1_scalar_mul(&x60, &x60, &x30);

    secp256k1_scalar_sqr(&x120, &x60);
-    for (int i=0; i<59; i++)
+    for (i = 0; i < 59; i++)
        secp256k1_scalar_sqr(&x120, &x120);
    secp256k1_scalar_mul(&x120, &x120, &x60);

    secp256k1_scalar_sqr(&x127, &x120);
-    for (int i=0; i<6; i++)
+    for (i = 0; i < 6; i++)
        secp256k1_scalar_sqr(&x127, &x127);
    secp256k1_scalar_mul(&x127, &x127, &x7);

    /* Then accumulate the final result (t starts at x127). */
-    secp256k1_scalar_t *t = &x127;
-    for (int i=0; i<2; i++) /* 0 */
+    t = &x127;
+    for (i = 0; i < 2; i++) /* 0 */
        secp256k1_scalar_sqr(t, t);
    secp256k1_scalar_mul(t, t, x); /* 1 */
-    for (int i=0; i<4; i++) /* 0 */
+    for (i = 0; i < 4; i++) /* 0 */
        secp256k1_scalar_sqr(t, t);
    secp256k1_scalar_mul(t, t, &x3); /* 111 */
-    for (int i=0; i<2; i++) /* 0 */
+    for (i = 0; i < 2; i++) /* 0 */
        secp256k1_scalar_sqr(t, t);
    secp256k1_scalar_mul(t, t, x); /* 1 */
-    for (int i=0; i<2; i++) /* 0 */
+    for (i = 0; i < 2; i++) /* 0 */
        secp256k1_scalar_sqr(t, t);
    secp256k1_scalar_mul(t, t, x); /* 1 */
-    for (int i=0; i<2; i++) /* 0 */
+    for (i = 0; i < 2; i++) /* 0 */
        secp256k1_scalar_sqr(t, t);
    secp256k1_scalar_mul(t, t, x); /* 1 */
-    for (int i=0; i<4; i++) /* 0 */
+    for (i = 0; i < 4; i++) /* 0 */
        secp256k1_scalar_sqr(t, t);
    secp256k1_scalar_mul(t, t, &x3); /* 111 */
-    for (int i=0; i<3; i++) /* 0 */
+    for (i = 0; i < 3; i++) /* 0 */
        secp256k1_scalar_sqr(t, t);
    secp256k1_scalar_mul(t, t, &x2); /* 11 */
-    for (int i=0; i<4; i++) /* 0 */
+    for (i = 0; i < 4; i++) /* 0 */
        secp256k1_scalar_sqr(t, t);
    secp256k1_scalar_mul(t, t, &x3); /* 111 */
-    for (int i=0; i<5; i++) /* 00 */
+    for (i = 0; i < 5; i++) /* 00 */
        secp256k1_scalar_sqr(t, t);
    secp256k1_scalar_mul(t, t, &x3); /* 111 */
-    for (int i=0; i<4; i++) /* 00 */
+    for (i = 0; i < 4; i++) /* 00 */
        secp256k1_scalar_sqr(t, t);
    secp256k1_scalar_mul(t, t, &x2); /* 11 */
-    for (int i=0; i<2; i++) /* 0 */
+    for (i = 0; i < 2; i++) /* 0 */
        secp256k1_scalar_sqr(t, t);
    secp256k1_scalar_mul(t, t, x); /* 1 */
-    for (int i=0; i<2; i++) /* 0 */
+    for (i = 0; i < 2; i++) /* 0 */
        secp256k1_scalar_sqr(t, t);
    secp256k1_scalar_mul(t, t, x); /* 1 */
-    for (int i=0; i<5; i++) /* 0 */
+    for (i = 0; i < 5; i++) /* 0 */
        secp256k1_scalar_sqr(t, t);
    secp256k1_scalar_mul(t, t, &x4); /* 1111 */
-    for (int i=0; i<2; i++) /* 0 */
+    for (i = 0; i < 2; i++) /* 0 */
        secp256k1_scalar_sqr(t, t);
    secp256k1_scalar_mul(t, t, x); /* 1 */
-    for (int i=0; i<3; i++) /* 00 */
+    for (i = 0; i < 3; i++) /* 00 */
        secp256k1_scalar_sqr(t, t);
    secp256k1_scalar_mul(t, t, x); /* 1 */
-    for (int i=0; i<4; i++) /* 000 */
+    for (i = 0; i < 4; i++) /* 000 */
        secp256k1_scalar_sqr(t, t);
    secp256k1_scalar_mul(t, t, x); /* 1 */
-    for (int i=0; i<2; i++) /* 0 */
+    for (i = 0; i < 2; i++) /* 0 */
        secp256k1_scalar_sqr(t, t);
    secp256k1_scalar_mul(t, t, x); /* 1 */
-    for (int i=0; i<10; i++) /* 0000000 */
+    for (i = 0; i < 10; i++) /* 0000000 */
        secp256k1_scalar_sqr(t, t);
    secp256k1_scalar_mul(t, t, &x3); /* 111 */
-    for (int i=0; i<4; i++) /* 0 */
+    for (i = 0; i < 4; i++) /* 0 */
        secp256k1_scalar_sqr(t, t);
    secp256k1_scalar_mul(t, t, &x3); /* 111 */
-    for (int i=0; i<9; i++) /* 0 */
+    for (i = 0; i < 9; i++) /* 0 */
        secp256k1_scalar_sqr(t, t);
    secp256k1_scalar_mul(t, t, &x8); /* 11111111 */
-    for (int i=0; i<2; i++) /* 0 */
+    for (i = 0; i < 2; i++) /* 0 */
        secp256k1_scalar_sqr(t, t);
    secp256k1_scalar_mul(t, t, x); /* 1 */
-    for (int i=0; i<3; i++) /* 00 */
+    for (i = 0; i < 3; i++) /* 00 */
        secp256k1_scalar_sqr(t, t);
    secp256k1_scalar_mul(t, t, x); /* 1 */
-    for (int i=0; i<3; i++) /* 00 */
+    for (i = 0; i < 3; i++) /* 00 */
        secp256k1_scalar_sqr(t, t);
    secp256k1_scalar_mul(t, t, x); /* 1 */
-    for (int i=0; i<5; i++) /* 0 */
+    for (i = 0; i < 5; i++) /* 0 */
        secp256k1_scalar_sqr(t, t);
    secp256k1_scalar_mul(t, t, &x4); /* 1111 */
-    for (int i=0; i<2; i++) /* 0 */
+    for (i = 0; i < 2; i++) /* 0 */
        secp256k1_scalar_sqr(t, t);
    secp256k1_scalar_mul(t, t, x); /* 1 */
-    for (int i=0; i<5; i++) /* 000 */
+    for (i = 0; i < 5; i++) /* 000 */
        secp256k1_scalar_sqr(t, t);
    secp256k1_scalar_mul(t, t, &x2); /* 11 */
-    for (int i=0; i<4; i++) /* 00 */
+    for (i = 0; i < 4; i++) /* 00 */
        secp256k1_scalar_sqr(t, t);
    secp256k1_scalar_mul(t, t, &x2); /* 11 */
-    for (int i=0; i<2; i++) /* 0 */
+    for (i = 0; i < 2; i++) /* 0 */
        secp256k1_scalar_sqr(t, t);
    secp256k1_scalar_mul(t, t, x); /* 1 */
-    for (int i=0; i<8; i++) /* 000000 */
+    for (i = 0; i < 8; i++) /* 000000 */
        secp256k1_scalar_sqr(t, t);
    secp256k1_scalar_mul(t, t, &x2); /* 11 */
-    for (int i=0; i<3; i++) /* 0 */
+    for (i = 0; i < 3; i++) /* 0 */
        secp256k1_scalar_sqr(t, t);
    secp256k1_scalar_mul(t, t, &x2); /* 11 */
-    for (int i=0; i<3; i++) /* 00 */
+    for (i = 0; i < 3; i++) /* 00 */
        secp256k1_scalar_sqr(t, t);
    secp256k1_scalar_mul(t, t, x); /* 1 */
-    for (int i=0; i<6; i++) /* 00000 */
+    for (i = 0; i < 6; i++) /* 00000 */
        secp256k1_scalar_sqr(t, t);
    secp256k1_scalar_mul(t, t, x); /* 1 */
-    for (int i=0; i<8; i++) /* 00 */
+    for (i = 0; i < 8; i++) /* 00 */
        secp256k1_scalar_sqr(t, t);
    secp256k1_scalar_mul(r, t, &x6); /* 111111 */
 }
@ -307,10 +201,11 @@ static void secp256k1_scalar_inverse_var(secp256k1_scalar_t *r, const secp256k1_
    secp256k1_scalar_inverse(r, x);
 #elif defined(USE_SCALAR_INV_NUM)
    unsigned char b[32];
+    secp256k1_num_t n, m;
    secp256k1_scalar_get_b32(b, x);
-    secp256k1_num_t n;
    secp256k1_num_set_bin(&n, b, 32);
-    secp256k1_num_mod_inverse(&n, &n, &secp256k1_scalar_consts->order);
+    secp256k1_scalar_order_get_num(&m);
+    secp256k1_num_mod_inverse(&n, &n, &m);
    secp256k1_num_get_bin(b, 32, &n);
    secp256k1_scalar_set_b32(r, b, NULL);
 #else
@ -319,16 +214,74 @@ static void secp256k1_scalar_inverse_var(secp256k1_scalar_t *r, const secp256k1_
 }

 #ifdef USE_ENDOMORPHISM
+/**
+ * The secp256k1 curve has an endomorphism such that lambda * (x, y) = (beta * x, y), where
+ * lambda is {0x53,0x63,0xad,0x4c,0xc0,0x5c,0x30,0xe0,0xa5,0x26,0x1c,0x02,0x88,0x12,0x64,0x5a,
+ *            0x12,0x2e,0x22,0xea,0x20,0x81,0x66,0x78,0xdf,0x02,0x96,0x7c,0x1b,0x23,0xbd,0x72}
+ *
+ * "Guide to Elliptic Curve Cryptography" (Hankerson, Menezes, Vanstone) gives an algorithm
+ * (algorithm 3.74) to find k1 and k2 given k, such that k1 + k2 * lambda == k mod n, and k1
+ * and k2 have a small size.
+ * It relies on constants a1, b1, a2, b2. These constants for the value of lambda above are:
+ *
+ * - a1 =      {0x30,0x86,0xd2,0x21,0xa7,0xd4,0x6b,0xcd,0xe8,0x6c,0x90,0xe4,0x92,0x84,0xeb,0x15}
+ * - b1 =     -{0xe4,0x43,0x7e,0xd6,0x01,0x0e,0x88,0x28,0x6f,0x54,0x7f,0xa9,0x0a,0xbf,0xe4,0xc3}
+ * - a2 = {0x01,0x14,0xca,0x50,0xf7,0xa8,0xe2,0xf3,0xf6,0x57,0xc1,0x10,0x8d,0x9d,0x44,0xcf,0xd8}
+ * - b2 =      {0x30,0x86,0xd2,0x21,0xa7,0xd4,0x6b,0xcd,0xe8,0x6c,0x90,0xe4,0x92,0x84,0xeb,0x15}
+ *
+ * The algorithm then computes c1 = round(b2 * k / n) and c2 = round(-b1 * k / n), and gives
+ * k1 = k - (c1*a1 + c2*a2) and k2 = -(c1*b1 + c2*b2). Instead, we use modular arithmetic, and
+ * compute k1 as k - k2 * lambda, avoiding the need for constants a1 and a2.
+ *
+ * g1, g2 are precomputed constants used to replace division with a rounded multiplication
+ * when decomposing the scalar for an endomorphism-based point multiplication.
+ *
+ * The possibility of using precomputed estimates is mentioned in "Guide to Elliptic Curve
+ * Cryptography" (Hankerson, Menezes, Vanstone) in section 3.5.
+ *
+ * The derivation is described in the paper "Efficient Software Implementation of Public-Key
+ * Cryptography on Sensor Networks Using the MSP430X Microcontroller" (Gouvea, Oliveira, Lopez),
+ * Section 4.3 (here we use a somewhat higher-precision estimate):
+ * d = a1*b2 - b1*a2
+ * g1 = round((2^272)*b2/d)
+ * g2 = round((2^272)*(-b1)/d)
+ *
+ * (Note that 'd' is also equal to the curve order here because [a1,b1] and [a2,b2] are found
+ * as outputs of the Extended Euclidean Algorithm on inputs 'order' and 'lambda').
+ *
+ * The function below splits a into r1 and r2, such that r1 + lambda * r2 == a (mod order).
+ */
+
 static void secp256k1_scalar_split_lambda_var(secp256k1_scalar_t *r1, secp256k1_scalar_t *r2, const secp256k1_scalar_t *a) {
+    secp256k1_scalar_t c1, c2;
+    static const secp256k1_scalar_t minus_lambda = SECP256K1_SCALAR_CONST(
+        0xAC9C52B3UL, 0x3FA3CF1FUL, 0x5AD9E3FDUL, 0x77ED9BA4UL,
+        0xA880B9FCUL, 0x8EC739C2UL, 0xE0CFC810UL, 0xB51283CFUL
+    );
+    static const secp256k1_scalar_t minus_b1 = SECP256K1_SCALAR_CONST(
+        0x00000000UL, 0x00000000UL, 0x00000000UL, 0x00000000UL,
+        0xE4437ED6UL, 0x010E8828UL, 0x6F547FA9UL, 0x0ABFE4C3UL
+    );
+    static const secp256k1_scalar_t minus_b2 = SECP256K1_SCALAR_CONST(
+        0xFFFFFFFFUL, 0xFFFFFFFFUL, 0xFFFFFFFFUL, 0xFFFFFFFEUL,
+        0x8A280AC5UL, 0x0774346DUL, 0xD765CDA8UL, 0x3DB1562CUL
+    );
+    static const secp256k1_scalar_t g1 = SECP256K1_SCALAR_CONST(
+        0x00000000UL, 0x00000000UL, 0x00000000UL, 0x00003086UL,
+        0xD221A7D4UL, 0x6BCDE86CUL, 0x90E49284UL, 0xEB153DABUL
+    );
+    static const secp256k1_scalar_t g2 = SECP256K1_SCALAR_CONST(
+        0x00000000UL, 0x00000000UL, 0x00000000UL, 0x0000E443UL,
+        0x7ED6010EUL, 0x88286F54UL, 0x7FA90ABFUL, 0xE4C42212UL
+    );
    VERIFY_CHECK(r1 != a);
    VERIFY_CHECK(r2 != a);
-    secp256k1_scalar_t c1, c2;
-    secp256k1_scalar_mul_shift_var(&c1, a, &secp256k1_scalar_consts->g1, 272);
-    secp256k1_scalar_mul_shift_var(&c2, a, &secp256k1_scalar_consts->g2, 272);
-    secp256k1_scalar_mul(&c1, &c1, &secp256k1_scalar_consts->minus_b1);
-    secp256k1_scalar_mul(&c2, &c2, &secp256k1_scalar_consts->minus_b2);
+    secp256k1_scalar_mul_shift_var(&c1, a, &g1, 272);
+    secp256k1_scalar_mul_shift_var(&c2, a, &g2, 272);
+    secp256k1_scalar_mul(&c1, &c1, &minus_b1);
+    secp256k1_scalar_mul(&c2, &c2, &minus_b2);
    secp256k1_scalar_add(r2, &c1, &c2);
-    secp256k1_scalar_mul(r1, r2, &secp256k1_scalar_consts->minus_lambda);
+    secp256k1_scalar_mul(r1, r2, &minus_lambda);
    secp256k1_scalar_add(r1, r1, a);
 }
 #endif
--- a/src/secp256k1.c
+++ b/src/secp256k1.c
@ -20,10 +20,6 @@
 #include "hash_impl.h"

 void secp256k1_start(unsigned int flags) {
-    secp256k1_fe_start();
-    secp256k1_ge_start();
-    secp256k1_scalar_start();
-    secp256k1_ecdsa_start();
    if (flags & SECP256K1_START_SIGN) {
        secp256k1_ecmult_gen_start();
    }
@ -35,46 +31,43 @@ void secp256k1_start(unsigned int flags) {
 void secp256k1_stop(void) {
    secp256k1_ecmult_stop();
    secp256k1_ecmult_gen_stop();
-    secp256k1_ecdsa_stop();
-    secp256k1_scalar_stop();
-    secp256k1_ge_stop();
-    secp256k1_fe_stop();
 }

 int secp256k1_ecdsa_verify(const unsigned char *msg32, const unsigned char *sig, int siglen, const unsigned char *pubkey, int pubkeylen) {
+    secp256k1_ge_t q;
+    secp256k1_ecdsa_sig_t s;
+    secp256k1_scalar_t m;
+    int ret = -3;
    DEBUG_CHECK(secp256k1_ecmult_consts != NULL);
    DEBUG_CHECK(msg32 != NULL);
    DEBUG_CHECK(sig != NULL);
    DEBUG_CHECK(pubkey != NULL);

-    int ret = -3;
-    secp256k1_scalar_t m;
-    secp256k1_ecdsa_sig_t s;
-    secp256k1_ge_t q;
    secp256k1_scalar_set_b32(&m, msg32, NULL);

-    if (!secp256k1_eckey_pubkey_parse(&q, pubkey, pubkeylen)) {
+    if (secp256k1_eckey_pubkey_parse(&q, pubkey, pubkeylen)) {
+        if (secp256k1_ecdsa_sig_parse(&s, sig, siglen)) {
+            if (secp256k1_ecdsa_sig_verify(&s, &q, &m)) {
+                /* 1 means success; every other return value is a failure. */
+                ret = 1;
+            } else {
+                ret = 0;
+            }
+        } else {
+            ret = -2;
+        }
+    } else {
        ret = -1;
-        goto end;
-    }
-    if (!secp256k1_ecdsa_sig_parse(&s, sig, siglen)) {
-        ret = -2;
-        goto end;
    }
-    if (!secp256k1_ecdsa_sig_verify(&s, &q, &m)) {
-        ret = 0;
-        goto end;
-    }
-    ret = 1;
-end:
+
    return ret;
 }

 static int nonce_function_rfc6979(unsigned char *nonce32, const unsigned char *msg32, const unsigned char *key32, unsigned int counter, const void *data) {
-   (void)data;
   secp256k1_rfc6979_hmac_sha256_t rng;
-   secp256k1_rfc6979_hmac_sha256_initialize(&rng, key32, 32, msg32, 32);
-   for (unsigned int i = 0; i <= counter; i++) {
+   unsigned int i;
+   secp256k1_rfc6979_hmac_sha256_initialize(&rng, key32, 32, msg32, 32, data, data != NULL ? 32 : 0);
+   for (i = 0; i <= counter; i++) {
       secp256k1_rfc6979_hmac_sha256_generate(&rng, nonce32, 32);
   }
   secp256k1_rfc6979_hmac_sha256_finalize(&rng);
@ -85,6 +78,11 @@ const secp256k1_nonce_function_t secp256k1_nonce_function_rfc6979 = nonce_functi
 const secp256k1_nonce_function_t secp256k1_nonce_function_default = nonce_function_rfc6979;

 int secp256k1_ecdsa_sign(const unsigned char *msg32, unsigned char *signature, int *signaturelen, const unsigned char *seckey, secp256k1_nonce_function_t noncefp, const void* noncedata) {
+    secp256k1_ecdsa_sig_t sig;
+    secp256k1_scalar_t sec, non, msg;
+    int ret = 0;
+    int overflow = 0;
+    unsigned int count = 0;
    DEBUG_CHECK(secp256k1_ecmult_gen_consts != NULL);
    DEBUG_CHECK(msg32 != NULL);
    DEBUG_CHECK(signature != NULL);
@ -94,38 +92,44 @@ int secp256k1_ecdsa_sign(const unsigned char *msg32, unsigned char *signature, i
        noncefp = secp256k1_nonce_function_default;
    }

-    secp256k1_scalar_t sec, non, msg;
-    secp256k1_scalar_set_b32(&sec, seckey, NULL);
-    secp256k1_scalar_set_b32(&msg, msg32, NULL);
-    int overflow = 0;
-    int ret = 0;
-    unsigned int count = 0;
-    secp256k1_ecdsa_sig_t sig;
-    while (1) {
-        unsigned char nonce32[32];
-        ret = noncefp(nonce32, msg32, seckey, count, noncedata);
-        if (!ret) {
-            break;
-        }
-        secp256k1_scalar_set_b32(&non, nonce32, &overflow);
-        memset(nonce32, 0, 32);
-        if (!secp256k1_scalar_is_zero(&non) && !overflow) {
-            if (secp256k1_ecdsa_sig_sign(&sig, &sec, &msg, &non, NULL)) {
+    secp256k1_scalar_set_b32(&sec, seckey, &overflow);
+    /* Fail if the secret key is invalid. */
+    if (!overflow && !secp256k1_scalar_is_zero(&sec)) {
+        secp256k1_scalar_set_b32(&msg, msg32, NULL);
+        while (1) {
+            unsigned char nonce32[32];
+            ret = noncefp(nonce32, msg32, seckey, count, noncedata);
+            if (!ret) {
                break;
            }
+            secp256k1_scalar_set_b32(&non, nonce32, &overflow);
+            memset(nonce32, 0, 32);
+            if (!secp256k1_scalar_is_zero(&non) && !overflow) {
+                if (secp256k1_ecdsa_sig_sign(&sig, &sec, &msg, &non, NULL)) {
+                    break;
+                }
+            }
+            count++;
+        }
+        if (ret) {
+            ret = secp256k1_ecdsa_sig_serialize(signature, signaturelen, &sig);
        }
-        count++;
+        secp256k1_scalar_clear(&msg);
+        secp256k1_scalar_clear(&non);
+        secp256k1_scalar_clear(&sec);
    }
-    if (ret) {
-        ret = secp256k1_ecdsa_sig_serialize(signature, signaturelen, &sig);
+    if (!ret) {
+        *signaturelen = 0;
    }
-    secp256k1_scalar_clear(&msg);
-    secp256k1_scalar_clear(&non);
-    secp256k1_scalar_clear(&sec);
    return ret;
 }

 int secp256k1_ecdsa_sign_compact(const unsigned char *msg32, unsigned char *sig64, const unsigned char *seckey, secp256k1_nonce_function_t noncefp, const void* noncedata, int *recid) {
+    secp256k1_ecdsa_sig_t sig;
+    secp256k1_scalar_t sec, non, msg;
+    int ret = 0;
+    int overflow = 0;
+    unsigned int count = 0;
    DEBUG_CHECK(secp256k1_ecmult_gen_consts != NULL);
    DEBUG_CHECK(msg32 != NULL);
    DEBUG_CHECK(sig64 != NULL);
@ -134,39 +138,45 @@ int secp256k1_ecdsa_sign_compact(const unsigned char *msg32, unsigned char *sig6
        noncefp = secp256k1_nonce_function_default;
    }

-    secp256k1_scalar_t sec, non, msg;
-    secp256k1_scalar_set_b32(&sec, seckey, NULL);
-    secp256k1_scalar_set_b32(&msg, msg32, NULL);
-    int overflow = 0;
-    int ret = 0;
-    unsigned int count = 0;
-    secp256k1_ecdsa_sig_t sig;
-    while (1) {
-        unsigned char nonce32[32];
-        ret = noncefp(nonce32, msg32, seckey, count, noncedata);
-        if (!ret) {
-            break;
-        }
-        secp256k1_scalar_set_b32(&non, nonce32, &overflow);
-        memset(nonce32, 0, 32);
-        if (!secp256k1_scalar_is_zero(&non) && !overflow) {
-            if (secp256k1_ecdsa_sig_sign(&sig, &sec, &msg, &non, recid)) {
+    secp256k1_scalar_set_b32(&sec, seckey, &overflow);
+    /* Fail if the secret key is invalid. */
+    if (!overflow && !secp256k1_scalar_is_zero(&sec)) {
+        secp256k1_scalar_set_b32(&msg, msg32, NULL);
+        while (1) {
+            unsigned char nonce32[32];
+            ret = noncefp(nonce32, msg32, seckey, count, noncedata);
+            if (!ret) {
                break;
            }
+            secp256k1_scalar_set_b32(&non, nonce32, &overflow);
+            memset(nonce32, 0, 32);
+            if (!secp256k1_scalar_is_zero(&non) && !overflow) {
+                if (secp256k1_ecdsa_sig_sign(&sig, &sec, &msg, &non, recid)) {
+                    break;
+                }
+            }
+            count++;
+        }
+        if (ret) {
+            secp256k1_scalar_get_b32(sig64, &sig.r);
+            secp256k1_scalar_get_b32(sig64 + 32, &sig.s);
        }
-        count++;
+        secp256k1_scalar_clear(&msg);
+        secp256k1_scalar_clear(&non);
+        secp256k1_scalar_clear(&sec);
    }
-    if (ret) {
-        secp256k1_scalar_get_b32(sig64, &sig.r);
-        secp256k1_scalar_get_b32(sig64 + 32, &sig.s);
+    if (!ret) {
+        memset(sig64, 0, 64);
    }
-    secp256k1_scalar_clear(&msg);
-    secp256k1_scalar_clear(&non);
-    secp256k1_scalar_clear(&sec);
    return ret;
 }

 int secp256k1_ecdsa_recover_compact(const unsigned char *msg32, const unsigned char *sig64, unsigned char *pubkey, int *pubkeylen, int compressed, int recid) {
+    secp256k1_ge_t q;
+    secp256k1_ecdsa_sig_t sig;
+    secp256k1_scalar_t m;
+    int ret = 0;
+    int overflow = 0;
    DEBUG_CHECK(secp256k1_ecmult_consts != NULL);
    DEBUG_CHECK(msg32 != NULL);
    DEBUG_CHECK(sig64 != NULL);
@ -174,82 +184,87 @@ int secp256k1_ecdsa_recover_compact(const unsigned char *msg32, const unsigned c
    DEBUG_CHECK(pubkeylen != NULL);
    DEBUG_CHECK(recid >= 0 && recid <= 3);

-    int ret = 0;
-    secp256k1_scalar_t m;
-    secp256k1_ecdsa_sig_t sig;
-    int overflow = 0;
    secp256k1_scalar_set_b32(&sig.r, sig64, &overflow);
-    if (overflow) {
-        return 0;
-    }
-    secp256k1_scalar_set_b32(&sig.s, sig64 + 32, &overflow);
-    if (overflow) {
-        return 0;
-    }
-    secp256k1_scalar_set_b32(&m, msg32, NULL);
+    if (!overflow) {
+        secp256k1_scalar_set_b32(&sig.s, sig64 + 32, &overflow);
+        if (!overflow) {
+            secp256k1_scalar_set_b32(&m, msg32, NULL);

-    secp256k1_ge_t q;
-    if (secp256k1_ecdsa_sig_recover(&sig, &q, &m, recid)) {
-        ret = secp256k1_eckey_pubkey_serialize(&q, pubkey, pubkeylen, compressed);
+            if (secp256k1_ecdsa_sig_recover(&sig, &q, &m, recid)) {
+                ret = secp256k1_eckey_pubkey_serialize(&q, pubkey, pubkeylen, compressed);
+            }
+        }
    }
    return ret;
 }

 int secp256k1_ec_seckey_verify(const unsigned char *seckey) {
-    DEBUG_CHECK(seckey != NULL);
-
    secp256k1_scalar_t sec;
+    int ret;
    int overflow;
+    DEBUG_CHECK(seckey != NULL);
+
    secp256k1_scalar_set_b32(&sec, seckey, &overflow);
-    int ret = !secp256k1_scalar_is_zero(&sec) && !overflow;
+    ret = !secp256k1_scalar_is_zero(&sec) && !overflow;
    secp256k1_scalar_clear(&sec);
    return ret;
 }

 int secp256k1_ec_pubkey_verify(const unsigned char *pubkey, int pubkeylen) {
+    secp256k1_ge_t q;
    DEBUG_CHECK(pubkey != NULL);

-    secp256k1_ge_t q;
    return secp256k1_eckey_pubkey_parse(&q, pubkey, pubkeylen);
 }

 int secp256k1_ec_pubkey_create(unsigned char *pubkey, int *pubkeylen, const unsigned char *seckey, int compressed) {
+    secp256k1_gej_t pj;
+    secp256k1_ge_t p;
+    secp256k1_scalar_t sec;
+    int overflow;
+    int ret = 0;
    DEBUG_CHECK(secp256k1_ecmult_gen_consts != NULL);
    DEBUG_CHECK(pubkey != NULL);
    DEBUG_CHECK(pubkeylen != NULL);
    DEBUG_CHECK(seckey != NULL);

-    secp256k1_scalar_t sec;
-    secp256k1_scalar_set_b32(&sec, seckey, NULL);
-    secp256k1_gej_t pj;
-    secp256k1_ecmult_gen(&pj, &sec);
-    secp256k1_scalar_clear(&sec);
-    secp256k1_ge_t p;
-    secp256k1_ge_set_gej(&p, &pj);
-    return secp256k1_eckey_pubkey_serialize(&p, pubkey, pubkeylen, compressed);
+    secp256k1_scalar_set_b32(&sec, seckey, &overflow);
+    if (!overflow) {
+        secp256k1_ecmult_gen(&pj, &sec);
+        secp256k1_scalar_clear(&sec);
+        secp256k1_ge_set_gej(&p, &pj);
+        ret = secp256k1_eckey_pubkey_serialize(&p, pubkey, pubkeylen, compressed);
+    }
+    if (!ret) {
+        *pubkeylen = 0;
+    }
+    return ret;
 }

 int secp256k1_ec_pubkey_decompress(unsigned char *pubkey, int *pubkeylen) {
+    secp256k1_ge_t p;
+    int ret = 0;
    DEBUG_CHECK(pubkey != NULL);
    DEBUG_CHECK(pubkeylen != NULL);

-    secp256k1_ge_t p;
-    if (!secp256k1_eckey_pubkey_parse(&p, pubkey, *pubkeylen))
-        return 0;
-    return secp256k1_eckey_pubkey_serialize(&p, pubkey, pubkeylen, 0);
+    if (secp256k1_eckey_pubkey_parse(&p, pubkey, *pubkeylen)) {
+        ret = secp256k1_eckey_pubkey_serialize(&p, pubkey, pubkeylen, 0);
+    }
+    return ret;
 }

 int secp256k1_ec_privkey_tweak_add(unsigned char *seckey, const unsigned char *tweak) {
+    secp256k1_scalar_t term;
+    secp256k1_scalar_t sec;
+    int ret = 0;
+    int overflow = 0;
    DEBUG_CHECK(seckey != NULL);
    DEBUG_CHECK(tweak != NULL);

-    secp256k1_scalar_t term;
-    int overflow = 0;
    secp256k1_scalar_set_b32(&term, tweak, &overflow);
-    secp256k1_scalar_t sec;
    secp256k1_scalar_set_b32(&sec, seckey, NULL);

-    int ret = secp256k1_eckey_privkey_tweak_add(&sec, &term) && !overflow;
+    ret = secp256k1_eckey_privkey_tweak_add(&sec, &term) && !overflow;
    if (ret) {
        secp256k1_scalar_get_b32(seckey, &sec);
    }
@@ -260,40 +275,41 @@ int secp256k1_ec_privkey_tweak_add(unsigned char *seckey, const unsigned char *t
 }

 int secp256k1_ec_pubkey_tweak_add(unsigned char *pubkey, int pubkeylen, const unsigned char *tweak) {
+    secp256k1_ge_t p;
+    secp256k1_scalar_t term;
+    int ret = 0;
+    int overflow = 0;
    DEBUG_CHECK(secp256k1_ecmult_consts != NULL);
    DEBUG_CHECK(pubkey != NULL);
    DEBUG_CHECK(tweak != NULL);

-    secp256k1_scalar_t term;
-    int overflow = 0;
    secp256k1_scalar_set_b32(&term, tweak, &overflow);
-    if (overflow) {
-        return 0;
-    }
-    secp256k1_ge_t p;
-    int ret = secp256k1_eckey_pubkey_parse(&p, pubkey, pubkeylen);
-    if (ret) {
-        ret = secp256k1_eckey_pubkey_tweak_add(&p, &term);
-    }
-    if (ret) {
-        int oldlen = pubkeylen;
-        ret = secp256k1_eckey_pubkey_serialize(&p, pubkey, &pubkeylen, oldlen <= 33);
-        VERIFY_CHECK(pubkeylen == oldlen);
+    if (!overflow) {
+        ret = secp256k1_eckey_pubkey_parse(&p, pubkey, pubkeylen);
+        if (ret) {
+            ret = secp256k1_eckey_pubkey_tweak_add(&p, &term);
+        }
+        if (ret) {
+            int oldlen = pubkeylen;
+            ret = secp256k1_eckey_pubkey_serialize(&p, pubkey, &pubkeylen, oldlen <= 33);
+            VERIFY_CHECK(pubkeylen == oldlen);
+        }
    }

    return ret;
 }

 int secp256k1_ec_privkey_tweak_mul(unsigned char *seckey, const unsigned char *tweak) {
+    secp256k1_scalar_t factor;
+    secp256k1_scalar_t sec;
+    int ret = 0;
+    int overflow = 0;
    DEBUG_CHECK(seckey != NULL);
    DEBUG_CHECK(tweak != NULL);

-    secp256k1_scalar_t factor;
-    int overflow = 0;
    secp256k1_scalar_set_b32(&factor, tweak, &overflow);
-    secp256k1_scalar_t sec;
    secp256k1_scalar_set_b32(&sec, seckey, NULL);
-    int ret = secp256k1_eckey_privkey_tweak_mul(&sec, &factor) && !overflow;
+    ret = secp256k1_eckey_privkey_tweak_mul(&sec, &factor) && !overflow;
    if (ret) {
        secp256k1_scalar_get_b32(seckey, &sec);
    }
@@ -304,50 +320,53 @@ int secp256k1_ec_privkey_tweak_mul(unsigned char *seckey, const unsigned char *t
 }

 int secp256k1_ec_pubkey_tweak_mul(unsigned char *pubkey, int pubkeylen, const unsigned char *tweak) {
+    secp256k1_ge_t p;
+    secp256k1_scalar_t factor;
+    int ret = 0;
+    int overflow = 0;
    DEBUG_CHECK(secp256k1_ecmult_consts != NULL);
    DEBUG_CHECK(pubkey != NULL);
    DEBUG_CHECK(tweak != NULL);

-    secp256k1_scalar_t factor;
-    int overflow = 0;
    secp256k1_scalar_set_b32(&factor, tweak, &overflow);
-    if (overflow) {
-        return 0;
-    }
-    secp256k1_ge_t p;
-    int ret = secp256k1_eckey_pubkey_parse(&p, pubkey, pubkeylen);
-    if (ret) {
-        ret = secp256k1_eckey_pubkey_tweak_mul(&p, &factor);
-    }
-    if (ret) {
-        int oldlen = pubkeylen;
-        ret = secp256k1_eckey_pubkey_serialize(&p, pubkey, &pubkeylen, oldlen <= 33);
-        VERIFY_CHECK(pubkeylen == oldlen);
+    if (!overflow) {
+        ret = secp256k1_eckey_pubkey_parse(&p, pubkey, pubkeylen);
+        if (ret) {
+            ret = secp256k1_eckey_pubkey_tweak_mul(&p, &factor);
+        }
+        if (ret) {
+            int oldlen = pubkeylen;
+            ret = secp256k1_eckey_pubkey_serialize(&p, pubkey, &pubkeylen, oldlen <= 33);
+            VERIFY_CHECK(pubkeylen == oldlen);
+        }
    }

    return ret;
 }

 int secp256k1_ec_privkey_export(const unsigned char *seckey, unsigned char *privkey, int *privkeylen, int compressed) {
+    secp256k1_scalar_t key;
+    int ret = 0;
    DEBUG_CHECK(seckey != NULL);
    DEBUG_CHECK(privkey != NULL);
    DEBUG_CHECK(privkeylen != NULL);

-    secp256k1_scalar_t key;
    secp256k1_scalar_set_b32(&key, seckey, NULL);
-    int ret = secp256k1_eckey_privkey_serialize(privkey, privkeylen, &key, compressed);
+    ret = secp256k1_eckey_privkey_serialize(privkey, privkeylen, &key, compressed);
    secp256k1_scalar_clear(&key);
    return ret;
 }

 int secp256k1_ec_privkey_import(unsigned char *seckey, const unsigned char *privkey, int privkeylen) {
+    secp256k1_scalar_t key;
+    int ret = 0;
    DEBUG_CHECK(seckey != NULL);
    DEBUG_CHECK(privkey != NULL);

-    secp256k1_scalar_t key;
-    int ret = secp256k1_eckey_privkey_parse(&key, privkey, privkeylen);
-    if (ret)
+    ret = secp256k1_eckey_privkey_parse(&key, privkey, privkeylen);
+    if (ret) {
        secp256k1_scalar_get_b32(seckey, &key);
+    }
    secp256k1_scalar_clear(&key);
    return ret;
 }
--- a/src/testrand.h
+++ b/src/testrand.h
@@ -11,8 +11,10 @@
 #include "libsecp256k1-config.h"
 #endif

-/** Seed the pseudorandom number generator. */
-SECP256K1_INLINE static void secp256k1_rand_seed(uint64_t v);
+/* A non-cryptographic RNG used only for test infrastructure. */
+
+/** Seed the pseudorandom number generator for testing. */
+SECP256K1_INLINE static void secp256k1_rand_seed(const unsigned char *seed16);

 /** Generate a pseudorandom 32-bit number. */
 static uint32_t secp256k1_rand32(void);
--- a/src/testrand_impl.h
+++ b/src/testrand_impl.h
@@ -11,44 +11,44 @@
 #include <string.h>

 #include "testrand.h"
+#include "hash.h"

-static uint32_t secp256k1_Rz = 11, secp256k1_Rw = 11;
+static secp256k1_rfc6979_hmac_sha256_t secp256k1_test_rng;
+static uint32_t secp256k1_test_rng_precomputed[8];
+static int secp256k1_test_rng_precomputed_used = 8;

-SECP256K1_INLINE static void secp256k1_rand_seed(uint64_t v) {
-    secp256k1_Rz = v >> 32;
-    secp256k1_Rw = v;
-
-    if (secp256k1_Rz == 0 || secp256k1_Rz == 0x9068ffffU) {
-        secp256k1_Rz = 111;
-    }
-    if (secp256k1_Rw == 0 || secp256k1_Rw == 0x464fffffU) {
-        secp256k1_Rw = 111;
-    }
+SECP256K1_INLINE static void secp256k1_rand_seed(const unsigned char *seed16) {
+    secp256k1_rfc6979_hmac_sha256_initialize(&secp256k1_test_rng, (const unsigned char*)"TestRNG", 7, seed16, 16, NULL, 0);
 }

 SECP256K1_INLINE static uint32_t secp256k1_rand32(void) {
-    secp256k1_Rz = 36969 * (secp256k1_Rz & 0xFFFF) + (secp256k1_Rz >> 16);
-    secp256k1_Rw = 18000 * (secp256k1_Rw & 0xFFFF) + (secp256k1_Rw >> 16);
-    return (secp256k1_Rw << 16) + (secp256k1_Rw >> 16) + secp256k1_Rz;
+    if (secp256k1_test_rng_precomputed_used == 8) {
+        secp256k1_rfc6979_hmac_sha256_generate(&secp256k1_test_rng, (unsigned char*)(&secp256k1_test_rng_precomputed[0]), sizeof(secp256k1_test_rng_precomputed));
+        secp256k1_test_rng_precomputed_used = 0;
+    }
+    return secp256k1_test_rng_precomputed[secp256k1_test_rng_precomputed_used++];
 }

 static void secp256k1_rand256(unsigned char *b32) {
-    for (int i=0; i<8; i++) {
-        uint32_t r = secp256k1_rand32();
-        b32[i*4 + 0] = (r >>  0) & 0xFF;
-        b32[i*4 + 1] = (r >>  8) & 0xFF;
-        b32[i*4 + 2] = (r >> 16) & 0xFF;
-        b32[i*4 + 3] = (r >> 24) & 0xFF;
-    }
+    secp256k1_rfc6979_hmac_sha256_generate(&secp256k1_test_rng, b32, 32);
 }

 static void secp256k1_rand256_test(unsigned char *b32) {
    int bits=0;
+    uint64_t ent = 0;
+    int entleft = 0;
    memset(b32, 0, 32);
    while (bits < 256) {
-        uint32_t ent = secp256k1_rand32();
-        int now = 1 + ((ent % 64)*((ent >> 6) % 32)+16)/31;
-        uint32_t val = 1 & (ent >> 11);
+        int now;
+        uint32_t val;
+        if (entleft < 12) {
+            ent |= ((uint64_t)secp256k1_rand32()) << entleft;
+            entleft += 32;
+        }
+        now = 1 + ((ent % 64)*((ent >> 6) % 32)+16)/31;
+        val = 1 & (ent >> 11);
+        ent >>= 12;
+        entleft -= 12;
        while (now > 0 && bits < 256) {
            b32[bits / 8] |= val << (bits % 8);
            now--;
--- a/src/tests.c
+++ b/src/tests.c
--- a/src/util.h
+++ b/src/util.h
@@ -27,7 +27,7 @@
 } while(0)
 #endif

-#ifndef HAVE_BUILTIN_EXPECT
+#ifdef HAVE_BUILTIN_EXPECT
 #define EXPECT(x,c) __builtin_expect((x),(c))
 #else
 #define EXPECT(x,c) (x)
@@ -61,7 +61,7 @@
 #define VERIFY_CHECK(cond) do { (void)(cond); } while(0)
 #endif

-static inline void *checked_malloc(size_t size) {
+static SECP256K1_INLINE void *checked_malloc(size_t size) {
    void *ret = malloc(size);
    CHECK(ret != NULL);
    return ret;
@@ -84,4 +84,21 @@ static inline void *checked_malloc(size_t size) {
 # endif
 #endif

+#if defined(_WIN32)
+# define I64FORMAT "I64d"
+# define I64uFORMAT "I64u"
+#else
+# define I64FORMAT "lld"
+# define I64uFORMAT "llu"
+#endif
+
+#if defined(HAVE___INT128)
+# if defined(__GNUC__)
+#  define SECP256K1_GNUC_EXT __extension__
+# else
+#  define SECP256K1_GNUC_EXT
+# endif
+SECP256K1_GNUC_EXT typedef unsigned __int128 uint128_t;
+#endif
+
 #endif