diff --git a/.travis.yml b/.travis.yml
index 3a85e8cba..28cd61dbc 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -1,12 +1,14 @@
-language: cpp
-compiler: gcc
+language: c
+compiler:
+  - clang
+  - gcc
 install:
   - sudo apt-get install -qq libssl-dev
-  - if [ "$BIGNUM" = "gmp" -o "$BIGNUM" = "auto" -o "$FIELD" = "gmp" ]; then sudo apt-get install -qq libgmp-dev; fi
-  - if [ "$FIELD" = "64bit_asm" ]; then sudo apt-get install -qq yasm; fi
+  - if [ "$BIGNUM" = "gmp" -o "$BIGNUM" = "auto" -o "$FIELD" = "gmp" ]; then sudo apt-get install --no-install-recommends --no-upgrade -qq libgmp-dev; fi
+  - if [ -n "$EXTRAPACKAGES" ]; then sudo apt-get update -qq && sudo apt-get install --no-install-recommends --no-upgrade -qq $EXTRAPACKAGES; fi
 env:
   global:
-    - FIELD=auto  BIGNUM=auto  SCALAR=auto  ENDOMORPHISM=no  BUILD=check  EXTRAFLAGS=
+    - FIELD=auto  BIGNUM=auto  SCALAR=auto  ENDOMORPHISM=no  BUILD=check  EXTRAFLAGS= HOST= EXTRAPACKAGES=
   matrix:
     - SCALAR=32bit
     - SCALAR=64bit
@@ -22,6 +24,11 @@ env:
     - BIGNUM=none     ENDOMORPHISM=yes
     - BUILD=distcheck
     - EXTRAFLAGS=CFLAGS=-DDETERMINISTIC
+    - HOST=i686-linux-gnu EXTRAPACKAGES="gcc-multilib"
+    - HOST=i686-linux-gnu EXTRAPACKAGES="gcc-multilib" ENDOMORPHISM=yes
 before_script: ./autogen.sh
-script: ./configure --enable-endomorphism=$ENDOMORPHISM --with-field=$FIELD --with-bignum=$BIGNUM --with-scalar=$SCALAR $EXTRAFLAGS && make -j2 $BUILD
+script:
+ - if [ -n "$HOST" ]; then export USE_HOST="--host=$HOST"; fi
+ - if [ "x$HOST" = "xi686-linux-gnu" ]; then export CC="$CC -m32"; fi
+ - ./configure --enable-endomorphism=$ENDOMORPHISM --with-field=$FIELD --with-bignum=$BIGNUM --with-scalar=$SCALAR $EXTRAFLAGS $USE_HOST && make -j2 $BUILD
 os: linux
diff --git a/Makefile.am b/Makefile.am
index dbf1790f3..390d2c9ff 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -1,12 +1,6 @@
 ACLOCAL_AMFLAGS = -I build-aux/m4
 
 lib_LTLIBRARIES = libsecp256k1.la
-if USE_ASM
-COMMON_LIB = libsecp256k1_common.la
-else
-COMMON_LIB =
-endif
-noinst_LTLIBRARIES = $(COMMON_LIB)
 include_HEADERS = include/secp256k1.h
 noinst_HEADERS =
 noinst_HEADERS += src/scalar.h
@@ -43,30 +37,30 @@ noinst_HEADERS += src/field_gmp.h
 noinst_HEADERS += src/field_gmp_impl.h
 noinst_HEADERS += src/field.h
 noinst_HEADERS += src/field_impl.h
+noinst_HEADERS += src/bench.h
 
 pkgconfigdir = $(libdir)/pkgconfig
 pkgconfig_DATA = libsecp256k1.pc
 
-if USE_ASM
-libsecp256k1_common_la_SOURCES = src/field_5x52_asm.asm
-endif
-
 libsecp256k1_la_SOURCES = src/secp256k1.c
 libsecp256k1_la_CPPFLAGS = -I$(top_srcdir)/include $(SECP_INCLUDES)
-libsecp256k1_la_LIBADD = $(COMMON_LIB) $(SECP_LIBS)
+libsecp256k1_la_LIBADD = $(SECP_LIBS)
 
 
 noinst_PROGRAMS =
 if USE_BENCHMARK
-noinst_PROGRAMS += bench_verify bench_sign bench_inv
+noinst_PROGRAMS += bench_verify bench_recover bench_sign bench_inv
 bench_verify_SOURCES = src/bench_verify.c
 bench_verify_LDADD = libsecp256k1.la $(SECP_LIBS)
 bench_verify_LDFLAGS = -static
+bench_recover_SOURCES = src/bench_recover.c
+bench_recover_LDADD = libsecp256k1.la $(SECP_LIBS)
+bench_recover_LDFLAGS = -static
 bench_sign_SOURCES = src/bench_sign.c
 bench_sign_LDADD = libsecp256k1.la $(SECP_LIBS)
 bench_sign_LDFLAGS = -static
 bench_inv_SOURCES = src/bench_inv.c
-bench_inv_LDADD = $(COMMON_LIB) $(SECP_LIBS)
+bench_inv_LDADD = $(SECP_LIBS)
 bench_inv_LDFLAGS = -static
 bench_inv_CPPFLAGS = $(SECP_INCLUDES)
 endif
@@ -75,15 +69,9 @@ if USE_TESTS
 noinst_PROGRAMS += tests
 tests_SOURCES = src/tests.c
 tests_CPPFLAGS = -DVERIFY $(SECP_INCLUDES) $(SECP_TEST_INCLUDES)
-tests_LDADD = $(COMMON_LIB) $(SECP_LIBS) $(SECP_TEST_LIBS)
+tests_LDADD = $(SECP_LIBS) $(SECP_TEST_LIBS)
 tests_LDFLAGS = -static
 TESTS = tests
 endif
 
-EXTRA_DIST = autogen.sh nasm_lt.sh
-
-#x86_64 only
-if USE_ASM
-.asm.lo:
-	$(LIBTOOL) --mode=compile --tag YASM $(srcdir)/nasm_lt.sh $(YASM) -f $(YASM_BINFMT) $(YAFLAGS) -I$(srcdir) -I. $< -o $@
-endif
+EXTRA_DIST = autogen.sh
diff --git a/build-aux/m4/bitcoin_secp.m4 b/build-aux/m4/bitcoin_secp.m4
index 4ca28f99c..1373478c9 100644
--- a/build-aux/m4/bitcoin_secp.m4
+++ b/build-aux/m4/bitcoin_secp.m4
@@ -11,38 +11,16 @@ fi
 
 dnl 
 AC_DEFUN([SECP_64BIT_ASM_CHECK],[
-if test x"$host_cpu" == x"x86_64"; then
-  AC_CHECK_PROG(YASM, yasm, yasm)
-else
-  if test x"$set_field" = x"64bit_asm"; then
-    AC_MSG_ERROR([$set_field field support explicitly requested but is not compatible with this host])
-  fi
-fi
-if test x$YASM = x; then
-  if test x"$set_field" = x"64bit_asm"; then
-    AC_MSG_ERROR([$set_field field support explicitly requested but yasm was not found])
-  fi
-  has_64bit_asm=no
-else
-  case x"$host_os" in
-  xdarwin*)
-    YASM_BINFMT=macho64
-    ;;
-  x*-gnux32)
-    YASM_BINFMT=elfx32
-    ;;
-  *)
-    YASM_BINFMT=elf64
-    ;;
-  esac
-  if $YASM -f help | grep -q $YASM_BINFMT; then
-    has_64bit_asm=yes
-  else
-    if test x"$set_field" = x"64bit_asm"; then
-      AC_MSG_ERROR([$set_field field support explicitly requested but yasm doesn't support $YASM_BINFMT format])
-    fi
-    AC_MSG_WARN([yasm too old for $YASM_BINFMT format])
-    has_64bit_asm=no
+AC_MSG_CHECKING(for x86_64 assembly availability)
+AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[
+  #include <stdint.h>]],[[
+  uint64_t a = 11, tmp;
+  __asm__ __volatile__("movq \@S|@0x100000000,%1; mulq %%rsi" : "+a"(a) : "S"(tmp) : "cc", "%rdx");
+  ]])],[has_64bit_asm=yes],[has_64bit_asm=no])
+AC_MSG_RESULT([$has_64bit_asm])
+if test x"$set_field" = x"64bit_asm"; then
+  if test x"$has_64bit_asm" = x"no"; then
+    AC_MSG_ERROR([$set_field field support explicitly requested but no x86_64 assembly available])
   fi
 fi
 ])
@@ -52,8 +30,13 @@ AC_DEFUN([SECP_OPENSSL_CHECK],[
 if test x"$use_pkgconfig" = x"yes"; then
     : #NOP
   m4_ifdef([PKG_CHECK_MODULES],[
-    PKG_CHECK_MODULES([CRYPTO], [libcrypto], [has_libcrypto=yes; AC_DEFINE(HAVE_LIBCRYPTO,1,[Define this symbol if libcrypto is installed])],[has_libcrypto=no])
-    : #NOP
+    PKG_CHECK_MODULES([CRYPTO], [libcrypto], [has_libcrypto=yes],[has_libcrypto=no])
+    if test x"$has_libcrypto" = x"yes"; then
+      TEMP_LIBS="$LIBS"
+      LIBS="$LIBS $CRYPTO_LIBS"
+      AC_CHECK_LIB(crypto, main,[AC_DEFINE(HAVE_LIBCRYPTO,1,[Define this symbol if libcrypto is installed])],[has_libcrypto=no])
+      LIBS="$TEMP_LIBS"
+    fi
   ])
 else
   AC_CHECK_HEADER(openssl/crypto.h,[AC_CHECK_LIB(crypto, main,[has_libcrypto=yes; CRYPTO_LIBS=-lcrypto; AC_DEFINE(HAVE_LIBCRYPTO,1,[Define this symbol if libcrypto is installed])]
diff --git a/configure.ac b/configure.ac
index 6e6fccd7f..40e121e80 100644
--- a/configure.ac
+++ b/configure.ac
@@ -18,6 +18,10 @@ AC_PATH_TOOL(AR, ar)
 AC_PATH_TOOL(RANLIB, ranlib)
 AC_PATH_TOOL(STRIP, strip)
 
+if test "x$CFLAGS" = "x"; then
+  CFLAGS="-O3 -g"
+fi
+
 AC_PROG_CC_C99
 if test x"$ac_cv_prog_cc_c99" == x"no"; then
   AC_MSG_ERROR([c99 compiler support required])
@@ -103,7 +107,11 @@ AC_ARG_WITH([scalar], [AS_HELP_STRING([--with-scalar=64bit|32bit|auto],
 
 AC_CHECK_TYPES([__int128])
 
-AC_CHECK_DECL(__builtin_expect,AC_DEFINE(HAVE_BUILTIN_EXPECT,1,[Define this symbol if __builtin_expect is available]),,)
+AC_MSG_CHECKING([for __builtin_expect])
+AC_COMPILE_IFELSE([AC_LANG_SOURCE([[void myfunc() {__builtin_expect(0,0);}]])],
+    [ AC_MSG_RESULT([yes]);AC_DEFINE(HAVE_BUILTIN_EXPECT,1,[Define this symbol if __builtin_expect is available]) ],
+    [ AC_MSG_RESULT([no])
+    ])
 
 if test x"$req_field" = x"auto"; then
   SECP_64BIT_ASM_CHECK
@@ -283,7 +291,6 @@ AC_SUBST(SECP_INCLUDES)
 AC_SUBST(SECP_LIBS)
 AC_SUBST(SECP_TEST_LIBS)
 AC_SUBST(SECP_TEST_INCLUDES)
-AC_SUBST(YASM_BINFMT)
 AM_CONDITIONAL([USE_ASM], [test x"$set_field" == x"64bit_asm"])
 AM_CONDITIONAL([USE_TESTS], [test x"$use_tests" != x"no"])
 AM_CONDITIONAL([USE_BENCHMARK], [test x"$use_benchmark" != x"no"])
diff --git a/include/secp256k1.h b/include/secp256k1.h
index 94a6ef483..dca7ca00e 100644
--- a/include/secp256k1.h
+++ b/include/secp256k1.h
@@ -62,8 +62,7 @@ void secp256k1_stop(void);
  *           0: incorrect signature
  *          -1: invalid public key
  *          -2: invalid signature
- * In:       msg:       the message being verified (cannot be NULL)
- *           msglen:    the length of the message (at most 32)
+ * In:       msg32:     the 32-byte message hash being verified (cannot be NULL)
  *           sig:       the signature being verified (cannot be NULL)
  *           siglen:    the length of the signature
  *           pubkey:    the public key to verify with (cannot be NULL)
@@ -71,19 +70,17 @@ void secp256k1_stop(void);
  * Requires starting using SECP256K1_START_VERIFY.
  */
 SECP256K1_WARN_UNUSED_RESULT int secp256k1_ecdsa_verify(
-  const unsigned char *msg,
-  int msglen,
+  const unsigned char *msg32,
   const unsigned char *sig,
   int siglen,
   const unsigned char *pubkey,
   int pubkeylen
-) SECP256K1_ARG_NONNULL(1) SECP256K1_ARG_NONNULL(3) SECP256K1_ARG_NONNULL(5);
+) SECP256K1_ARG_NONNULL(1) SECP256K1_ARG_NONNULL(2) SECP256K1_ARG_NONNULL(4);
 
 /** Create an ECDSA signature.
  *  Returns: 1: signature created
  *           0: nonce invalid, try another one
- *  In:      msg:    the message being signed (cannot be NULL)
- *           msglen: the length of the message being signed (at most 32)
+ *  In:      msg32:  the 32-byte message hash being signed (cannot be NULL)
  *           seckey: pointer to a 32-byte secret key (cannot be NULL, assumed to be valid)
  *           nonce:  pointer to a 32-byte nonce (cannot be NULL, generated with a cryptographic PRNG)
  *  Out:     sig:    pointer to an array where the signature will be placed (cannot be NULL)
@@ -92,19 +89,17 @@ SECP256K1_WARN_UNUSED_RESULT int secp256k1_ecdsa_verify(
  * Requires starting using SECP256K1_START_SIGN.
  */
 SECP256K1_WARN_UNUSED_RESULT int secp256k1_ecdsa_sign(
-  const unsigned char *msg,
-  int msglen,
+  const unsigned char *msg32,
   unsigned char *sig,
   int *siglen,
   const unsigned char *seckey,
   const unsigned char *nonce
-) SECP256K1_ARG_NONNULL(1) SECP256K1_ARG_NONNULL(3) SECP256K1_ARG_NONNULL(4) SECP256K1_ARG_NONNULL(5) SECP256K1_ARG_NONNULL(6);
+) SECP256K1_ARG_NONNULL(1) SECP256K1_ARG_NONNULL(2) SECP256K1_ARG_NONNULL(3) SECP256K1_ARG_NONNULL(4) SECP256K1_ARG_NONNULL(5);
 
 /** Create a compact ECDSA signature (64 byte + recovery id).
  *  Returns: 1: signature created
  *           0: nonce invalid, try another one
- *  In:      msg:    the message being signed (cannot be NULL)
- *           msglen: the length of the message being signed (at most 32)
+ *  In:      msg32:  the 32-byte message hash being signed (cannot be NULL)
  *           seckey: pointer to a 32-byte secret key (cannot be NULL, assumed to be valid)
  *           nonce:  pointer to a 32-byte nonce (cannot be NULL, generated with a cryptographic PRNG)
  *  Out:     sig:    pointer to a 64-byte array where the signature will be placed (cannot be NULL)
@@ -112,19 +107,17 @@ SECP256K1_WARN_UNUSED_RESULT int secp256k1_ecdsa_sign(
  * Requires starting using SECP256K1_START_SIGN.
  */
 SECP256K1_WARN_UNUSED_RESULT int secp256k1_ecdsa_sign_compact(
-  const unsigned char *msg,
-  int msglen,
+  const unsigned char *msg32,
   unsigned char *sig64,
   const unsigned char *seckey,
   const unsigned char *nonce,
   int *recid
-) SECP256K1_ARG_NONNULL(1) SECP256K1_ARG_NONNULL(3) SECP256K1_ARG_NONNULL(4) SECP256K1_ARG_NONNULL(5);
+) SECP256K1_ARG_NONNULL(1) SECP256K1_ARG_NONNULL(2) SECP256K1_ARG_NONNULL(3) SECP256K1_ARG_NONNULL(4);
 
 /** Recover an ECDSA public key from a compact signature.
  *  Returns: 1: public key successfully recovered (which guarantees a correct signature).
  *           0: otherwise.
- *  In:      msg:        the message assumed to be signed (cannot be NULL)
- *           msglen:     the length of the message (at most 32)
+ *  In:      msg32:      the 32-byte message hash assumed to be signed (cannot be NULL)
  *           sig64:      signature as 64 byte array (cannot be NULL)
  *           compressed: whether to recover a compressed or uncompressed pubkey
  *           recid:      the recovery id (0-3, as returned by ecdsa_sign_compact)
@@ -133,14 +126,13 @@ SECP256K1_WARN_UNUSED_RESULT int secp256k1_ecdsa_sign_compact(
  * Requires starting using SECP256K1_START_VERIFY.
  */
 SECP256K1_WARN_UNUSED_RESULT int secp256k1_ecdsa_recover_compact(
-  const unsigned char *msg,
-  int msglen,
+  const unsigned char *msg32,
   const unsigned char *sig64,
   unsigned char *pubkey,
   int *pubkeylen,
   int compressed,
   int recid
-) SECP256K1_ARG_NONNULL(1) SECP256K1_ARG_NONNULL(3) SECP256K1_ARG_NONNULL(4) SECP256K1_ARG_NONNULL(5);
+) SECP256K1_ARG_NONNULL(1) SECP256K1_ARG_NONNULL(2) SECP256K1_ARG_NONNULL(3) SECP256K1_ARG_NONNULL(4);
 
 /** Verify an ECDSA secret key.
  *  Returns: 1: secret key is valid
diff --git a/nasm_lt.sh b/nasm_lt.sh
deleted file mode 100755
index 6cd73294c..000000000
--- a/nasm_lt.sh
+++ /dev/null
@@ -1,57 +0,0 @@
-#! /bin/sh
-command=""
-infile=""
-o_opt=no
-pic=no
-while [ $# -gt 0 ]; do
-    case "$1" in
-        -DPIC|-fPIC|-fpic|-Kpic|-KPIC)
-            if [ "$pic" != "yes" ] ; then
-                command="$command -DPIC"
-                pic=yes
-            fi
-            ;;
-        -f|-fbin|-faout|-faoutb|-fcoff|-felf|-felf64|-fas86| \
-        -fobj|-fwin32|-fwin64|-frdf|-fieee|-fmacho|-fmacho64)
-            # it's a file format specifier for nasm.
-            command="$command $1"
-            ;;
-        -f*)
-            # maybe a code-generation flag for gcc.
-            ;;
-        -[Ii]*)
-            incdir=`echo "$1" | sed 's/^-[Ii]//'`
-            if [ "x$incdir" = x -a "x$2" != x ] ; then
-                case "$2" in
-                    -*) ;;
-                    *) incdir="$2"; shift;;
-                esac
-            fi
-            if [ "x$incdir" != x ] ; then
-                # In the case of NASM, the trailing slash is necessary.
-                incdir=`echo "$incdir" | sed 's%/*$%/%'`
-                command="$command -I$incdir"
-            fi
-            ;;
-        -o*)
-            o_opt=yes
-            command="$command $1"
-            ;;
-        *.asm)
-            infile=$1
-            command="$command $1"
-            ;;
-        *)
-            command="$command $1"
-            ;;
-    esac
-    shift
-done
-if [ "$o_opt" != yes ] ; then
-    # By default, NASM creates an output file
-    # in the same directory as the input file.
-    outfile="-o `echo $infile | sed -e 's%^.*/%%' -e 's%\.[^.]*$%%'`.o"
-    command="$command $outfile"
-fi
-echo $command
-exec $command
diff --git a/src/bench.h b/src/bench.h
new file mode 100644
index 000000000..668ec39f7
--- /dev/null
+++ b/src/bench.h
@@ -0,0 +1,53 @@
+/**********************************************************************
+ * Copyright (c) 2014 Pieter Wuille                                   *
+ * Distributed under the MIT software license, see the accompanying   *
+ * file COPYING or http://www.opensource.org/licenses/mit-license.php.*
+ **********************************************************************/
+
+#ifndef _SECP256K1_BENCH_H_
+#define _SECP256K1_BENCH_H_
+
+#include <stdio.h>
+#include <math.h>
+#include <sys/time.h>
+
+/** Return the current time in seconds, as reported by gettimeofday(). */
+static double gettimedouble(void) {
+    struct timeval tv;
+    gettimeofday(&tv, NULL);
+    return tv.tv_usec * 0.000001 + tv.tv_sec;
+}
+
+/** Time `count` runs of a benchmark and print the min/avg/max time per iteration.
+ *
+ *  In:  benchmark: function to time; called once per run with `data`
+ *       setup:     called with `data` before each timed run, outside the timed
+ *                  region (may be NULL)
+ *       teardown:  called with `data` after each timed run, outside the timed
+ *                  region (may be NULL)
+ *       data:      opaque pointer passed through to all three callbacks
+ *       count:     number of timed runs
+ *       iter:      number of iterations one call to `benchmark` performs; used
+ *                  only to scale the printed times to microseconds per iteration
+ *
+ *  Declared static because this definition lives in a header: every file that
+ *  includes it gets a private copy instead of a clashing external symbol.
+ */
+static void run_benchmark(void (*benchmark)(void*), void (*setup)(void*), void (*teardown)(void*), void* data, int count, int iter) {
+    double min = HUGE_VAL;
+    double sum = 0.0;
+    double max = 0.0;
+    for (int i = 0; i < count; i++) {
+        if (setup) setup(data);
+        double begin = gettimedouble();
+        benchmark(data);
+        double total = gettimedouble() - begin;
+        if (teardown) teardown(data);
+        if (total < min) min = total;
+        if (total > max) max = total;
+        sum += total;
+    }
+    printf("min %.3fus / avg %.3fus / max %.3fus\n", min * 1000000.0 / iter, (sum / count) * 1000000.0 / iter, max * 1000000.0 / iter);
+}
+
+#endif
diff --git a/src/bench_inv.c b/src/bench_inv.c
index d6f664333..3bdedea30 100644
--- a/src/bench_inv.c
+++ b/src/bench_inv.c
@@ -12,30 +12,41 @@
 #include "field_impl.h"
 #include "group_impl.h"
 #include "scalar_impl.h"
+#include "bench.h"
+
+typedef struct {
+    secp256k1_scalar_t base, x;
+} bench_inv_t;
+
+void bench_inv_setup(void* arg) {
+    bench_inv_t *data = (bench_inv_t*)arg;
 
-int main(void) {
     static const unsigned char init[32] = {
         0x02, 0x03, 0x05, 0x07, 0x0b, 0x0d, 0x11, 0x13,
         0x17, 0x1d, 0x1f, 0x25, 0x29, 0x2b, 0x2f, 0x35,
         0x3b, 0x3d, 0x43, 0x47, 0x49, 0x4f, 0x53, 0x59,
         0x61, 0x65, 0x67, 0x6b, 0x6d, 0x71, 0x7f, 0x83
     };
-    static const unsigned char fini[32] = {
-        0xba, 0x28, 0x58, 0xd8, 0xaa, 0x11, 0xd6, 0xf2,
-        0xfa, 0xce, 0x50, 0xb1, 0x67, 0x19, 0xb1, 0xa6,
-        0xe0, 0xaa, 0x84, 0x53, 0xf6, 0x80, 0xfc, 0x23,
-        0x88, 0x3c, 0xd6, 0x74, 0x9f, 0x27, 0x09, 0x03
-    };
-    secp256k1_ge_start();
-    secp256k1_scalar_t base, x;
-    secp256k1_scalar_set_b32(&base, init, NULL);
-    secp256k1_scalar_set_b32(&x, init, NULL);
-    for (int i=0; i<1000000; i++) {
-        secp256k1_scalar_inverse(&x, &x);
-        secp256k1_scalar_add(&x, &x, &base);
+
+    secp256k1_scalar_set_b32(&data->base, init, NULL);
+    secp256k1_scalar_set_b32(&data->x, init, NULL);
+}
+
+void bench_inv(void* arg) {
+    bench_inv_t *data = (bench_inv_t*)arg;
+
+    for (int i=0; i<20000; i++) {
+        secp256k1_scalar_inverse(&data->x, &data->x);
+        secp256k1_scalar_add(&data->x, &data->x, &data->base);
     }
-    unsigned char res[32];
-    secp256k1_scalar_get_b32(res, &x);
-    CHECK(memcmp(res, fini, 32) == 0);
+}
+
+int main(void) {
+    secp256k1_ge_start();
+
+    bench_inv_t data;
+    run_benchmark(bench_inv, bench_inv_setup, NULL, &data, 10, 20000);
+
+    secp256k1_ge_stop();
     return 0;
 }
diff --git a/src/bench_recover.c b/src/bench_recover.c
new file mode 100644
index 000000000..b1e0f33ef
--- /dev/null
+++ b/src/bench_recover.c
@@ -0,0 +1,50 @@
+/**********************************************************************
+ * Copyright (c) 2014 Pieter Wuille                                   *
+ * Distributed under the MIT software license, see the accompanying   *
+ * file COPYING or http://www.opensource.org/licenses/mit-license.php.*
+ **********************************************************************/
+
+#include "include/secp256k1.h"
+#include "util.h"
+#include "bench.h"
+
+/** Benchmark state: the message and compact signature fed to the next recovery. */
+typedef struct {
+    unsigned char msg[32];
+    unsigned char sig[64];
+} bench_recover_t;
+
+/** Timed body: 20000 compressed-pubkey recoveries, each output chained into the next input. */
+static void bench_recover(void* arg) {
+    bench_recover_t *data = (bench_recover_t*)arg;
+
+    unsigned char pubkey[33];
+    for (int i=0; i<20000; i++) {
+        int pubkeylen = 33;
+        CHECK(secp256k1_ecdsa_recover_compact(data->msg, data->sig, pubkey, &pubkeylen, 1, i % 2));
+        for (int j = 0; j < 32; j++) {
+            data->sig[j + 32] = data->msg[j];    /* Move former message to S. */
+            data->msg[j] = data->sig[j];         /* Move former R to message. */
+            data->sig[j] = pubkey[j + 1];        /* Move recovered pubkey X coordinate to R (which must be a valid X coordinate). */
+        }
+    }
+}
+
+/** Per-run setup: reset the message and signature to the same fixed starting bytes. */
+static void bench_recover_setup(void* arg) {
+    bench_recover_t *data = (bench_recover_t*)arg;
+
+    for (int i = 0; i < 32; i++) data->msg[i] = 1 + i;
+    for (int i = 0; i < 64; i++) data->sig[i] = 65 + i;
+}
+
+int main(void) {
+    secp256k1_start(SECP256K1_START_VERIFY);
+
+    bench_recover_t data;
+    /* 10 timed runs; 20000 must match the loop count in bench_recover() so times are per recovery. */
+    run_benchmark(bench_recover, bench_recover_setup, NULL, &data, 10, 20000);
+
+    secp256k1_stop();
+    return 0;
+}
diff --git a/src/bench_sign.c b/src/bench_sign.c
index f01f11d68..66e71e1ac 100644
--- a/src/bench_sign.c
+++ b/src/bench_sign.c
@@ -3,46 +3,45 @@
  * Distributed under the MIT software license, see the accompanying   *
  * file COPYING or http://www.opensource.org/licenses/mit-license.php.*
  **********************************************************************/
-#include <stdio.h>
-#include <string.h>
 
 #include "include/secp256k1.h"
 #include "util.h"
+#include "bench.h"
 
-int main(void) {
-    secp256k1_start(SECP256K1_START_SIGN);
-
+typedef struct {
     unsigned char msg[32];
     unsigned char nonce[32];
     unsigned char key[32];
+} bench_sign_t;
 
-    for (int i = 0; i < 32; i++) msg[i] = i + 1;
-    for (int i = 0; i < 32; i++) nonce[i] = i + 33;
-    for (int i = 0; i < 32; i++) key[i] = i + 65;
+static void bench_sign_setup(void* arg) {
+    bench_sign_t *data = (bench_sign_t*)arg;
 
-    unsigned char sig[64];
+    for (int i = 0; i < 32; i++) data->msg[i] = i + 1;
+    for (int i = 0; i < 32; i++) data->nonce[i] = i + 33;
+    for (int i = 0; i < 32; i++) data->key[i] = i + 65;
+}
+
+static void bench_sign(void* arg) {
+    bench_sign_t *data = (bench_sign_t*)arg;
 
-    for (int i=0; i<1000000; i++) {
+    unsigned char sig[64];
+    for (int i=0; i<20000; i++) {
         int recid = 0;
-        CHECK(secp256k1_ecdsa_sign_compact(msg, 32, sig, key, nonce, &recid));
+        CHECK(secp256k1_ecdsa_sign_compact(data->msg, sig, data->key, data->nonce, &recid));
         for (int j = 0; j < 32; j++) {
-            nonce[j] = key[j];     /* Move former key to nonce  */
-            msg[j] = sig[j];       /* Move former R to message. */
-            key[j] = sig[j + 32];  /* Move former S to key.     */
+            data->nonce[j] = data->key[j];     /* Move former key to nonce  */
+            data->msg[j] = sig[j];             /* Move former R to message. */
+            data->key[j] = sig[j + 32];        /* Move former S to key.     */
         }
     }
+}
+
+int main(void) {
+    secp256k1_start(SECP256K1_START_SIGN);
 
-    static const unsigned char fini[64] = {
-        0x92, 0x03, 0xef, 0xf1, 0x58, 0x0b, 0x49, 0x8d,
-        0x22, 0x3d, 0x49, 0x0e, 0xbf, 0x26, 0x50, 0x0e,
-        0x2d, 0x62, 0x90, 0xd7, 0x82, 0xbd, 0x3d, 0x5c,
-        0xa9, 0x10, 0xa5, 0x49, 0xb1, 0xd8, 0x8c, 0xc0,
-        0x5b, 0x5e, 0x9e, 0x68, 0x51, 0x3d, 0xe8, 0xec,
-        0x82, 0x30, 0x82, 0x88, 0x8c, 0xfd, 0xe7, 0x71,
-        0x15, 0x92, 0xfc, 0x14, 0x59, 0x78, 0x31, 0xb3,
-        0xf6, 0x07, 0x91, 0x18, 0x00, 0x8d, 0x4c, 0xb2
-    };
-    CHECK(memcmp(sig, fini, 64) == 0);
+    bench_sign_t data;
+    run_benchmark(bench_sign, bench_sign_setup, NULL, &data, 10, 20000);
 
     secp256k1_stop();
     return 0;
diff --git a/src/bench_verify.c b/src/bench_verify.c
index 690595516..b123c4087 100644
--- a/src/bench_verify.c
+++ b/src/bench_verify.c
@@ -9,35 +9,46 @@
 
 #include "include/secp256k1.h"
 #include "util.h"
+#include "bench.h"
 
-int main(void) {
-    secp256k1_start(SECP256K1_START_VERIFY);
-
+typedef struct {
     unsigned char msg[32];
-    unsigned char sig[64];
-
-    for (int i = 0; i < 32; i++) msg[i] = 1 + i;
-    for (int i = 0; i < 64; i++) sig[i] = 65 + i;
-
+    unsigned char key[32];
+    unsigned char nonce[32];
+    unsigned char sig[72];
+    int siglen;
     unsigned char pubkey[33];
-    for (int i=0; i<1000000; i++) {
-        int pubkeylen = 33;
-        CHECK(secp256k1_ecdsa_recover_compact(msg, 32, sig, pubkey, &pubkeylen, 1, i % 2));
-        for (int j = 0; j < 32; j++) {
-            sig[j + 32] = msg[j];    /* Move former message to S. */
-            msg[j] = sig[j];         /* Move former R to message. */
-            sig[j] = pubkey[j + 1];  /* Move recovered pubkey X coordinate to R (which must be a valid X coordinate). */
-        }
+    int pubkeylen;
+} benchmark_verify_t;
+
+static void benchmark_verify(void* arg) {
+    benchmark_verify_t* data = (benchmark_verify_t*)arg;
+
+    for (int i=0; i<20000; i++) {
+        data->sig[data->siglen - 1] ^= (i & 0xFF);
+        data->sig[data->siglen - 2] ^= ((i >> 8) & 0xFF);
+        data->sig[data->siglen - 3] ^= ((i >> 16) & 0xFF);
+        CHECK(secp256k1_ecdsa_verify(data->msg, data->sig, data->siglen, data->pubkey, data->pubkeylen) == (i == 0));
+        data->sig[data->siglen - 1] ^= (i & 0xFF);
+        data->sig[data->siglen - 2] ^= ((i >> 8) & 0xFF);
+        data->sig[data->siglen - 3] ^= ((i >> 16) & 0xFF);
     }
+}
+
+int main(void) {
+    secp256k1_start(SECP256K1_START_VERIFY | SECP256K1_START_SIGN);
+
+    benchmark_verify_t data;
+
+    for (int i = 0; i < 32; i++) data.msg[i] = 1 + i;
+    for (int i = 0; i < 32; i++) data.key[i] = 33 + i;
+    for (int i = 0; i < 32; i++) data.nonce[i] = 65 + i;
+    data.siglen = 72;
+    CHECK(secp256k1_ecdsa_sign(data.msg, data.sig, &data.siglen, data.key, data.nonce));
+    data.pubkeylen = 33;
+    CHECK(secp256k1_ec_pubkey_create(data.pubkey, &data.pubkeylen, data.key, 1));
 
-    static const unsigned char fini[33] = {
-        0x02,
-        0x52, 0x63, 0xae, 0x9a, 0x9d, 0x47, 0x1f, 0x1a,
-        0xb2, 0x36, 0x65, 0x89, 0x11, 0xe7, 0xcc, 0x86,
-        0xa3, 0xab, 0x97, 0xb6, 0xf1, 0xaf, 0xfd, 0x8f,
-        0x9b, 0x38, 0xb6, 0x18, 0x55, 0xe5, 0xc2, 0x43
-    };
-    CHECK(memcmp(fini, pubkey, 33) == 0);
+    run_benchmark(benchmark_verify, NULL, NULL, &data, 10, 20000);
 
     secp256k1_stop();
     return 0;
diff --git a/src/ecdsa_impl.h b/src/ecdsa_impl.h
index a951d0b4a..8825d05fe 100644
--- a/src/ecdsa_impl.h
+++ b/src/ecdsa_impl.h
@@ -27,7 +27,7 @@ static void secp256k1_ecdsa_start(void) {
         return;
 
     /* Allocate. */
-    secp256k1_ecdsa_consts_t *ret = (secp256k1_ecdsa_consts_t*)malloc(sizeof(secp256k1_ecdsa_consts_t));
+    secp256k1_ecdsa_consts_t *ret = (secp256k1_ecdsa_consts_t*)checked_malloc(sizeof(secp256k1_ecdsa_consts_t));
 
     static const unsigned char order[] = {
         0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
@@ -38,7 +38,7 @@ static void secp256k1_ecdsa_start(void) {
 
     secp256k1_fe_set_b32(&ret->order_as_fe, order);
     secp256k1_fe_negate(&ret->p_minus_order, &ret->order_as_fe, 1);
-    secp256k1_fe_normalize(&ret->p_minus_order);
+    secp256k1_fe_normalize_var(&ret->p_minus_order);
 
     /* Set the global pointer. */
     secp256k1_ecdsa_consts = ret;
@@ -122,7 +122,7 @@ static int secp256k1_ecdsa_sig_recompute(secp256k1_scalar_t *r2, const secp256k1
     secp256k1_gej_t pr; secp256k1_ecmult(&pr, &pubkeyj, &u2, &u1);
     if (!secp256k1_gej_is_infinity(&pr)) {
         secp256k1_fe_t xr; secp256k1_gej_get_x_var(&xr, &pr);
-        secp256k1_fe_normalize(&xr);
+        secp256k1_fe_normalize_var(&xr);
         unsigned char xrb[32]; secp256k1_fe_get_b32(xrb, &xr);
         secp256k1_scalar_set_b32(r2, xrb, NULL);
         ret = 1;
@@ -144,7 +144,7 @@ static int secp256k1_ecdsa_sig_recover(const secp256k1_ecdsa_sig_t *sig, secp256
         secp256k1_fe_add(&fx, &secp256k1_ecdsa_consts->order_as_fe);
     }
     secp256k1_ge_t x;
-    if (!secp256k1_ge_set_xo(&x, &fx, recid & 1))
+    if (!secp256k1_ge_set_xo_var(&x, &fx, recid & 1))
         return 0;
     secp256k1_gej_t xj;
     secp256k1_gej_set_ge(&xj, &x);
diff --git a/src/eckey_impl.h b/src/eckey_impl.h
index 0f218ced9..b3fa7d9bd 100644
--- a/src/eckey_impl.h
+++ b/src/eckey_impl.h
@@ -17,7 +17,7 @@
 static int secp256k1_eckey_pubkey_parse(secp256k1_ge_t *elem, const unsigned char *pub, int size) {
     if (size == 33 && (pub[0] == 0x02 || pub[0] == 0x03)) {
         secp256k1_fe_t x;
-        return secp256k1_fe_set_b32(&x, pub+1) && secp256k1_ge_set_xo(elem, &x, pub[0] == 0x03);
+        return secp256k1_fe_set_b32(&x, pub+1) && secp256k1_ge_set_xo_var(elem, &x, pub[0] == 0x03);
     } else if (size == 65 && (pub[0] == 0x04 || pub[0] == 0x06 || pub[0] == 0x07)) {
         secp256k1_fe_t x, y;
         if (!secp256k1_fe_set_b32(&x, pub+1) || !secp256k1_fe_set_b32(&y, pub+33)) {
@@ -26,7 +26,7 @@ static int secp256k1_eckey_pubkey_parse(secp256k1_ge_t *elem, const unsigned cha
         secp256k1_ge_set_xy(elem, &x, &y);
         if ((pub[0] == 0x06 || pub[0] == 0x07) && secp256k1_fe_is_odd(&y) != (pub[0] == 0x07))
             return 0;
-        return secp256k1_ge_is_valid(elem);
+        return secp256k1_ge_is_valid_var(elem);
     } else {
         return 0;
     }
@@ -36,8 +36,8 @@ static int secp256k1_eckey_pubkey_serialize(secp256k1_ge_t *elem, unsigned char
     if (secp256k1_ge_is_infinity(elem)) {
         return 0;
     }
-    secp256k1_fe_normalize(&elem->x);
-    secp256k1_fe_normalize(&elem->y);
+    secp256k1_fe_normalize_var(&elem->x);
+    secp256k1_fe_normalize_var(&elem->y);
     secp256k1_fe_get_b32(&pub[1], &elem->x);
     if (compressed) {
         *size = 33;
diff --git a/src/ecmult_gen_impl.h b/src/ecmult_gen_impl.h
index af0ead522..5a5b16ce1 100644
--- a/src/ecmult_gen_impl.h
+++ b/src/ecmult_gen_impl.h
@@ -34,7 +34,7 @@ static void secp256k1_ecmult_gen_start(void) {
         return;
 
     /* Allocate the precomputation table. */
-    secp256k1_ecmult_gen_consts_t *ret = (secp256k1_ecmult_gen_consts_t*)malloc(sizeof(secp256k1_ecmult_gen_consts_t));
+    secp256k1_ecmult_gen_consts_t *ret = (secp256k1_ecmult_gen_consts_t*)checked_malloc(sizeof(secp256k1_ecmult_gen_consts_t));
 
     /* get the generator */
     const secp256k1_ge_t *g = &secp256k1_ge_consts->g;
@@ -47,7 +47,7 @@ static void secp256k1_ecmult_gen_start(void) {
         secp256k1_fe_t nums_x;
         VERIFY_CHECK(secp256k1_fe_set_b32(&nums_x, nums_b32));
         secp256k1_ge_t nums_ge;
-        VERIFY_CHECK(secp256k1_ge_set_xo(&nums_ge, &nums_x, 0));
+        VERIFY_CHECK(secp256k1_ge_set_xo_var(&nums_ge, &nums_x, 0));
         secp256k1_gej_set_ge(&nums_gej, &nums_ge);
         /* Add G to make the bits in x uniformly distributed. */
         secp256k1_gej_add_ge_var(&nums_gej, &nums_gej, g);
@@ -73,7 +73,7 @@ static void secp256k1_ecmult_gen_start(void) {
             secp256k1_gej_double_var(&numsbase, &numsbase);
             if (j == 62) {
                 /* In the last iteration, numsbase is (1 - 2^j) * nums instead. */
-                secp256k1_gej_neg(&numsbase, &numsbase);
+                secp256k1_gej_neg_var(&numsbase, &numsbase);
                 secp256k1_gej_add_var(&numsbase, &numsbase, &nums_gej);
             }
         }
diff --git a/src/ecmult_impl.h b/src/ecmult_impl.h
index 445b81593..653677104 100644
--- a/src/ecmult_impl.h
+++ b/src/ecmult_impl.h
@@ -15,11 +15,13 @@
 #define WINDOW_A 5
 
 /** larger numbers may result in slightly better performance, at the cost of
-    exponentially larger precomputed tables. WINDOW_G == 14 results in 640 KiB. */
+    exponentially larger precomputed tables. */
 #ifdef USE_ENDOMORPHISM
-#define WINDOW_G 14
-#else
+/** Two tables for window size 15: 1.375 MiB. */
 #define WINDOW_G 15
+#else
+/** One table for window size 16: 1.375 MiB. */
+#define WINDOW_G 16
 #endif
 
 /** Fill a table 'pre' with precomputed odd multiples of a. W determines the size of the table.
@@ -43,13 +45,14 @@ static void secp256k1_ecmult_table_precomp_gej_var(secp256k1_gej_t *pre, const s
 
 static void secp256k1_ecmult_table_precomp_ge_var(secp256k1_ge_t *pre, const secp256k1_gej_t *a, int w) {
     const int table_size = 1 << (w-2);
-    secp256k1_gej_t prej[table_size];
+    secp256k1_gej_t *prej = checked_malloc(sizeof(secp256k1_gej_t) * table_size);
     prej[0] = *a;
     secp256k1_gej_t d; secp256k1_gej_double_var(&d, a);
     for (int i=1; i<table_size; i++) {
         secp256k1_gej_add_var(&prej[i], &d, &prej[i-1]);
     }
     secp256k1_ge_set_all_gej_var(table_size, pre, prej);
+    free(prej);
 }
 
 /** The number of entries a table with precomputed multiples needs to have. */
@@ -67,8 +70,8 @@ static void secp256k1_ecmult_table_precomp_ge_var(secp256k1_ge_t *pre, const sec
         (neg)((r), &(pre)[(-(n)-1)/2]); \
 } while(0)
 
-#define ECMULT_TABLE_GET_GEJ(r,pre,n,w) ECMULT_TABLE_GET((r),(pre),(n),(w),secp256k1_gej_neg)
-#define ECMULT_TABLE_GET_GE(r,pre,n,w)  ECMULT_TABLE_GET((r),(pre),(n),(w),secp256k1_ge_neg)
+#define ECMULT_TABLE_GET_GEJ(r,pre,n,w) ECMULT_TABLE_GET((r),(pre),(n),(w),secp256k1_gej_neg_var)
+#define ECMULT_TABLE_GET_GE(r,pre,n,w)  ECMULT_TABLE_GET((r),(pre),(n),(w),secp256k1_ge_neg_var)
 
 typedef struct {
     /* For accelerating the computation of a*P + b*G: */
@@ -85,7 +88,7 @@ static void secp256k1_ecmult_start(void) {
         return;
 
     /* Allocate the precomputation table. */
-    secp256k1_ecmult_consts_t *ret = (secp256k1_ecmult_consts_t*)malloc(sizeof(secp256k1_ecmult_consts_t));
+    secp256k1_ecmult_consts_t *ret = (secp256k1_ecmult_consts_t*)checked_malloc(sizeof(secp256k1_ecmult_consts_t));
 
     /* get the generator */
     const secp256k1_ge_t *g = &secp256k1_ge_consts->g;
diff --git a/src/field.h b/src/field.h
index 0cdf0fb47..53aa29e13 100644
--- a/src/field.h
+++ b/src/field.h
@@ -50,6 +50,9 @@ static void secp256k1_fe_stop(void);
 /** Normalize a field element. */
 static void secp256k1_fe_normalize(secp256k1_fe_t *r);
 
+/** Normalize a field element, without constant-time guarantee. */
+static void secp256k1_fe_normalize_var(secp256k1_fe_t *r);
+
 /** Set a field element equal to a small integer. Resulting field element is normalized. */
 static void secp256k1_fe_set_int(secp256k1_fe_t *r, int a);
 
@@ -93,7 +96,7 @@ static void secp256k1_fe_sqr(secp256k1_fe_t *r, const secp256k1_fe_t *a);
 /** Sets a field element to be the (modular) square root (if any exist) of another. Requires the
  *  input's magnitude to be at most 8. The output magnitude is 1 (but not guaranteed to be
  *  normalized). Return value indicates whether a square root was found. */
-static int secp256k1_fe_sqrt(secp256k1_fe_t *r, const secp256k1_fe_t *a);
+static int secp256k1_fe_sqrt_var(secp256k1_fe_t *r, const secp256k1_fe_t *a);
 
 /** Sets a field element to be the (modular) inverse of another. Requires the input's magnitude to be
  *  at most 8. The output magnitude is 1 (but not guaranteed to be normalized). */
@@ -105,9 +108,6 @@ static void secp256k1_fe_inv_var(secp256k1_fe_t *r, const secp256k1_fe_t *a);
 /** Calculate the (modular) inverses of a batch of field elements. Requires the inputs' magnitudes to be
  *  at most 8. The output magnitudes are 1 (but not guaranteed to be normalized). The inputs and
  *  outputs must not overlap in memory. */
-static void secp256k1_fe_inv_all(size_t len, secp256k1_fe_t r[len], const secp256k1_fe_t a[len]);
-
-/** Potentially faster version of secp256k1_fe_inv_all, without constant-time guarantee. */
 static void secp256k1_fe_inv_all_var(size_t len, secp256k1_fe_t r[len], const secp256k1_fe_t a[len]);
 
 /** Convert a field element to a hexadecimal string. */
diff --git a/src/field_10x26_impl.h b/src/field_10x26_impl.h
index c4403fba2..d20229cda 100644
--- a/src/field_10x26_impl.h
+++ b/src/field_10x26_impl.h
@@ -103,6 +103,62 @@ static void secp256k1_fe_normalize(secp256k1_fe_t *r) {
 #endif
 }
 
+static void secp256k1_fe_normalize_var(secp256k1_fe_t *r) {
+    uint32_t t0 = r->n[0], t1 = r->n[1], t2 = r->n[2], t3 = r->n[3], t4 = r->n[4],
+             t5 = r->n[5], t6 = r->n[6], t7 = r->n[7], t8 = r->n[8], t9 = r->n[9];
+    /* Normalize r in place. No constant-time guarantee: the final reduction below is a data-dependent branch. */
+    /* Reduce t9 at the start so there will be at most a single carry from the first pass */
+    uint32_t x = t9 >> 22; t9 &= 0x03FFFFFUL;
+    uint32_t m;
+
+    /* The first pass folds x * 2^256 back in as x * (2^32 + 0x3D1) and ensures the magnitude is 1, ... */
+    t0 += x * 0x3D1UL; t1 += (x << 6);
+    t1 += (t0 >> 26); t0 &= 0x3FFFFFFUL;
+    t2 += (t1 >> 26); t1 &= 0x3FFFFFFUL;
+    t3 += (t2 >> 26); t2 &= 0x3FFFFFFUL; m = t2;
+    t4 += (t3 >> 26); t3 &= 0x3FFFFFFUL; m &= t3;
+    t5 += (t4 >> 26); t4 &= 0x3FFFFFFUL; m &= t4;
+    t6 += (t5 >> 26); t5 &= 0x3FFFFFFUL; m &= t5;
+    t7 += (t6 >> 26); t6 &= 0x3FFFFFFUL; m &= t6;
+    t8 += (t7 >> 26); t7 &= 0x3FFFFFFUL; m &= t7;
+    t9 += (t8 >> 26); t8 &= 0x3FFFFFFUL; m &= t8;
+
+    /* ... except for a possible carry at bit 22 of t9 (i.e. bit 256 of the field element) */
+    VERIFY_CHECK(t9 >> 23 == 0);
+    /* m is the AND of t2..t8, so m == 0x3FFFFFFUL below means every one of those limbs is all ones */
+    /* At most a single final reduction is needed; check if the value is >= the field characteristic */
+    x = (t9 >> 22) | ((t9 == 0x03FFFFFUL) & (m == 0x3FFFFFFUL)
+        & ((t1 + 0x40UL + ((t0 + 0x3D1UL) >> 26)) > 0x3FFFFFFUL));
+    /* Variable-time step: the second pass is only executed when a final reduction is required */
+    if (x) {
+        t0 += 0x3D1UL; t1 += (x << 6);
+        t1 += (t0 >> 26); t0 &= 0x3FFFFFFUL;
+        t2 += (t1 >> 26); t1 &= 0x3FFFFFFUL;
+        t3 += (t2 >> 26); t2 &= 0x3FFFFFFUL;
+        t4 += (t3 >> 26); t3 &= 0x3FFFFFFUL;
+        t5 += (t4 >> 26); t4 &= 0x3FFFFFFUL;
+        t6 += (t5 >> 26); t5 &= 0x3FFFFFFUL;
+        t7 += (t6 >> 26); t6 &= 0x3FFFFFFUL;
+        t8 += (t7 >> 26); t7 &= 0x3FFFFFFUL;
+        t9 += (t8 >> 26); t8 &= 0x3FFFFFFUL;
+
+        /* If t9 didn't carry to bit 22 already, then it should have after any final reduction */
+        VERIFY_CHECK(t9 >> 22 == x);
+
+        /* Mask off the possible multiple of 2^256 from the final reduction */
+        t9 &= 0x03FFFFFUL;
+    }
+
+    r->n[0] = t0; r->n[1] = t1; r->n[2] = t2; r->n[3] = t3; r->n[4] = t4;
+    r->n[5] = t5; r->n[6] = t6; r->n[7] = t7; r->n[8] = t8; r->n[9] = t9;
+
+#ifdef VERIFY
+    r->magnitude = 1;
+    r->normalized = 1;
+    secp256k1_fe_verify(r);
+#endif
+}
+
 SECP256K1_INLINE static void secp256k1_fe_set_int(secp256k1_fe_t *r, int a) {
     r->n[0] = a;
     r->n[1] = r->n[2] = r->n[3] = r->n[4] = r->n[5] = r->n[6] = r->n[7] = r->n[8] = r->n[9] = 0;
@@ -271,7 +327,7 @@ SECP256K1_INLINE static void secp256k1_fe_add(secp256k1_fe_t *r, const secp256k1
 #define VERIFY_BITS(x, n) do { } while(0)
 #endif
 
-SECP256K1_INLINE static void secp256k1_fe_mul_inner(const uint32_t *a, const uint32_t * SECP256K1_RESTRICT b, uint32_t *r) {
+SECP256K1_INLINE static void secp256k1_fe_mul_inner(uint32_t *r, const uint32_t *a, const uint32_t * SECP256K1_RESTRICT b) {
     VERIFY_BITS(a[0], 30);
     VERIFY_BITS(a[1], 30);
     VERIFY_BITS(a[2], 30);
@@ -598,7 +654,7 @@ SECP256K1_INLINE static void secp256k1_fe_mul_inner(const uint32_t *a, const uin
     /* [r9 r8 r7 r6 r5 r4 r3 r2 r1 r0] = [p18 p17 p16 p15 p14 p13 p12 p11 p10 p9 p8 p7 p6 p5 p4 p3 p2 p1 p0] */
 }
 
-SECP256K1_INLINE static void secp256k1_fe_sqr_inner(const uint32_t *a, uint32_t *r) {
+SECP256K1_INLINE static void secp256k1_fe_sqr_inner(uint32_t *r, const uint32_t *a) {
     VERIFY_BITS(a[0], 30);
     VERIFY_BITS(a[1], 30);
     VERIFY_BITS(a[2], 30);
@@ -879,7 +935,7 @@ static void secp256k1_fe_mul(secp256k1_fe_t *r, const secp256k1_fe_t *a, const s
     secp256k1_fe_verify(b);
     VERIFY_CHECK(r != b);
 #endif
-    secp256k1_fe_mul_inner(a->n, b->n, r->n);
+    secp256k1_fe_mul_inner(r->n, a->n, b->n);
 #ifdef VERIFY
     r->magnitude = 1;
     r->normalized = 0;
@@ -892,7 +948,7 @@ static void secp256k1_fe_sqr(secp256k1_fe_t *r, const secp256k1_fe_t *a) {
     VERIFY_CHECK(a->magnitude <= 8);
     secp256k1_fe_verify(a);
 #endif
-    secp256k1_fe_sqr_inner(a->n, r->n);
+    secp256k1_fe_sqr_inner(r->n, a->n);
 #ifdef VERIFY
     r->magnitude = 1;
     r->normalized = 0;
diff --git a/src/field_5x52_asm.asm b/src/field_5x52_asm.asm
deleted file mode 100644
index 5e785f763..000000000
--- a/src/field_5x52_asm.asm
+++ /dev/null
@@ -1,469 +0,0 @@
-	;; Added by Diederik Huys, March 2013
-	;;
-	;; Provided public procedures:
-	;; 	secp256k1_fe_mul_inner
-	;; 	secp256k1_fe_sqr_inner
-	;;
-	;; Needed tools: YASM (http://yasm.tortall.net)
-	;;
-	;; 
-
-	BITS 64
-
-%ifidn   __OUTPUT_FORMAT__,macho64
-%define SYM(x) _ %+ x
-%else
-%define SYM(x) x
-%endif
-
-	;;  Procedure ExSetMult
-	;;  Register Layout:
-	;;  INPUT: 	rdi	= a->n
-	;; 	   	rsi  	= b->n
-	;; 	   	rdx  	= r->a
-	;; 
-	;;  INTERNAL:	rdx:rax  = multiplication accumulator
-	;; 		r9:r8    = c
-	;; 		r10-r13  = t0-t3
-	;; 		r14	 = b.n[0] / t4
-	;; 		r15	 = b.n[1] / t5
-	;; 		rbx	 = b.n[2] / t6
-	;; 		rcx	 = b.n[3] / t7
-	;; 		rbp	 = Constant 0FFFFFFFFFFFFFh / t8
-	;; 		rsi	 = b.n / b.n[4] / t9
-
-	GLOBAL SYM(secp256k1_fe_mul_inner)
-	ALIGN 32
-SYM(secp256k1_fe_mul_inner):
-	push rbp
-	push rbx
-	push r12
-	push r13
-	push r14
-	push r15
-	push rdx
-	mov r14,[rsi+8*0]	; preload b.n[0]. This will be the case until
-				; b.n[0] is no longer needed, then we reassign
-				; r14 to t4
-	;; c=a.n[0] * b.n[0]
-   	mov rax,[rdi+0*8]	; load a.n[0]
-	mov rbp,0FFFFFFFFFFFFFh
-	mul r14			; rdx:rax=a.n[0]*b.n[0]
-	mov r15,[rsi+1*8]
-	mov r10,rbp		; load modulus into target register for t0
-	mov r8,rax
-	and r10,rax		; only need lower qword of c
-	shrd r8,rdx,52
-	xor r9,r9		; c < 2^64, so we ditch the HO part 
-
-	;; c+=a.n[0] * b.n[1] + a.n[1] * b.n[0]
-	mov rax,[rdi+0*8]
-	mul r15			
-	add r8,rax
-	adc r9,rdx
-
-	mov rax,[rdi+1*8]
-	mul r14			
-	mov r11,rbp
-	mov rbx,[rsi+2*8]
-	add r8,rax
-	adc r9,rdx
-	and r11,r8
-	shrd r8,r9,52
-	xor r9,r9
-	
-	;; c+=a.n[0 1 2] * b.n[2 1 0]
-	mov rax,[rdi+0*8]
-	mul rbx			
-	add r8,rax
-	adc r9,rdx
-
-	mov rax,[rdi+1*8]
-	mul r15			
-	add r8,rax
-	adc r9,rdx
-
-	mov rax,[rdi+2*8]
-	mul r14
-	mov r12,rbp		
-	mov rcx,[rsi+3*8]
-	add r8,rax
-	adc r9,rdx
-	and r12,r8		
-	shrd r8,r9,52
-	xor r9,r9		
-
-	;; c+=a.n[0 1 2 3] * b.n[3 2 1 0]
-	mov rax,[rdi+0*8]
-	mul rcx			
-	add r8,rax
-	adc r9,rdx
-
-	mov rax,[rdi+1*8]
-	mul rbx			
-	add r8,rax
-	adc r9,rdx
-
-	mov rax,[rdi+2*8]
-	mul r15			
-	add r8,rax
-	adc r9,rdx
-	
-	mov rax,[rdi+3*8]
-	mul r14			
-	mov r13,rbp             
-	mov rsi,[rsi+4*8]	; load b.n[4] and destroy pointer
-	add r8,rax
-	adc r9,rdx
-	and r13,r8
-
-	shrd r8,r9,52
-	xor r9,r9		
-
-
-	;; c+=a.n[0 1 2 3 4] * b.n[4 3 2 1 0]
-	mov rax,[rdi+0*8]
-	mul rsi
-	add r8,rax
-	adc r9,rdx
-
-	mov rax,[rdi+1*8]
-	mul rcx
-	add r8,rax
-	adc r9,rdx
-
-	mov rax,[rdi+2*8]
-	mul rbx			
-	add r8,rax
-	adc r9,rdx
-
-	mov rax,[rdi+3*8]
-	mul r15			
-	add r8,rax
-	adc r9,rdx
-
-	mov rax,[rdi+4*8]
-	mul r14			
-	mov r14,rbp             ; load modulus into t4 and destroy a.n[0]
-	add r8,rax
-	adc r9,rdx
-	and r14,r8
-	shrd r8,r9,52
-	xor r9,r9		
-
-	;; c+=a.n[1 2 3 4] * b.n[4 3 2 1]
-	mov rax,[rdi+1*8]
-	mul rsi
-	add r8,rax
-	adc r9,rdx
-
-	mov rax,[rdi+2*8]
-	mul rcx
-	add r8,rax
-	adc r9,rdx
-
-	mov rax,[rdi+3*8]
-	mul rbx
-	add r8,rax
-	adc r9,rdx
-
-	mov rax,[rdi+4*8]
-	mul r15
-	mov r15,rbp		
-	add r8,rax
-	adc r9,rdx
-
-	and r15,r8
-	shrd r8,r9,52
-	xor r9,r9		
-
-	;; c+=a.n[2 3 4] * b.n[4 3 2]
-	mov rax,[rdi+2*8]
-	mul rsi
-	add r8,rax
-	adc r9,rdx
-
-	mov rax,[rdi+3*8]
-	mul rcx
-	add r8,rax
-	adc r9,rdx
-
-	mov rax,[rdi+4*8]
-	mul rbx
-	mov rbx,rbp		
-	add r8,rax
-	adc r9,rdx
-
-	and rbx,r8		
-	shrd r8,r9,52
-	xor r9,r9		
-
-	;; c+=a.n[3 4] * b.n[4 3]
-	mov rax,[rdi+3*8]
-	mul rsi
-	add r8,rax
-	adc r9,rdx
-
-	mov rax,[rdi+4*8]
-	mul rcx
-	mov rcx,rbp		
-	add r8,rax
-	adc r9,rdx
-	and rcx,r8		
-	shrd r8,r9,52
-	xor r9,r9		
-
-	;; c+=a.n[4] * b.n[4]
-	mov rax,[rdi+4*8]
-	mul rsi
-	;; mov rbp,rbp		; modulus already there!
-	add r8,rax
-	adc r9,rdx
-	and rbp,r8 
-	shrd r8,r9,52
-	xor r9,r9		
-
-	mov rsi,r8		; load c into t9 and destroy b.n[4]
-
-	;; *******************************************************
-common_exit_norm:
-	mov rdi,01000003D10h	; load constant
-
-	mov rax,r15		; get t5
-	mul rdi
-	add rax,r10    		; +t0
-	adc rdx,0
-	mov r10,0FFFFFFFFFFFFFh ; modulus. Sadly, we ran out of registers!
-	mov r8,rax		; +c
-	and r10,rax
-	shrd r8,rdx,52
-	xor r9,r9
-
-	mov rax,rbx		; get t6
-	mul rdi
-	add rax,r11		; +t1
-	adc rdx,0
-	mov r11,0FFFFFFFFFFFFFh ; modulus
-	add r8,rax		; +c
-	adc r9,rdx
-	and r11,r8
-	shrd r8,r9,52
-	xor r9,r9
-
-	mov rax,rcx    		; get t7
-	mul rdi
-	add rax,r12		; +t2
-	adc rdx,0
-	pop rbx			; retrieve pointer to this.n	
-	mov r12,0FFFFFFFFFFFFFh ; modulus
-	add r8,rax		; +c
-	adc r9,rdx
-	and r12,r8
-	mov [rbx+2*8],r12	; mov into this.n[2]
-	shrd r8,r9,52
-	xor r9,r9
-	
-	mov rax,rbp    		; get t8
-	mul rdi
-	add rax,r13    		; +t3
-	adc rdx,0
-	mov r13,0FFFFFFFFFFFFFh ; modulus
-	add r8,rax		; +c
-	adc r9,rdx
-	and r13,r8
-	mov [rbx+3*8],r13	; -> this.n[3]
-	shrd r8,r9,52
-	xor r9,r9
-	
-	mov rax,rsi    		; get t9
-	mul rdi
-	add rax,r14    		; +t4
-	adc rdx,0
-	mov r14,0FFFFFFFFFFFFh	; !!!
-	add r8,rax		; +c
-	adc r9,rdx
-	and r14,r8
-	mov [rbx+4*8],r14	; -> this.n[4]
-	shrd r8,r9,48		; !!!
-	xor r9,r9
-	
-	mov rax,01000003D1h
-	mul r8		
-	add rax,r10
-	adc rdx,0
-	mov r10,0FFFFFFFFFFFFFh ; modulus
-	mov r8,rax
-	and rax,r10
-	shrd r8,rdx,52
-	mov [rbx+0*8],rax	; -> this.n[0]
-	add r8,r11
-	mov [rbx+1*8],r8	; -> this.n[1]
-
-	pop r15
-	pop r14
-	pop r13
-	pop r12
-	pop rbx
-	pop rbp
-	ret
-
-	
-	;;  PROC ExSetSquare
-	;;  Register Layout:
-	;;  INPUT: 	rdi	 = a.n
-	;; 	   	rsi  	 = this.a
-	;;  INTERNAL:	rdx:rax  = multiplication accumulator
-	;; 		r9:r8    = c
-	;; 		r10-r13  = t0-t3
-	;; 		r14	 = a.n[0] / t4
-	;; 		r15	 = a.n[1] / t5
-	;; 		rbx	 = a.n[2] / t6
-	;; 		rcx	 = a.n[3] / t7
-	;; 		rbp	 = 0FFFFFFFFFFFFFh / t8
-	;; 		rsi	 = a.n[4] / t9
-	GLOBAL SYM(secp256k1_fe_sqr_inner)
-	ALIGN 32
-SYM(secp256k1_fe_sqr_inner):
-	push rbp
-	push rbx
-	push r12
-	push r13
-	push r14
-	push r15
-	push rsi
-	mov rbp,0FFFFFFFFFFFFFh
-	
-	;; c=a.n[0] * a.n[0]
-   	mov r14,[rdi+0*8]	; r14=a.n[0]
-	mov r10,rbp		; modulus 
-	mov rax,r14
-	mul rax
-	mov r15,[rdi+1*8]	; a.n[1]
-	add r14,r14		; r14=2*a.n[0]
-	mov r8,rax
-	and r10,rax		; only need lower qword
-	shrd r8,rdx,52
-	xor r9,r9
-
-	;; c+=2*a.n[0] * a.n[1]
-	mov rax,r14		; r14=2*a.n[0]
-	mul r15
-	mov rbx,[rdi+2*8]	; rbx=a.n[2]
-	mov r11,rbp 		; modulus
-	add r8,rax
-	adc r9,rdx
-	and r11,r8
-	shrd r8,r9,52
-	xor r9,r9
-	
-	;; c+=2*a.n[0]*a.n[2]+a.n[1]*a.n[1]
-	mov rax,r14
-	mul rbx
-	add r8,rax
-	adc r9,rdx
-
-	mov rax,r15
-	mov r12,rbp		; modulus
-	mul rax
-	mov rcx,[rdi+3*8]	; rcx=a.n[3]
-	add r15,r15		; r15=a.n[1]*2
-	add r8,rax
-	adc r9,rdx
-	and r12,r8		; only need lower dword
-	shrd r8,r9,52
-	xor r9,r9		
-
-	;; c+=2*a.n[0]*a.n[3]+2*a.n[1]*a.n[2]
-	mov rax,r14
-	mul rcx
-	add r8,rax
-	adc r9,rdx
-
-	mov rax,r15		; rax=2*a.n[1]
-	mov r13,rbp		; modulus
-	mul rbx
-	mov rsi,[rdi+4*8]	; rsi=a.n[4]
-	add r8,rax
-	adc r9,rdx
-	and r13,r8
-	shrd r8,r9,52
-	xor r9,r9		
-
-	;; c+=2*a.n[0]*a.n[4]+2*a.n[1]*a.n[3]+a.n[2]*a.n[2]
-	mov rax,r14		; last time we need 2*a.n[0]
-	mul rsi
-	add r8,rax
-	adc r9,rdx
-
-	mov rax,r15
-	mul rcx
-	mov r14,rbp		; modulus
-	add r8,rax
-	adc r9,rdx
-
-	mov rax,rbx
-	mul rax
-	add rbx,rbx		; rcx=2*a.n[2]
-	add r8,rax
-	adc r9,rdx
-	and r14,r8
-	shrd r8,r9,52
-	xor r9,r9		
-
-	;; c+=2*a.n[1]*a.n[4]+2*a.n[2]*a.n[3]
-	mov rax,r15		; last time we need 2*a.n[1]
-	mul rsi
-	add r8,rax
-	adc r9,rdx
-
-	mov rax,rbx
-	mul rcx
-	mov r15,rbp		; modulus
-	add r8,rax
-	adc r9,rdx
-	and r15,r8
-	shrd r8,r9,52
-	xor r9,r9		
-
-	;; c+=2*a.n[2]*a.n[4]+a.n[3]*a.n[3]
-	mov rax,rbx		; last time we need 2*a.n[2]
-	mul rsi
-	add r8,rax
-	adc r9,rdx
-
-	mov rax,rcx		; a.n[3]
-	mul rax
-	mov rbx,rbp		; modulus
-	add r8,rax
-	adc r9,rdx
-	and rbx,r8		; only need lower dword
-	lea rax,[2*rcx]
-	shrd r8,r9,52
-	xor r9,r9		
-
-	;; c+=2*a.n[3]*a.n[4]
-	mul rsi
-	mov rcx,rbp		; modulus
-	add r8,rax
-	adc r9,rdx
-	and rcx,r8		; only need lower dword
-	shrd r8,r9,52
-	xor r9,r9		
-
-	;; c+=a.n[4]*a.n[4]
-	mov rax,rsi
-	mul rax
-	;; mov rbp,rbp		; modulus is already there!
-	add r8,rax
-	adc r9,rdx
-	and rbp,r8 
-	shrd r8,r9,52
-	xor r9,r9		
-
-	mov rsi,r8
-
-	;; *******************************************************
-	jmp common_exit_norm
-	end
-
-	
diff --git a/src/field_5x52_asm_impl.h b/src/field_5x52_asm_impl.h
index f29605b11..98cc004bf 100644
--- a/src/field_5x52_asm_impl.h
+++ b/src/field_5x52_asm_impl.h
@@ -1,13 +1,502 @@
 /**********************************************************************
- * Copyright (c) 2013 Pieter Wuille                                   *
+ * Copyright (c) 2013-2014 Diederik Huys, Pieter Wuille               *
  * Distributed under the MIT software license, see the accompanying   *
  * file COPYING or http://www.opensource.org/licenses/mit-license.php.*
  **********************************************************************/
 
+/**
+ * Changelog:
+ * - March 2013, Diederik Huys:    original version
+ * - November 2014, Pieter Wuille: updated to use Peter Dettman's parallel multiplication algorithm
+ * - December 2014, Pieter Wuille: converted from YASM to GCC inline assembly
+ */
+
 #ifndef _SECP256K1_FIELD_INNER5X52_IMPL_H_
 #define _SECP256K1_FIELD_INNER5X52_IMPL_H_
 
-void __attribute__ ((sysv_abi)) secp256k1_fe_mul_inner(const uint64_t *a, const uint64_t *b, uint64_t *r);
-void __attribute__ ((sysv_abi)) secp256k1_fe_sqr_inner(const uint64_t *a, uint64_t *r);
+SECP256K1_INLINE static void secp256k1_fe_mul_inner(uint64_t *r, const uint64_t *a, const uint64_t * SECP256K1_RESTRICT b) {
+/** Inline-assembly 5x52 limb multiplication: stores the product of a and b, folded back to five limbs, in r[0..4].
+ * Registers: rdx:rax = multiplication accumulator
+ *            r9:r8   = c
+ *            r15:rcx = d
+ *            r10-r14 = a0-a4
+ *            rbx     = b
+ *            rdi     = r
+ *            rsi     = a / t?
+ * In the step comments below, M = 0xfffffffffffff (52-bit limb mask) and R = 0x1000003d10. */
+  uint64_t tmp1, tmp2, tmp3;
+__asm__ __volatile__(
+    "movq 0(%%rsi),%%r10\n"
+    "movq 8(%%rsi),%%r11\n"
+    "movq 16(%%rsi),%%r12\n"
+    "movq 24(%%rsi),%%r13\n"
+    "movq 32(%%rsi),%%r14\n"
+
+    /* d = a3 * b0 */
+    "movq 0(%%rbx),%%rax\n"
+    "mulq %%r13\n"
+    "movq %%rax,%%rcx\n"
+    "movq %%rdx,%%r15\n"
+    /* d += a2 * b1 */
+    "movq 8(%%rbx),%%rax\n"
+    "mulq %%r12\n"
+    "addq %%rax,%%rcx\n"
+    "adcq %%rdx,%%r15\n"
+    /* d += a1 * b2 */
+    "movq 16(%%rbx),%%rax\n"
+    "mulq %%r11\n"
+    "addq %%rax,%%rcx\n"
+    "adcq %%rdx,%%r15\n"
+    /* d += a0 * b3 */
+    "movq 24(%%rbx),%%rax\n"
+    "mulq %%r10\n"
+    "addq %%rax,%%rcx\n"
+    "adcq %%rdx,%%r15\n"
+    /* c = a4 * b4 */
+    "movq 32(%%rbx),%%rax\n"
+    "mulq %%r14\n"
+    "movq %%rax,%%r8\n"
+    "movq %%rdx,%%r9\n"
+    /* d += (c & M) * R */
+    "movq $0xfffffffffffff,%%rdx\n"
+    "andq %%rdx,%%rax\n"
+    "movq $0x1000003d10,%%rdx\n"
+    "mulq %%rdx\n"
+    "addq %%rax,%%rcx\n"
+    "adcq %%rdx,%%r15\n"
+    /* c >>= 52 (%%r8 only) */
+    "shrdq $52,%%r9,%%r8\n"
+    /* t3 (tmp1) = d & M */
+    "movq %%rcx,%%rsi\n"
+    "movq $0xfffffffffffff,%%rdx\n"
+    "andq %%rdx,%%rsi\n"
+    "movq %%rsi,%q1\n"
+    /* d >>= 52 */
+    "shrdq $52,%%r15,%%rcx\n"
+    "xorq %%r15,%%r15\n"
+    /* d += a4 * b0 */
+    "movq 0(%%rbx),%%rax\n"
+    "mulq %%r14\n"
+    "addq %%rax,%%rcx\n"
+    "adcq %%rdx,%%r15\n"
+    /* d += a3 * b1 */
+    "movq 8(%%rbx),%%rax\n"
+    "mulq %%r13\n"
+    "addq %%rax,%%rcx\n"
+    "adcq %%rdx,%%r15\n"
+    /* d += a2 * b2 */
+    "movq 16(%%rbx),%%rax\n"
+    "mulq %%r12\n"
+    "addq %%rax,%%rcx\n"
+    "adcq %%rdx,%%r15\n"
+    /* d += a1 * b3 */
+    "movq 24(%%rbx),%%rax\n"
+    "mulq %%r11\n"
+    "addq %%rax,%%rcx\n"
+    "adcq %%rdx,%%r15\n"
+    /* d += a0 * b4 */
+    "movq 32(%%rbx),%%rax\n"
+    "mulq %%r10\n"
+    "addq %%rax,%%rcx\n"
+    "adcq %%rdx,%%r15\n"
+    /* d += c * R */
+    "movq %%r8,%%rax\n"
+    "movq $0x1000003d10,%%rdx\n"
+    "mulq %%rdx\n"
+    "addq %%rax,%%rcx\n"
+    "adcq %%rdx,%%r15\n"
+    /* t4 = d & M (%%rsi) */
+    "movq %%rcx,%%rsi\n"
+    "movq $0xfffffffffffff,%%rdx\n"
+    "andq %%rdx,%%rsi\n"
+    /* d >>= 52 */
+    "shrdq $52,%%r15,%%rcx\n"
+    "xorq %%r15,%%r15\n"
+    /* tx = t4 >> 48 (tmp3) */
+    "movq %%rsi,%%rax\n"
+    "shrq $48,%%rax\n"
+    "movq %%rax,%q3\n"
+    /* t4 &= (M >> 4) (tmp2) */
+    "movq $0xffffffffffff,%%rax\n"
+    "andq %%rax,%%rsi\n"
+    "movq %%rsi,%q2\n"
+    /* c = a0 * b0 */
+    "movq 0(%%rbx),%%rax\n"
+    "mulq %%r10\n"
+    "movq %%rax,%%r8\n"
+    "movq %%rdx,%%r9\n"
+    /* d += a4 * b1 */
+    "movq 8(%%rbx),%%rax\n"
+    "mulq %%r14\n"
+    "addq %%rax,%%rcx\n"
+    "adcq %%rdx,%%r15\n"
+    /* d += a3 * b2 */
+    "movq 16(%%rbx),%%rax\n"
+    "mulq %%r13\n"
+    "addq %%rax,%%rcx\n"
+    "adcq %%rdx,%%r15\n"
+    /* d += a2 * b3 */
+    "movq 24(%%rbx),%%rax\n"
+    "mulq %%r12\n"
+    "addq %%rax,%%rcx\n"
+    "adcq %%rdx,%%r15\n"
+    /* d += a1 * b4 */
+    "movq 32(%%rbx),%%rax\n"
+    "mulq %%r11\n"
+    "addq %%rax,%%rcx\n"
+    "adcq %%rdx,%%r15\n"
+    /* u0 = d & M (%%rsi) */
+    "movq %%rcx,%%rsi\n"
+    "movq $0xfffffffffffff,%%rdx\n"
+    "andq %%rdx,%%rsi\n"
+    /* d >>= 52 */
+    "shrdq $52,%%r15,%%rcx\n"
+    "xorq %%r15,%%r15\n"
+    /* u0 = (u0 << 4) | tx (%%rsi) */
+    "shlq $4,%%rsi\n"
+    "movq %q3,%%rax\n"
+    "orq %%rax,%%rsi\n"
+    /* c += u0 * (R >> 4) */
+    "movq $0x1000003d1,%%rax\n"
+    "mulq %%rsi\n"
+    "addq %%rax,%%r8\n"
+    "adcq %%rdx,%%r9\n"
+    /* r[0] = c & M (b[0] is read again after this store, so r must not overlap b) */
+    "movq %%r8,%%rax\n"
+    "movq $0xfffffffffffff,%%rdx\n"
+    "andq %%rdx,%%rax\n"
+    "movq %%rax,0(%%rdi)\n"
+    /* c >>= 52 */
+    "shrdq $52,%%r9,%%r8\n"
+    "xorq %%r9,%%r9\n"
+    /* c += a1 * b0 */
+    "movq 0(%%rbx),%%rax\n"
+    "mulq %%r11\n"
+    "addq %%rax,%%r8\n"
+    "adcq %%rdx,%%r9\n"
+    /* c += a0 * b1 */
+    "movq 8(%%rbx),%%rax\n"
+    "mulq %%r10\n"
+    "addq %%rax,%%r8\n"
+    "adcq %%rdx,%%r9\n"
+    /* d += a4 * b2 */
+    "movq 16(%%rbx),%%rax\n"
+    "mulq %%r14\n"
+    "addq %%rax,%%rcx\n"
+    "adcq %%rdx,%%r15\n"
+    /* d += a3 * b3 */
+    "movq 24(%%rbx),%%rax\n"
+    "mulq %%r13\n"
+    "addq %%rax,%%rcx\n"
+    "adcq %%rdx,%%r15\n"
+    /* d += a2 * b4 */
+    "movq 32(%%rbx),%%rax\n"
+    "mulq %%r12\n"
+    "addq %%rax,%%rcx\n"
+    "adcq %%rdx,%%r15\n"
+    /* c += (d & M) * R */
+    "movq %%rcx,%%rax\n"
+    "movq $0xfffffffffffff,%%rdx\n"
+    "andq %%rdx,%%rax\n"
+    "movq $0x1000003d10,%%rdx\n"
+    "mulq %%rdx\n"
+    "addq %%rax,%%r8\n"
+    "adcq %%rdx,%%r9\n"
+    /* d >>= 52 */
+    "shrdq $52,%%r15,%%rcx\n"
+    "xorq %%r15,%%r15\n"
+    /* r[1] = c & M */
+    "movq %%r8,%%rax\n"
+    "movq $0xfffffffffffff,%%rdx\n"
+    "andq %%rdx,%%rax\n"
+    "movq %%rax,8(%%rdi)\n"
+    /* c >>= 52 */
+    "shrdq $52,%%r9,%%r8\n"
+    "xorq %%r9,%%r9\n"
+    /* c += a2 * b0 */
+    "movq 0(%%rbx),%%rax\n"
+    "mulq %%r12\n"
+    "addq %%rax,%%r8\n"
+    "adcq %%rdx,%%r9\n"
+    /* c += a1 * b1 */
+    "movq 8(%%rbx),%%rax\n"
+    "mulq %%r11\n"
+    "addq %%rax,%%r8\n"
+    "adcq %%rdx,%%r9\n"
+    /* c += a0 * b2 (last use of %%r10 = a0) */
+    "movq 16(%%rbx),%%rax\n"
+    "mulq %%r10\n"
+    "addq %%rax,%%r8\n"
+    "adcq %%rdx,%%r9\n"
+    /* fetch t3 (%%r10, overwrites a0), t4 (%%rsi) */
+    "movq %q2,%%rsi\n"
+    "movq %q1,%%r10\n"
+    /* d += a4 * b3 */
+    "movq 24(%%rbx),%%rax\n"
+    "mulq %%r14\n"
+    "addq %%rax,%%rcx\n"
+    "adcq %%rdx,%%r15\n"
+    /* d += a3 * b4 */
+    "movq 32(%%rbx),%%rax\n"
+    "mulq %%r13\n"
+    "addq %%rax,%%rcx\n"
+    "adcq %%rdx,%%r15\n"
+    /* c += (d & M) * R */
+    "movq %%rcx,%%rax\n"
+    "movq $0xfffffffffffff,%%rdx\n"
+    "andq %%rdx,%%rax\n"
+    "movq $0x1000003d10,%%rdx\n"
+    "mulq %%rdx\n"
+    "addq %%rax,%%r8\n"
+    "adcq %%rdx,%%r9\n"
+    /* d >>= 52 (%%rcx only) */
+    "shrdq $52,%%r15,%%rcx\n"
+    /* r[2] = c & M */
+    "movq %%r8,%%rax\n"
+    "movq $0xfffffffffffff,%%rdx\n"
+    "andq %%rdx,%%rax\n"
+    "movq %%rax,16(%%rdi)\n"
+    /* c >>= 52 */
+    "shrdq $52,%%r9,%%r8\n"
+    "xorq %%r9,%%r9\n"
+    /* c += t3 */
+    "addq %%r10,%%r8\n"
+    /* c += d * R */
+    "movq %%rcx,%%rax\n"
+    "movq $0x1000003d10,%%rdx\n"
+    "mulq %%rdx\n"
+    "addq %%rax,%%r8\n"
+    "adcq %%rdx,%%r9\n"
+    /* r[3] = c & M */
+    "movq %%r8,%%rax\n"
+    "movq $0xfffffffffffff,%%rdx\n"
+    "andq %%rdx,%%rax\n"
+    "movq %%rax,24(%%rdi)\n"
+    /* c >>= 52 (%%r8 only) */
+    "shrdq $52,%%r9,%%r8\n"
+    /* c += t4 (%%r8 only) */
+    "addq %%rsi,%%r8\n"
+    /* r[4] = c */
+    "movq %%r8,32(%%rdi)\n"
+: "+S"(a), "=m"(tmp1), "=m"(tmp2), "=m"(tmp3)
+: "b"(b), "D"(r)
+: "%rax", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15", "cc", "memory"
+);
+}
+
+SECP256K1_INLINE static void secp256k1_fe_sqr_inner(uint64_t *r, const uint64_t *a) {
+/** Inline-assembly 5x52 limb squaring: stores the square of a, folded back to five limbs, in r[0..4].
+ * Registers: rdx:rax = multiplication accumulator
+ *            r9:r8   = c
+ *            rcx:rbx = d
+ *            r10-r14 = a0-a4
+ *            r15     = M (0xfffffffffffff)
+ *            rdi     = r
+ *            rsi     = a / t?
+ * In the step comments below, R = 0x1000003d10. */
+  uint64_t tmp1, tmp2, tmp3;
+__asm__ __volatile__(
+    "movq 0(%%rsi),%%r10\n"
+    "movq 8(%%rsi),%%r11\n"
+    "movq 16(%%rsi),%%r12\n"
+    "movq 24(%%rsi),%%r13\n"
+    "movq 32(%%rsi),%%r14\n"
+    "movq $0xfffffffffffff,%%r15\n"
+
+    /* d = (a0*2) * a3 */
+    "leaq (%%r10,%%r10,1),%%rax\n"
+    "mulq %%r13\n"
+    "movq %%rax,%%rbx\n"
+    "movq %%rdx,%%rcx\n"
+    /* d += (a1*2) * a2 */
+    "leaq (%%r11,%%r11,1),%%rax\n"
+    "mulq %%r12\n"
+    "addq %%rax,%%rbx\n"
+    "adcq %%rdx,%%rcx\n"
+    /* c = a4 * a4 */
+    "movq %%r14,%%rax\n"
+    "mulq %%r14\n"
+    "movq %%rax,%%r8\n"
+    "movq %%rdx,%%r9\n"
+    /* d += (c & M) * R */
+    "andq %%r15,%%rax\n"
+    "movq $0x1000003d10,%%rdx\n"
+    "mulq %%rdx\n"
+    "addq %%rax,%%rbx\n"
+    "adcq %%rdx,%%rcx\n"
+    /* c >>= 52 (%%r8 only) */
+    "shrdq $52,%%r9,%%r8\n"
+    /* t3 (tmp1) = d & M */
+    "movq %%rbx,%%rsi\n"
+    "andq %%r15,%%rsi\n"
+    "movq %%rsi,%q1\n"
+    /* d >>= 52 */
+    "shrdq $52,%%rcx,%%rbx\n"
+    "xorq %%rcx,%%rcx\n"
+    /* a4 *= 2 */
+    "addq %%r14,%%r14\n"
+    /* d += a0 * a4 */
+    "movq %%r10,%%rax\n"
+    "mulq %%r14\n"
+    "addq %%rax,%%rbx\n"
+    "adcq %%rdx,%%rcx\n"
+    /* d += (a1*2) * a3 */
+    "leaq (%%r11,%%r11,1),%%rax\n"
+    "mulq %%r13\n"
+    "addq %%rax,%%rbx\n"
+    "adcq %%rdx,%%rcx\n"
+    /* d += a2 * a2 */
+    "movq %%r12,%%rax\n"
+    "mulq %%r12\n"
+    "addq %%rax,%%rbx\n"
+    "adcq %%rdx,%%rcx\n"
+    /* d += c * R */
+    "movq %%r8,%%rax\n"
+    "movq $0x1000003d10,%%rdx\n"
+    "mulq %%rdx\n"
+    "addq %%rax,%%rbx\n"
+    "adcq %%rdx,%%rcx\n"
+    /* t4 = d & M (%%rsi) */
+    "movq %%rbx,%%rsi\n"
+    "andq %%r15,%%rsi\n"
+    /* d >>= 52 */
+    "shrdq $52,%%rcx,%%rbx\n"
+    "xorq %%rcx,%%rcx\n"
+    /* tx = t4 >> 48 (tmp3) */
+    "movq %%rsi,%%rax\n"
+    "shrq $48,%%rax\n"
+    "movq %%rax,%q3\n"
+    /* t4 &= (M >> 4) (tmp2) */
+    "movq $0xffffffffffff,%%rax\n"
+    "andq %%rax,%%rsi\n"
+    "movq %%rsi,%q2\n"
+    /* c = a0 * a0 */
+    "movq %%r10,%%rax\n"
+    "mulq %%r10\n"
+    "movq %%rax,%%r8\n"
+    "movq %%rdx,%%r9\n"
+    /* d += a1 * a4 */
+    "movq %%r11,%%rax\n"
+    "mulq %%r14\n"
+    "addq %%rax,%%rbx\n"
+    "adcq %%rdx,%%rcx\n"
+    /* d += (a2*2) * a3 */
+    "leaq (%%r12,%%r12,1),%%rax\n"
+    "mulq %%r13\n"
+    "addq %%rax,%%rbx\n"
+    "adcq %%rdx,%%rcx\n"
+    /* u0 = d & M (%%rsi) */
+    "movq %%rbx,%%rsi\n"
+    "andq %%r15,%%rsi\n"
+    /* d >>= 52 */
+    "shrdq $52,%%rcx,%%rbx\n"
+    "xorq %%rcx,%%rcx\n"
+    /* u0 = (u0 << 4) | tx (%%rsi) */
+    "shlq $4,%%rsi\n"
+    "movq %q3,%%rax\n"
+    "orq %%rax,%%rsi\n"
+    /* c += u0 * (R >> 4) */
+    "movq $0x1000003d1,%%rax\n"
+    "mulq %%rsi\n"
+    "addq %%rax,%%r8\n"
+    "adcq %%rdx,%%r9\n"
+    /* r[0] = c & M */
+    "movq %%r8,%%rax\n"
+    "andq %%r15,%%rax\n"
+    "movq %%rax,0(%%rdi)\n"
+    /* c >>= 52 */
+    "shrdq $52,%%r9,%%r8\n"
+    "xorq %%r9,%%r9\n"
+    /* a0 *= 2 */
+    "addq %%r10,%%r10\n"
+    /* c += a0 * a1 */
+    "movq %%r10,%%rax\n"
+    "mulq %%r11\n"
+    "addq %%rax,%%r8\n"
+    "adcq %%rdx,%%r9\n"
+    /* d += a2 * a4 */
+    "movq %%r12,%%rax\n"
+    "mulq %%r14\n"
+    "addq %%rax,%%rbx\n"
+    "adcq %%rdx,%%rcx\n"
+    /* d += a3 * a3 */
+    "movq %%r13,%%rax\n"
+    "mulq %%r13\n"
+    "addq %%rax,%%rbx\n"
+    "adcq %%rdx,%%rcx\n"
+    /* c += (d & M) * R */
+    "movq %%rbx,%%rax\n"
+    "andq %%r15,%%rax\n"
+    "movq $0x1000003d10,%%rdx\n"
+    "mulq %%rdx\n"
+    "addq %%rax,%%r8\n"
+    "adcq %%rdx,%%r9\n"
+    /* d >>= 52 */
+    "shrdq $52,%%rcx,%%rbx\n"
+    "xorq %%rcx,%%rcx\n"
+    /* r[1] = c & M */
+    "movq %%r8,%%rax\n"
+    "andq %%r15,%%rax\n"
+    "movq %%rax,8(%%rdi)\n"
+    /* c >>= 52 */
+    "shrdq $52,%%r9,%%r8\n"
+    "xorq %%r9,%%r9\n"
+    /* c += a0 * a2 (last use of %%r10) */
+    "movq %%r10,%%rax\n"
+    "mulq %%r12\n"
+    "addq %%rax,%%r8\n"
+    "adcq %%rdx,%%r9\n"
+    /* fetch t3 (%%r10, overwrites a0),t4 (%%rsi) */
+    "movq %q2,%%rsi\n"
+    "movq %q1,%%r10\n"
+    /* c += a1 * a1 */
+    "movq %%r11,%%rax\n"
+    "mulq %%r11\n"
+    "addq %%rax,%%r8\n"
+    "adcq %%rdx,%%r9\n"
+    /* d += a3 * a4 */
+    "movq %%r13,%%rax\n"
+    "mulq %%r14\n"
+    "addq %%rax,%%rbx\n"
+    "adcq %%rdx,%%rcx\n"
+    /* c += (d & M) * R */
+    "movq %%rbx,%%rax\n"
+    "andq %%r15,%%rax\n"
+    "movq $0x1000003d10,%%rdx\n"
+    "mulq %%rdx\n"
+    "addq %%rax,%%r8\n"
+    "adcq %%rdx,%%r9\n"
+    /* d >>= 52 (%%rbx only) */
+    "shrdq $52,%%rcx,%%rbx\n"
+    /* r[2] = c & M */
+    "movq %%r8,%%rax\n"
+    "andq %%r15,%%rax\n"
+    "movq %%rax,16(%%rdi)\n"
+    /* c >>= 52 */
+    "shrdq $52,%%r9,%%r8\n"
+    "xorq %%r9,%%r9\n"
+    /* c += t3 */
+    "addq %%r10,%%r8\n"
+    /* c += d * R */
+    "movq %%rbx,%%rax\n"
+    "movq $0x1000003d10,%%rdx\n"
+    "mulq %%rdx\n"
+    "addq %%rax,%%r8\n"
+    "adcq %%rdx,%%r9\n"
+    /* r[3] = c & M */
+    "movq %%r8,%%rax\n"
+    "andq %%r15,%%rax\n"
+    "movq %%rax,24(%%rdi)\n"
+    /* c >>= 52 (%%r8 only) */
+    "shrdq $52,%%r9,%%r8\n"
+    /* c += t4 (%%r8 only) */
+    "addq %%rsi,%%r8\n"
+    /* r[4] = c */
+    "movq %%r8,32(%%rdi)\n"
+: "+S"(a), "=m"(tmp1), "=m"(tmp2), "=m"(tmp3)
+: "D"(r)
+: "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15", "cc", "memory"
+);
+}
 
 #endif
diff --git a/src/field_5x52_impl.h b/src/field_5x52_impl.h
index 75b210eaf..63176d6de 100644
--- a/src/field_5x52_impl.h
+++ b/src/field_5x52_impl.h
@@ -102,6 +102,50 @@ static void secp256k1_fe_normalize(secp256k1_fe_t *r) {
 #endif
 }
 
+static void secp256k1_fe_normalize_var(secp256k1_fe_t *r) {
+    uint64_t t0 = r->n[0], t1 = r->n[1], t2 = r->n[2], t3 = r->n[3], t4 = r->n[4];
+    /* Normalize r in place. No constant-time guarantee: the final reduction below is a data-dependent branch. */
+    /* Reduce t4 at the start so there will be at most a single carry from the first pass */
+    uint64_t x = t4 >> 48; t4 &= 0x0FFFFFFFFFFFFULL;
+    uint64_t m;
+
+    /* The first pass folds x * 2^256 back in as x * 0x1000003D1 and ensures the magnitude is 1, ... */
+    t0 += x * 0x1000003D1ULL;
+    t1 += (t0 >> 52); t0 &= 0xFFFFFFFFFFFFFULL;
+    t2 += (t1 >> 52); t1 &= 0xFFFFFFFFFFFFFULL; m = t1;
+    t3 += (t2 >> 52); t2 &= 0xFFFFFFFFFFFFFULL; m &= t2;
+    t4 += (t3 >> 52); t3 &= 0xFFFFFFFFFFFFFULL; m &= t3;
+
+    /* ... except for a possible carry at bit 48 of t4 (i.e. bit 256 of the field element) */
+    VERIFY_CHECK(t4 >> 49 == 0);
+    /* m is the AND of t1..t3, so m == 0xFFFFFFFFFFFFFULL below means every one of those limbs is all ones */
+    /* At most a single final reduction is needed; check if the value is >= the field characteristic */
+    x = (t4 >> 48) | ((t4 == 0x0FFFFFFFFFFFFULL) & (m == 0xFFFFFFFFFFFFFULL)
+        & (t0 >= 0xFFFFEFFFFFC2FULL));
+    /* Variable-time step: the second pass is only executed when a final reduction is required */
+    if (x) {
+        t0 += 0x1000003D1ULL;
+        t1 += (t0 >> 52); t0 &= 0xFFFFFFFFFFFFFULL;
+        t2 += (t1 >> 52); t1 &= 0xFFFFFFFFFFFFFULL;
+        t3 += (t2 >> 52); t2 &= 0xFFFFFFFFFFFFFULL;
+        t4 += (t3 >> 52); t3 &= 0xFFFFFFFFFFFFFULL;
+
+        /* If t4 didn't carry to bit 48 already, then it should have after any final reduction */
+        VERIFY_CHECK(t4 >> 48 == x);
+
+        /* Mask off the possible multiple of 2^256 from the final reduction */
+        t4 &= 0x0FFFFFFFFFFFFULL;
+    }
+
+    r->n[0] = t0; r->n[1] = t1; r->n[2] = t2; r->n[3] = t3; r->n[4] = t4;
+
+#ifdef VERIFY
+    r->magnitude = 1;
+    r->normalized = 1;
+    secp256k1_fe_verify(r);
+#endif
+}
+
 SECP256K1_INLINE static void secp256k1_fe_set_int(secp256k1_fe_t *r, int a) {
     r->n[0] = a;
     r->n[1] = r->n[2] = r->n[3] = r->n[4] = 0;
@@ -255,7 +299,7 @@ static void secp256k1_fe_mul(secp256k1_fe_t *r, const secp256k1_fe_t *a, const s
     secp256k1_fe_verify(b);
     VERIFY_CHECK(r != b);
 #endif
-    secp256k1_fe_mul_inner(a->n, b->n, r->n);
+    secp256k1_fe_mul_inner(r->n, a->n, b->n);
 #ifdef VERIFY
     r->magnitude = 1;
     r->normalized = 0;
@@ -268,7 +312,7 @@ static void secp256k1_fe_sqr(secp256k1_fe_t *r, const secp256k1_fe_t *a) {
     VERIFY_CHECK(a->magnitude <= 8);
     secp256k1_fe_verify(a);
 #endif
-    secp256k1_fe_sqr_inner(a->n, r->n);
+    secp256k1_fe_sqr_inner(r->n, a->n);
 #ifdef VERIFY
     r->magnitude = 1;
     r->normalized = 0;
diff --git a/src/field_5x52_int128_impl.h b/src/field_5x52_int128_impl.h
index e552fb431..ec631833c 100644
--- a/src/field_5x52_int128_impl.h
+++ b/src/field_5x52_int128_impl.h
@@ -15,7 +15,7 @@
 #define VERIFY_BITS(x, n) do { } while(0)
 #endif
 
-SECP256K1_INLINE static void secp256k1_fe_mul_inner(const uint64_t *a, const uint64_t * SECP256K1_RESTRICT b, uint64_t *r) {
+SECP256K1_INLINE static void secp256k1_fe_mul_inner(uint64_t *r, const uint64_t *a, const uint64_t * SECP256K1_RESTRICT b) {
     VERIFY_BITS(a[0], 56);
     VERIFY_BITS(a[1], 56);
     VERIFY_BITS(a[2], 56);
@@ -152,7 +152,7 @@ SECP256K1_INLINE static void secp256k1_fe_mul_inner(const uint64_t *a, const uin
     /* [r4 r3 r2 r1 r0] = [p8 p7 p6 p5 p4 p3 p2 p1 p0] */
 }
 
-SECP256K1_INLINE static void secp256k1_fe_sqr_inner(const uint64_t *a, uint64_t *r) {
+SECP256K1_INLINE static void secp256k1_fe_sqr_inner(uint64_t *r, const uint64_t *a) {
     VERIFY_BITS(a[0], 56);
     VERIFY_BITS(a[1], 56);
     VERIFY_BITS(a[2], 56);
diff --git a/src/field_gmp_impl.h b/src/field_gmp_impl.h
index 8af7dd68f..73a55c4f0 100644
--- a/src/field_gmp_impl.h
+++ b/src/field_gmp_impl.h
@@ -46,6 +46,10 @@ static void secp256k1_fe_normalize(secp256k1_fe_t *r) {
         mpn_sub(r->n, r->n, FIELD_LIMBS, secp256k1_field_p, FIELD_LIMBS);
 }
 
+static void secp256k1_fe_normalize_var(secp256k1_fe_t *r) { /* Normalize r in place; in this backend the _var variant simply forwards to secp256k1_fe_normalize. */
+    secp256k1_fe_normalize(r);
+}
+
 SECP256K1_INLINE static void secp256k1_fe_set_int(secp256k1_fe_t *r, int a) {
     r->n[0] = a;
     for (int i=1; i<FIELD_LIMBS+1; i++)
diff --git a/src/field_impl.h b/src/field_impl.h
index 4d25e5371..24d3104ed 100644
--- a/src/field_impl.h
+++ b/src/field_impl.h
@@ -66,7 +66,7 @@ static int secp256k1_fe_set_hex(secp256k1_fe_t *r, const char *a, int alen) {
     return secp256k1_fe_set_b32(r, tmp);
 }
 
-static int secp256k1_fe_sqrt(secp256k1_fe_t *r, const secp256k1_fe_t *a) {
+static int secp256k1_fe_sqrt_var(secp256k1_fe_t *r, const secp256k1_fe_t *a) {
 
     /** The binary representation of (p + 1)/4 has 3 blocks of 1s, with lengths in
      *  { 2, 22, 223 }. Use an addition chain to calculate 2^n - 1 for each block:
@@ -132,7 +132,7 @@ static int secp256k1_fe_sqrt(secp256k1_fe_t *r, const secp256k1_fe_t *a) {
     secp256k1_fe_sqr(&t1, r);
     secp256k1_fe_negate(&t1, &t1, 1);
     secp256k1_fe_add(&t1, a);
-    secp256k1_fe_normalize(&t1);
+    secp256k1_fe_normalize_var(&t1);
     return secp256k1_fe_is_zero(&t1);
 }
 
@@ -206,7 +206,7 @@ static void secp256k1_fe_inv_var(secp256k1_fe_t *r, const secp256k1_fe_t *a) {
 #elif defined(USE_FIELD_INV_NUM)
     unsigned char b[32];
     secp256k1_fe_t c = *a;
-    secp256k1_fe_normalize(&c);
+    secp256k1_fe_normalize_var(&c);
     secp256k1_fe_get_b32(b, &c);
     secp256k1_num_t n;
     secp256k1_num_set_bin(&n, b, 32);
@@ -218,30 +218,6 @@ static void secp256k1_fe_inv_var(secp256k1_fe_t *r, const secp256k1_fe_t *a) {
 #endif
 }
 
-static void secp256k1_fe_inv_all(size_t len, secp256k1_fe_t r[len], const secp256k1_fe_t a[len]) {
-    if (len < 1)
-        return;
-
-    VERIFY_CHECK((r + len <= a) || (a + len <= r));
-
-    r[0] = a[0];
-
-    size_t i = 0;
-    while (++i < len) {
-        secp256k1_fe_mul(&r[i], &r[i - 1], &a[i]);
-    }
-
-    secp256k1_fe_t u; secp256k1_fe_inv(&u, &r[--i]);
-
-    while (i > 0) {
-        int j = i--;
-        secp256k1_fe_mul(&r[j], &r[i], &u);
-        secp256k1_fe_mul(&u, &u, &a[j]);
-    }
-
-    r[0] = u;
-}
-
 static void secp256k1_fe_inv_all_var(size_t len, secp256k1_fe_t r[len], const secp256k1_fe_t a[len]) {
     if (len < 1)
         return;
@@ -277,7 +253,7 @@ static void secp256k1_fe_start(void) {
 #endif
     if (secp256k1_fe_consts == NULL) {
         secp256k1_fe_inner_start();
-        secp256k1_fe_consts_t *ret = (secp256k1_fe_consts_t*)malloc(sizeof(secp256k1_fe_consts_t));
+        secp256k1_fe_consts_t *ret = (secp256k1_fe_consts_t*)checked_malloc(sizeof(secp256k1_fe_consts_t));
 #ifndef USE_NUM_NONE
         secp256k1_num_set_bin(&ret->p, secp256k1_fe_consts_p, sizeof(secp256k1_fe_consts_p));
 #endif
diff --git a/src/group.h b/src/group.h
index 0f14bd25f..ecfebcdc0 100644
--- a/src/group.h
+++ b/src/group.h
@@ -51,15 +51,16 @@ static void secp256k1_ge_set_xy(secp256k1_ge_t *r, const secp256k1_fe_t *x, cons
 
 /** Set a group element (affine) equal to the point with the given X coordinate, and given oddness
  *  for Y. Return value indicates whether the result is valid. */
-static int secp256k1_ge_set_xo(secp256k1_ge_t *r, const secp256k1_fe_t *x, int odd);
+static int secp256k1_ge_set_xo_var(secp256k1_ge_t *r, const secp256k1_fe_t *x, int odd);
 
 /** Check whether a group element is the point at infinity. */
 static int secp256k1_ge_is_infinity(const secp256k1_ge_t *a);
 
 /** Check whether a group element is valid (i.e., on the curve). */
-static int secp256k1_ge_is_valid(const secp256k1_ge_t *a);
+static int secp256k1_ge_is_valid_var(const secp256k1_ge_t *a);
 
 static void secp256k1_ge_neg(secp256k1_ge_t *r, const secp256k1_ge_t *a);
+static void secp256k1_ge_neg_var(secp256k1_ge_t *r, const secp256k1_ge_t *a);
 
 /** Get a hex representation of a point. *rlen will be overwritten with the real length. */
 static void secp256k1_ge_get_hex(char *r, int *rlen, const secp256k1_ge_t *a);
@@ -84,7 +85,7 @@ static void secp256k1_gej_set_ge(secp256k1_gej_t *r, const secp256k1_ge_t *a);
 static void secp256k1_gej_get_x_var(secp256k1_fe_t *r, const secp256k1_gej_t *a);
 
 /** Set r equal to the inverse of a (i.e., mirrored around the X axis) */
-static void secp256k1_gej_neg(secp256k1_gej_t *r, const secp256k1_gej_t *a);
+static void secp256k1_gej_neg_var(secp256k1_gej_t *r, const secp256k1_gej_t *a);
 
 /** Check whether a group element is the point at infinity. */
 static int secp256k1_gej_is_infinity(const secp256k1_gej_t *a);
diff --git a/src/group_impl.h b/src/group_impl.h
index cbd0d8c4f..1ab5d5fe7 100644
--- a/src/group_impl.h
+++ b/src/group_impl.h
@@ -28,13 +28,17 @@ static int secp256k1_ge_is_infinity(const secp256k1_ge_t *a) {
 }
 
 static void secp256k1_ge_neg(secp256k1_ge_t *r, const secp256k1_ge_t *a) {
-    r->infinity = a->infinity;
-    r->x = a->x;
-    r->y = a->y;
+    *r = *a;
     secp256k1_fe_normalize(&r->y);
     secp256k1_fe_negate(&r->y, &r->y, 1);
 }
 
+static void secp256k1_ge_neg_var(secp256k1_ge_t *r, const secp256k1_ge_t *a) { /* Set r to a with its y field negated; same steps as secp256k1_ge_neg, but using secp256k1_fe_normalize_var. */
+    *r = *a; /* Whole-struct copy, so r starts out identical to a. */
+    secp256k1_fe_normalize_var(&r->y); /* Normalize y before negating it. */
+    secp256k1_fe_negate(&r->y, &r->y, 1); /* In-place negation; the trailing 1 is presumably the magnitude of the normalized input -- confirm against secp256k1_fe_negate. */
+}
+
 static void secp256k1_ge_get_hex(char *r, int *rlen, const secp256k1_ge_t *a) {
     char cx[65]; int lx=65;
     char cy[65]; int ly=65;
@@ -85,15 +89,16 @@ static void secp256k1_ge_set_gej_var(secp256k1_ge_t *r, secp256k1_gej_t *a) {
 
 static void secp256k1_ge_set_all_gej_var(size_t len, secp256k1_ge_t r[len], const secp256k1_gej_t a[len]) {
     size_t count = 0;
-    secp256k1_fe_t az[len];
+    secp256k1_fe_t *az = checked_malloc(sizeof(secp256k1_fe_t) * len);
     for (size_t i=0; i<len; i++) {
         if (!a[i].infinity) {
             az[count++] = a[i].z;
         }
     }
 
-    secp256k1_fe_t azi[count];
+    secp256k1_fe_t *azi = checked_malloc(sizeof(secp256k1_fe_t) * count);
     secp256k1_fe_inv_all_var(count, azi, az);
+    free(az);
 
     count = 0;
     for (size_t i=0; i<len; i++) {
@@ -106,6 +111,7 @@ static void secp256k1_ge_set_all_gej_var(size_t len, secp256k1_ge_t r[len], cons
             secp256k1_fe_mul(&r[i].y, &a[i].y, &zi3);
         }
     }
+    free(azi);
 }
 
 static void secp256k1_gej_set_infinity(secp256k1_gej_t *r) {
@@ -135,16 +141,16 @@ static void secp256k1_ge_clear(secp256k1_ge_t *r) {
     secp256k1_fe_clear(&r->y);
 }
 
-static int secp256k1_ge_set_xo(secp256k1_ge_t *r, const secp256k1_fe_t *x, int odd) {
+static int secp256k1_ge_set_xo_var(secp256k1_ge_t *r, const secp256k1_fe_t *x, int odd) {
     r->x = *x;
     secp256k1_fe_t x2; secp256k1_fe_sqr(&x2, x);
     secp256k1_fe_t x3; secp256k1_fe_mul(&x3, x, &x2);
     r->infinity = 0;
     secp256k1_fe_t c; secp256k1_fe_set_int(&c, 7);
     secp256k1_fe_add(&c, &x3);
-    if (!secp256k1_fe_sqrt(&r->y, &c))
+    if (!secp256k1_fe_sqrt_var(&r->y, &c))
         return 0;
-    secp256k1_fe_normalize(&r->y);
+    secp256k1_fe_normalize_var(&r->y);
     if (secp256k1_fe_is_odd(&r->y) != odd)
         secp256k1_fe_negate(&r->y, &r->y, 1);
     return 1;
@@ -162,12 +168,12 @@ static void secp256k1_gej_get_x_var(secp256k1_fe_t *r, const secp256k1_gej_t *a)
     secp256k1_fe_mul(r, &a->x, &zi2);
 }
 
-static void secp256k1_gej_neg(secp256k1_gej_t *r, const secp256k1_gej_t *a) {
+static void secp256k1_gej_neg_var(secp256k1_gej_t *r, const secp256k1_gej_t *a) {
     r->infinity = a->infinity;
     r->x = a->x;
     r->y = a->y;
     r->z = a->z;
-    secp256k1_fe_normalize(&r->y);
+    secp256k1_fe_normalize_var(&r->y);
     secp256k1_fe_negate(&r->y, &r->y, 1);
 }
 
@@ -175,7 +181,7 @@ static int secp256k1_gej_is_infinity(const secp256k1_gej_t *a) {
     return a->infinity;
 }
 
-static int secp256k1_gej_is_valid(const secp256k1_gej_t *a) {
+static int secp256k1_gej_is_valid_var(const secp256k1_gej_t *a) {
     if (a->infinity)
         return 0;
     /** y^2 = x^3 + 7
@@ -189,12 +195,12 @@ static int secp256k1_gej_is_valid(const secp256k1_gej_t *a) {
     secp256k1_fe_t z6; secp256k1_fe_sqr(&z6, &z2); secp256k1_fe_mul(&z6, &z6, &z2);
     secp256k1_fe_mul_int(&z6, 7);
     secp256k1_fe_add(&x3, &z6);
-    secp256k1_fe_normalize(&y2);
-    secp256k1_fe_normalize(&x3);
+    secp256k1_fe_normalize_var(&y2);
+    secp256k1_fe_normalize_var(&x3);
     return secp256k1_fe_equal(&y2, &x3);
 }
 
-static int secp256k1_ge_is_valid(const secp256k1_ge_t *a) {
+static int secp256k1_ge_is_valid_var(const secp256k1_ge_t *a) {
     if (a->infinity)
         return 0;
     /* y^2 = x^3 + 7 */
@@ -202,8 +208,8 @@ static int secp256k1_ge_is_valid(const secp256k1_ge_t *a) {
     secp256k1_fe_t x3; secp256k1_fe_sqr(&x3, &a->x); secp256k1_fe_mul(&x3, &x3, &a->x);
     secp256k1_fe_t c; secp256k1_fe_set_int(&c, 7);
     secp256k1_fe_add(&x3, &c);
-    secp256k1_fe_normalize(&y2);
-    secp256k1_fe_normalize(&x3);
+    secp256k1_fe_normalize_var(&y2);
+    secp256k1_fe_normalize_var(&x3);
     return secp256k1_fe_equal(&y2, &x3);
 }
 
@@ -255,11 +261,11 @@ static void secp256k1_gej_add_var(secp256k1_gej_t *r, const secp256k1_gej_t *a,
     secp256k1_fe_t u2; secp256k1_fe_mul(&u2, &b->x, &z12);
     secp256k1_fe_t s1; secp256k1_fe_mul(&s1, &a->y, &z22); secp256k1_fe_mul(&s1, &s1, &b->z);
     secp256k1_fe_t s2; secp256k1_fe_mul(&s2, &b->y, &z12); secp256k1_fe_mul(&s2, &s2, &a->z);
-    secp256k1_fe_normalize(&u1);
-    secp256k1_fe_normalize(&u2);
+    secp256k1_fe_normalize_var(&u1);
+    secp256k1_fe_normalize_var(&u2);
     if (secp256k1_fe_equal(&u1, &u2)) {
-        secp256k1_fe_normalize(&s1);
-        secp256k1_fe_normalize(&s2);
+        secp256k1_fe_normalize_var(&s1);
+        secp256k1_fe_normalize_var(&s2);
         if (secp256k1_fe_equal(&s1, &s2)) {
             secp256k1_gej_double_var(r, a);
         } else {
@@ -294,15 +300,14 @@ static void secp256k1_gej_add_ge_var(secp256k1_gej_t *r, const secp256k1_gej_t *
     }
     r->infinity = 0;
     secp256k1_fe_t z12; secp256k1_fe_sqr(&z12, &a->z);
-    secp256k1_fe_t u1 = a->x; secp256k1_fe_normalize(&u1);
+    secp256k1_fe_t u1 = a->x;
     secp256k1_fe_t u2; secp256k1_fe_mul(&u2, &b->x, &z12);
-    secp256k1_fe_t s1 = a->y; secp256k1_fe_normalize(&s1);
+    secp256k1_fe_t s1 = a->y; secp256k1_fe_normalize_var(&s1);
     secp256k1_fe_t s2; secp256k1_fe_mul(&s2, &b->y, &z12); secp256k1_fe_mul(&s2, &s2, &a->z);
-    secp256k1_fe_normalize(&u1);
-    secp256k1_fe_normalize(&u2);
+    secp256k1_fe_normalize_var(&u1);
+    secp256k1_fe_normalize_var(&u2);
     if (secp256k1_fe_equal(&u1, &u2)) {
-        secp256k1_fe_normalize(&s1);
-        secp256k1_fe_normalize(&s2);
+        secp256k1_fe_normalize_var(&s2);
         if (secp256k1_fe_equal(&s1, &s2)) {
             secp256k1_gej_double_var(r, a);
         } else {
@@ -434,7 +439,7 @@ static void secp256k1_ge_start(void) {
     };
 #endif
     if (secp256k1_ge_consts == NULL) {
-        secp256k1_ge_consts_t *ret = (secp256k1_ge_consts_t*)malloc(sizeof(secp256k1_ge_consts_t));
+        secp256k1_ge_consts_t *ret = (secp256k1_ge_consts_t*)checked_malloc(sizeof(secp256k1_ge_consts_t));
 #ifdef USE_ENDOMORPHISM
         VERIFY_CHECK(secp256k1_fe_set_b32(&ret->beta, secp256k1_ge_consts_beta));
 #endif
diff --git a/src/scalar_impl.h b/src/scalar_impl.h
index 7fc159df7..4408cce2d 100644
--- a/src/scalar_impl.h
+++ b/src/scalar_impl.h
@@ -40,7 +40,7 @@ static void secp256k1_scalar_start(void) {
         return;
 
     /* Allocate. */
-    secp256k1_scalar_consts_t *ret = (secp256k1_scalar_consts_t*)malloc(sizeof(secp256k1_scalar_consts_t));
+    secp256k1_scalar_consts_t *ret = (secp256k1_scalar_consts_t*)checked_malloc(sizeof(secp256k1_scalar_consts_t));
 
 #ifndef USE_NUM_NONE
     static const unsigned char secp256k1_scalar_consts_order[] = {
diff --git a/src/secp256k1.c b/src/secp256k1.c
index 20fc27df7..0328db88f 100644
--- a/src/secp256k1.c
+++ b/src/secp256k1.c
@@ -40,15 +40,12 @@ void secp256k1_stop(void) {
     secp256k1_fe_stop();
 }
 
-int secp256k1_ecdsa_verify(const unsigned char *msg, int msglen, const unsigned char *sig, int siglen, const unsigned char *pubkey, int pubkeylen) {
+int secp256k1_ecdsa_verify(const unsigned char *msg32, const unsigned char *sig, int siglen, const unsigned char *pubkey, int pubkeylen) {
     DEBUG_CHECK(secp256k1_ecmult_consts != NULL);
-    DEBUG_CHECK(msg != NULL);
-    DEBUG_CHECK(msglen <= 32);
+    DEBUG_CHECK(msg32 != NULL);
     DEBUG_CHECK(sig != NULL);
     DEBUG_CHECK(pubkey != NULL);
 
-    unsigned char msg32[32] = {0};
-    memcpy(msg32 + 32 - msglen, msg, msglen);
     int ret = -3;
     secp256k1_scalar_t m;
     secp256k1_ecdsa_sig_t s;
@@ -72,10 +69,9 @@ end:
     return ret;
 }
 
-int secp256k1_ecdsa_sign(const unsigned char *message, int messagelen, unsigned char *signature, int *signaturelen, const unsigned char *seckey, const unsigned char *nonce) {
+int secp256k1_ecdsa_sign(const unsigned char *msg32, unsigned char *signature, int *signaturelen, const unsigned char *seckey, const unsigned char *nonce) {
     DEBUG_CHECK(secp256k1_ecmult_gen_consts != NULL);
-    DEBUG_CHECK(message != NULL);
-    DEBUG_CHECK(messagelen <= 32);
+    DEBUG_CHECK(msg32 != NULL);
     DEBUG_CHECK(signature != NULL);
     DEBUG_CHECK(signaturelen != NULL);
     DEBUG_CHECK(seckey != NULL);
@@ -85,12 +81,7 @@ int secp256k1_ecdsa_sign(const unsigned char *message, int messagelen, unsigned
     secp256k1_scalar_set_b32(&sec, seckey, NULL);
     int overflow = 0;
     secp256k1_scalar_set_b32(&non, nonce, &overflow);
-    {
-        unsigned char c[32] = {0};
-        memcpy(c + 32 - messagelen, message, messagelen);
-        secp256k1_scalar_set_b32(&msg, c, NULL);
-        memset(c, 0, 32);
-    }
+    secp256k1_scalar_set_b32(&msg, msg32, NULL);
     int ret = !secp256k1_scalar_is_zero(&non) && !overflow;
     secp256k1_ecdsa_sig_t sig;
     if (ret) {
@@ -105,10 +96,9 @@ int secp256k1_ecdsa_sign(const unsigned char *message, int messagelen, unsigned
     return ret;
 }
 
-int secp256k1_ecdsa_sign_compact(const unsigned char *message, int messagelen, unsigned char *sig64, const unsigned char *seckey, const unsigned char *nonce, int *recid) {
+int secp256k1_ecdsa_sign_compact(const unsigned char *msg32, unsigned char *sig64, const unsigned char *seckey, const unsigned char *nonce, int *recid) {
     DEBUG_CHECK(secp256k1_ecmult_gen_consts != NULL);
-    DEBUG_CHECK(message != NULL);
-    DEBUG_CHECK(messagelen <= 32);
+    DEBUG_CHECK(msg32 != NULL);
     DEBUG_CHECK(sig64 != NULL);
     DEBUG_CHECK(seckey != NULL);
     DEBUG_CHECK(nonce != NULL);
@@ -117,12 +107,7 @@ int secp256k1_ecdsa_sign_compact(const unsigned char *message, int messagelen, u
     secp256k1_scalar_set_b32(&sec, seckey, NULL);
     int overflow = 0;
     secp256k1_scalar_set_b32(&non, nonce, &overflow);
-    {
-        unsigned char c[32] = {0};
-        memcpy(c + 32 - messagelen, message, messagelen);
-        secp256k1_scalar_set_b32(&msg, c, NULL);
-        memset(c, 0, 32);
-    }
+    secp256k1_scalar_set_b32(&msg, msg32, NULL);
     int ret = !secp256k1_scalar_is_zero(&non) && !overflow;
     secp256k1_ecdsa_sig_t sig;
     if (ret) {
@@ -138,18 +123,15 @@ int secp256k1_ecdsa_sign_compact(const unsigned char *message, int messagelen, u
     return ret;
 }
 
-int secp256k1_ecdsa_recover_compact(const unsigned char *msg, int msglen, const unsigned char *sig64, unsigned char *pubkey, int *pubkeylen, int compressed, int recid) {
+int secp256k1_ecdsa_recover_compact(const unsigned char *msg32, const unsigned char *sig64, unsigned char *pubkey, int *pubkeylen, int compressed, int recid) {
     DEBUG_CHECK(secp256k1_ecmult_consts != NULL);
-    DEBUG_CHECK(msg != NULL);
-    DEBUG_CHECK(msglen <= 32);
+    DEBUG_CHECK(msg32 != NULL);
     DEBUG_CHECK(sig64 != NULL);
     DEBUG_CHECK(pubkey != NULL);
     DEBUG_CHECK(pubkeylen != NULL);
     DEBUG_CHECK(recid >= 0 && recid <= 3);
 
     int ret = 0;
-    unsigned char msg32[32] = {0};
-    memcpy(msg32 + 32 - msglen, msg, msglen);
     secp256k1_scalar_t m;
     secp256k1_ecdsa_sig_t sig;
     int overflow = 0;
diff --git a/src/tests.c b/src/tests.c
index 78cdd67f2..7ebb19ff9 100644
--- a/src/tests.c
+++ b/src/tests.c
@@ -11,6 +11,8 @@
 #include <stdio.h>
 #include <stdlib.h>
 
+#include <time.h>
+
 #include "secp256k1.c"
 #include "testrand_impl.h"
 
@@ -46,7 +48,7 @@ void random_group_element_test(secp256k1_ge_t *ge) {
     secp256k1_fe_t fe;
     do {
         random_field_element_test(&fe);
-        if (secp256k1_ge_set_xo(ge, &fe, secp256k1_rand32() & 1))
+        if (secp256k1_ge_set_xo_var(ge, &fe, secp256k1_rand32() & 1))
             break;
     } while(1);
 }
@@ -400,6 +402,30 @@ void scalar_test(void) {
         CHECK(secp256k1_scalar_eq(&r1, &r2));
     }
 
+    {
+        /* Test multiplicative identity. */
+        secp256k1_scalar_t r1, v1;
+        secp256k1_scalar_set_int(&v1,1);
+        secp256k1_scalar_mul(&r1, &s1, &v1);
+        CHECK(secp256k1_scalar_eq(&r1, &s1));
+    }
+
+    {
+        /* Test additive identity. */
+        secp256k1_scalar_t r1, v0;
+        secp256k1_scalar_set_int(&v0,0);
+        secp256k1_scalar_add(&r1, &s1, &v0);
+        CHECK(secp256k1_scalar_eq(&r1, &s1));
+    }
+
+    {
+        /* Test zero product property. */
+        secp256k1_scalar_t r1, v0;
+        secp256k1_scalar_set_int(&v0,0);
+        secp256k1_scalar_mul(&r1, &s1, &v0);
+        CHECK(secp256k1_scalar_eq(&r1, &v0));
+    }
+
 }
 
 void run_scalar_tests(void) {
@@ -411,9 +437,12 @@ void run_scalar_tests(void) {
         /* (-1)+1 should be zero. */
         secp256k1_scalar_t s, o;
         secp256k1_scalar_set_int(&s, 1);
+        CHECK(secp256k1_scalar_is_one(&s));
         secp256k1_scalar_negate(&o, &s);
         secp256k1_scalar_add(&o, &o, &s);
         CHECK(secp256k1_scalar_is_zero(&o));
+        secp256k1_scalar_negate(&o, &o);
+        CHECK(secp256k1_scalar_is_zero(&o));
     }
 
 #ifndef USE_NUM_NONE
@@ -459,14 +488,14 @@ void random_fe_non_zero(secp256k1_fe_t *nz) {
 void random_fe_non_square(secp256k1_fe_t *ns) {
     random_fe_non_zero(ns);
     secp256k1_fe_t r;
-    if (secp256k1_fe_sqrt(&r, ns)) {
+    if (secp256k1_fe_sqrt_var(&r, ns)) {
         secp256k1_fe_negate(ns, ns, 1);
     }
 }
 
 int check_fe_equal(const secp256k1_fe_t *a, const secp256k1_fe_t *b) {
     secp256k1_fe_t an = *a; secp256k1_fe_normalize(&an);
-    secp256k1_fe_t bn = *b; secp256k1_fe_normalize(&bn);
+    secp256k1_fe_t bn = *b; secp256k1_fe_normalize_var(&bn);
     return secp256k1_fe_equal(&an, &bn);
 }
 
@@ -476,6 +505,55 @@ int check_fe_inverse(const secp256k1_fe_t *a, const secp256k1_fe_t *ai) {
     return check_fe_equal(&x, &one);
 }
 
+void run_field_misc(void) { /* Randomized consistency checks of fe equality/comparison, cmov, add, mul_int, mul and negate. */
+    const unsigned char f32_5[32] = { /* 32 bytes, all zero except a final 0x05; parsed into fe5 below. */
+        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x05,
+    };
+    secp256k1_fe_t x;
+    secp256k1_fe_t y;
+    secp256k1_fe_t z;
+    secp256k1_fe_t q;
+    secp256k1_fe_t fe5;
+    CHECK(secp256k1_fe_set_b32(&fe5, f32_5));
+    for (int i=0; i<5*count; i++) { /* 5*count rounds, each with a fresh x and a fresh non-zero y. */
+        random_fe(&x);
+        random_fe_non_zero(&y);
+        /* Test the fe equality and comparison operations. */
+        CHECK(secp256k1_fe_cmp_var(&x, &x) == 0);
+        CHECK(secp256k1_fe_equal(&x, &x));
+        z = x;
+        secp256k1_fe_add(&z,&y);
+        secp256k1_fe_normalize(&z); /* z = x + y; y was drawn non-zero, so z is expected to differ from x. */
+        /* Test the conditional move. */
+        secp256k1_fe_cmov(&z, &x, 0); /* Flag 0: z must be left alone (the next two checks require z != x). */
+        CHECK(secp256k1_fe_equal(&x, &z) == 0);
+        CHECK(secp256k1_fe_cmp_var(&x, &z) != 0);
+        secp256k1_fe_cmov(&y, &x, 1); /* Flag 1: y must now equal x (checked on the next line). */
+        CHECK(secp256k1_fe_equal(&x, &y));
+        /* Test that mul_int, mul, and add agree. */
+        secp256k1_fe_add(&y, &x);
+        secp256k1_fe_add(&y, &x); /* y = x + x + x, built by repeated addition. */
+        z = x;
+        secp256k1_fe_mul_int(&z, 3); /* z = 3 * x via mul_int. */
+        CHECK(check_fe_equal(&y, &z));
+        secp256k1_fe_add(&y, &x); /* y = 4x. */
+        secp256k1_fe_add(&z, &x); /* z = 3x + x. */
+        CHECK(check_fe_equal(&z, &y));
+        z = x;
+        secp256k1_fe_mul_int(&z, 5); /* z = 5 * x via mul_int. */
+        secp256k1_fe_mul(&q, &x, &fe5); /* q = x * fe5 via the full field multiplication; must agree with z. */
+        CHECK(check_fe_equal(&z, &q));
+        secp256k1_fe_negate(&x, &x, 1); /* x becomes -x, so the two adds below subtract the original x. */
+        secp256k1_fe_add(&z, &x);
+        secp256k1_fe_add(&q, &x);
+        CHECK(check_fe_equal(&y, &z)); /* 5x - x must equal the 4x accumulated in y. */
+        CHECK(check_fe_equal(&q, &y));
+    }
+}
+
 void run_field_inv(void) {
     secp256k1_fe_t x, xi, xii;
     for (int i=0; i<10*count; i++) {
@@ -498,23 +576,6 @@ void run_field_inv_var(void) {
     }
 }
 
-void run_field_inv_all(void) {
-    secp256k1_fe_t x[16], xi[16], xii[16];
-    /* Check it's safe to call for 0 elements */
-    secp256k1_fe_inv_all(0, xi, x);
-    for (int i=0; i<count; i++) {
-        size_t len = (secp256k1_rand32() & 15) + 1;
-        for (size_t j=0; j<len; j++)
-            random_fe_non_zero(&x[j]);
-        secp256k1_fe_inv_all(len, xi, x);
-        for (size_t j=0; j<len; j++)
-            CHECK(check_fe_inverse(&x[j], &xi[j]));
-        secp256k1_fe_inv_all(len, xii, xi);
-        for (size_t j=0; j<len; j++)
-            CHECK(check_fe_equal(&x[j], &xii[j]));
-    }
-}
-
 void run_field_inv_all_var(void) {
     secp256k1_fe_t x[16], xi[16], xii[16];
     /* Check it's safe to call for 0 elements */
@@ -549,7 +610,7 @@ void run_sqr(void) {
 
 void test_sqrt(const secp256k1_fe_t *a, const secp256k1_fe_t *k) {
     secp256k1_fe_t r1, r2;
-    int v = secp256k1_fe_sqrt(&r1, a);
+    int v = secp256k1_fe_sqrt_var(&r1, a);
     CHECK((v == 0) == (k == NULL));
 
     if (k != NULL) {
@@ -769,6 +830,7 @@ void run_ecmult_chain(void) {
 }
 
 void test_point_times_order(const secp256k1_gej_t *point) {
+    unsigned char pub[65];
     /* X * (point + G) + (order-X) * (pointer + G) = 0 */
     secp256k1_scalar_t x;
     random_scalar_order_test(&x);
@@ -779,27 +841,36 @@ void test_point_times_order(const secp256k1_gej_t *point) {
     secp256k1_ecmult(&res2, point, &nx, &nx); /* calc res2 = (order - x) * point + (order - x) * G; */
     secp256k1_gej_add_var(&res1, &res1, &res2);
     CHECK(secp256k1_gej_is_infinity(&res1));
-    CHECK(secp256k1_gej_is_valid(&res1) == 0);
+    CHECK(secp256k1_gej_is_valid_var(&res1) == 0);
     secp256k1_ge_t res3;
     secp256k1_ge_set_gej(&res3, &res1);
     CHECK(secp256k1_ge_is_infinity(&res3));
-    CHECK(secp256k1_ge_is_valid(&res3) == 0);
+    CHECK(secp256k1_ge_is_valid_var(&res3) == 0);
+    int psize = 65;
+    CHECK(secp256k1_eckey_pubkey_serialize(&res3, pub, &psize, 0) == 0);
+    psize = 65;
+    CHECK(secp256k1_eckey_pubkey_serialize(&res3, pub, &psize, 1) == 0);
 }
 
 void run_point_times_order(void) {
     secp256k1_fe_t x; VERIFY_CHECK(secp256k1_fe_set_hex(&x, "02", 2));
     for (int i=0; i<500; i++) {
         secp256k1_ge_t p;
-        if (secp256k1_ge_set_xo(&p, &x, 1)) {
-            CHECK(secp256k1_ge_is_valid(&p));
+        if (secp256k1_ge_set_xo_var(&p, &x, 1)) {
+            CHECK(secp256k1_ge_is_valid_var(&p));
             secp256k1_gej_t j;
             secp256k1_gej_set_ge(&j, &p);
-            CHECK(secp256k1_gej_is_valid(&j));
+            CHECK(secp256k1_gej_is_valid_var(&j));
             test_point_times_order(&j);
         }
         secp256k1_fe_sqr(&x, &x);
     }
-    char c[65]; int cl=65;
+    char c[65];
+    int cl = 1;
+    c[1] = 123;
+    secp256k1_fe_get_hex(c, &cl, &x); /* Check that fe_get_hex handles a too short input. */
+    CHECK(c[1] == 123);
+    cl = 65;
     secp256k1_fe_get_hex(c, &cl, &x);
     CHECK(strcmp(c, "7603CB59B0EF6C63FE6084792A0C378CDB3233A80F8A9A09A877DEAD31B38C45") == 0);
 }
@@ -894,7 +965,10 @@ void test_ecdsa_end_to_end(void) {
     /* Construct and verify corresponding public key. */
     CHECK(secp256k1_ec_seckey_verify(privkey) == 1);
     unsigned char pubkey[65]; int pubkeylen = 65;
-    CHECK(secp256k1_ec_pubkey_create(pubkey, &pubkeylen, privkey, secp256k1_rand32() % 2) == 1);
+    CHECK(secp256k1_ec_pubkey_create(pubkey, &pubkeylen, privkey, (secp256k1_rand32() & 3) != 0) == 1);
+    if (secp256k1_rand32() & 1) {
+        CHECK(secp256k1_ec_pubkey_decompress(pubkey, &pubkeylen));
+    }
     CHECK(secp256k1_ec_pubkey_verify(pubkey, pubkeylen));
 
     /* Verify private key import and export. */
@@ -935,38 +1009,96 @@ void test_ecdsa_end_to_end(void) {
     while(1) {
         unsigned char rnd[32];
         secp256k1_rand256_test(rnd);
-        if (secp256k1_ecdsa_sign(message, 32, signature, &signaturelen, privkey, rnd) == 1) {
+        if (secp256k1_ecdsa_sign(message, signature, &signaturelen, privkey, rnd) == 1) {
             break;
         }
     }
     /* Verify. */
-    CHECK(secp256k1_ecdsa_verify(message, 32, signature, signaturelen, pubkey, pubkeylen) == 1);
+    CHECK(secp256k1_ecdsa_verify(message, signature, signaturelen, pubkey, pubkeylen) == 1);
     /* Destroy signature and verify again. */
     signature[signaturelen - 1 - secp256k1_rand32() % 20] += 1 + (secp256k1_rand32() % 255);
-    CHECK(secp256k1_ecdsa_verify(message, 32, signature, signaturelen, pubkey, pubkeylen) != 1);
+    CHECK(secp256k1_ecdsa_verify(message, signature, signaturelen, pubkey, pubkeylen) != 1);
 
     /* Compact sign. */
     unsigned char csignature[64]; int recid = 0;
     while(1) {
         unsigned char rnd[32];
         secp256k1_rand256_test(rnd);
-        if (secp256k1_ecdsa_sign_compact(message, 32, csignature, privkey, rnd, &recid) == 1) {
+        if (secp256k1_ecdsa_sign_compact(message, csignature, privkey, rnd, &recid) == 1) {
             break;
         }
     }
     /* Recover. */
     unsigned char recpubkey[65]; int recpubkeylen = 0;
-    CHECK(secp256k1_ecdsa_recover_compact(message, 32, csignature, recpubkey, &recpubkeylen, pubkeylen == 33, recid) == 1);
+    CHECK(secp256k1_ecdsa_recover_compact(message, csignature, recpubkey, &recpubkeylen, pubkeylen == 33, recid) == 1);
     CHECK(recpubkeylen == pubkeylen);
     CHECK(memcmp(pubkey, recpubkey, pubkeylen) == 0);
     /* Destroy signature and verify again. */
     csignature[secp256k1_rand32() % 64] += 1 + (secp256k1_rand32() % 255);
-    CHECK(secp256k1_ecdsa_recover_compact(message, 32, csignature, recpubkey, &recpubkeylen, pubkeylen == 33, recid) != 1 ||
+    CHECK(secp256k1_ecdsa_recover_compact(message, csignature, recpubkey, &recpubkeylen, pubkeylen == 33, recid) != 1 ||
           memcmp(pubkey, recpubkey, pubkeylen) != 0);
     CHECK(recpubkeylen == pubkeylen);
 
 }
 
+void test_random_pubkeys(void) { /* Feed one randomly shaped buffer to secp256k1_eckey_pubkey_parse and, if it is accepted, check serialize/parse round-trips. */
+    unsigned char in[65];
+    /* Generate some randomly sized pubkeys. */
+    uint32_t r = secp256k1_rand32(); /* One 32-bit random word; its bits are consumed in slices by the shifts below. */
+    int len = (r & 3) == 0 ? 65 : 33; /* 65 when the two low bits are zero, otherwise 33. */
+    r>>=2;
+    if ((r & 3) == 0) len = (r & 252) >> 3; /* When the next two bits are zero, override len with a value in 0..31. */
+    r>>=8;
+    if (len == 65) {
+      in[0] = (r & 2) ? 4 : (r & 1? 6 : 7); /* First byte is 4, 6 or 7 for 65-byte inputs. */
+    } else {
+      in[0] = (r & 1) ? 2 : 3; /* First byte is 2 or 3 otherwise (this branch is also taken for the 0..31 lengths). */
+    }
+    r>>=2;
+    if ((r & 7) == 0) in[0] = (r & 2040) >> 3; /* When the next three bits are zero, override the first byte with a value in 0..255. */
+    r>>=11;
+    if (len > 1) secp256k1_rand256(&in[1]); /* Presumably writes 32 random bytes to in[1..32] -- confirm against secp256k1_rand256. */
+    if (len > 33) secp256k1_rand256(&in[33]); /* Likewise for in[33..64]; only reached when len is 65. */
+    secp256k1_ge_t elem;
+    secp256k1_ge_t elem2;
+    if (secp256k1_eckey_pubkey_parse(&elem, in, len)) { /* Everything below runs only for inputs the parser accepts. */
+        unsigned char out[65];
+        unsigned char firstb;
+        int res;
+        int size = len;
+        firstb = in[0]; /* Remember the original prefix; in[] is overwritten further down. */
+        /* If the pubkey can be parsed, it should round-trip... */
+        CHECK(secp256k1_eckey_pubkey_serialize(&elem, out, &size, len == 33)); /* Last argument is 1 exactly when the input was 33 bytes (presumably the "compressed" flag -- confirm). */
+        CHECK(size == len);
+        CHECK(memcmp(&in[1], &out[1], len-1) == 0);
+        /* ... except for the type of hybrid inputs. */
+        if ((in[0] != 6) && (in[0] != 7)) CHECK(in[0] == out[0]);
+        size = 65;
+        CHECK(secp256k1_eckey_pubkey_serialize(&elem, in, &size, 0)); /* Re-serialize into in[] with flag 0; the next check requires a 65-byte result. */
+        CHECK(size == 65);
+        CHECK(secp256k1_eckey_pubkey_parse(&elem2, in, size));
+        CHECK(ge_equals_ge(&elem,&elem2));
+        /* Check that the X9.62 hybrid type is checked. */
+        in[0] = (r & 1) ? 6 : 7; /* Replace the prefix of the 65-byte serialization with 6 or 7, chosen by the next random bit. */
+        res = secp256k1_eckey_pubkey_parse(&elem2, in, size);
+        if (firstb == 2 || firstb == 3) { /* For an original prefix of 2 or 3, only prefix + 4 may be accepted (2 -> 6, 3 -> 7). */
+            if (in[0] == firstb + 4) CHECK(res);
+            else CHECK(!res);
+        }
+        if (res) { /* An accepted hybrid key must decode to the same point and re-serialize to the same 64 trailing bytes. */
+            CHECK(ge_equals_ge(&elem,&elem2));
+            CHECK(secp256k1_eckey_pubkey_serialize(&elem, out, &size, 0));
+            CHECK(memcmp(&in[1], &out[1], 64) == 0);
+        }
+    }
+}
+
+void run_random_pubkeys(void) { /* Test driver: calls test_random_pubkeys 10*count times. */
+    for (int i=0; i<10*count; i++) {
+        test_random_pubkeys();
+    }
+}
+
 void run_ecdsa_end_to_end(void) {
     for (int i=0; i<64*count; i++) {
         test_ecdsa_end_to_end();
@@ -995,10 +1127,10 @@ void test_ecdsa_edge_cases(void) {
     };
     unsigned char pubkey[65];
     int pubkeylen = 65;
-    CHECK(!secp256k1_ecdsa_recover_compact(msg32, 32, sig64, pubkey, &pubkeylen, 0, 0));
-    CHECK(secp256k1_ecdsa_recover_compact(msg32, 32, sig64, pubkey, &pubkeylen, 0, 1));
-    CHECK(!secp256k1_ecdsa_recover_compact(msg32, 32, sig64, pubkey, &pubkeylen, 0, 2));
-    CHECK(!secp256k1_ecdsa_recover_compact(msg32, 32, sig64, pubkey, &pubkeylen, 0, 3));
+    CHECK(!secp256k1_ecdsa_recover_compact(msg32, sig64, pubkey, &pubkeylen, 0, 0));
+    CHECK(secp256k1_ecdsa_recover_compact(msg32, sig64, pubkey, &pubkeylen, 0, 1));
+    CHECK(!secp256k1_ecdsa_recover_compact(msg32, sig64, pubkey, &pubkeylen, 0, 2));
+    CHECK(!secp256k1_ecdsa_recover_compact(msg32, sig64, pubkey, &pubkeylen, 0, 3));
 
     /* signature (r,s) = (4,4), which can be recovered with all 4 recids. */
     const unsigned char sigb64[64] = {
@@ -1016,6 +1148,36 @@ void test_ecdsa_edge_cases(void) {
     for (int recid = 0; recid < 4; recid++) {
         /* (4,4) encoded in DER. */
         unsigned char sigbder[8] = {0x30, 0x06, 0x02, 0x01, 0x04, 0x02, 0x01, 0x04};
+        unsigned char sigcder_zr[7] = {0x30, 0x05, 0x02, 0x00, 0x02, 0x01, 0x01};
+        unsigned char sigcder_zs[7] = {0x30, 0x05, 0x02, 0x01, 0x01, 0x02, 0x00};
+        unsigned char sigbderalt1[39] = {
+            0x30, 0x25, 0x02, 0x20, 0x00, 0x00, 0x00, 0x00,
+            0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+            0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+            0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+            0x00, 0x00, 0x00, 0x04, 0x02, 0x01, 0x04,
+        };
+        unsigned char sigbderalt2[39] = {
+            0x30, 0x25, 0x02, 0x01, 0x04, 0x02, 0x20, 0x00,
+            0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+            0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+            0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+            0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x04,
+        };
+        unsigned char sigbderalt3[40] = {
+            0x30, 0x26, 0x02, 0x21, 0x00, 0x00, 0x00, 0x00,
+            0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+            0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+            0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+            0x00, 0x00, 0x00, 0x00, 0x04, 0x02, 0x01, 0x04,
+        };
+        unsigned char sigbderalt4[40] = {
+            0x30, 0x26, 0x02, 0x01, 0x04, 0x02, 0x21, 0x00,
+            0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+            0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+            0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+            0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x04,
+        };
         /* (order + r,4) encoded in DER. */
         unsigned char sigbderlong[40] = {
             0x30, 0x26, 0x02, 0x21, 0x00, 0xFF, 0xFF, 0xFF,
@@ -1024,18 +1186,45 @@ void test_ecdsa_edge_cases(void) {
             0xE6, 0xAF, 0x48, 0xA0, 0x3B, 0xBF, 0xD2, 0x5E,
             0x8C, 0xD0, 0x36, 0x41, 0x45, 0x02, 0x01, 0x04
         };
-        CHECK(secp256k1_ecdsa_recover_compact(msg32, 32, sigb64, pubkeyb, &pubkeyblen, 1, recid));
-        CHECK(secp256k1_ecdsa_verify(msg32, 32, sigbder, sizeof(sigbder), pubkeyb, pubkeyblen) == 1);
+        CHECK(secp256k1_ecdsa_recover_compact(msg32, sigb64, pubkeyb, &pubkeyblen, 1, recid));
+        CHECK(secp256k1_ecdsa_verify(msg32, sigbder, sizeof(sigbder), pubkeyb, pubkeyblen) == 1);
         for (int recid2 = 0; recid2 < 4; recid2++) {
             unsigned char pubkey2b[33];
             int pubkey2blen = 33;
-            CHECK(secp256k1_ecdsa_recover_compact(msg32, 32, sigb64, pubkey2b, &pubkey2blen, 1, recid2));
+            CHECK(secp256k1_ecdsa_recover_compact(msg32, sigb64, pubkey2b, &pubkey2blen, 1, recid2));
             /* Verifying with (order + r,4) should always fail. */
-            CHECK(secp256k1_ecdsa_verify(msg32, 32, sigbderlong, sizeof(sigbderlong), pubkey2b, pubkey2blen) != 1);
+            CHECK(secp256k1_ecdsa_verify(msg32, sigbderlong, sizeof(sigbderlong), pubkey2b, pubkey2blen) != 1);
         }
+        /* DER parsing tests. */
+        /* Zero length r/s. */
+        CHECK(secp256k1_ecdsa_verify(msg32, sigcder_zr, sizeof(sigcder_zr), pubkeyb, pubkeyblen) == -2);
+        CHECK(secp256k1_ecdsa_verify(msg32, sigcder_zs, sizeof(sigcder_zs), pubkeyb, pubkeyblen) == -2);
+        /* Leading zeros. */
+        CHECK(secp256k1_ecdsa_verify(msg32, sigbderalt1, sizeof(sigbderalt1), pubkeyb, pubkeyblen) == 1);
+        CHECK(secp256k1_ecdsa_verify(msg32, sigbderalt2, sizeof(sigbderalt2), pubkeyb, pubkeyblen) == 1);
+        CHECK(secp256k1_ecdsa_verify(msg32, sigbderalt3, sizeof(sigbderalt3), pubkeyb, pubkeyblen) == 1);
+        CHECK(secp256k1_ecdsa_verify(msg32, sigbderalt4, sizeof(sigbderalt4), pubkeyb, pubkeyblen) == 1);
+        sigbderalt3[4] = 1;
+        CHECK(secp256k1_ecdsa_verify(msg32, sigbderalt3, sizeof(sigbderalt3), pubkeyb, pubkeyblen) == -2);
+        sigbderalt4[7] = 1;
+        CHECK(secp256k1_ecdsa_verify(msg32, sigbderalt4, sizeof(sigbderalt4), pubkeyb, pubkeyblen) == -2);
         /* Damage signature. */
         sigbder[7]++;
-        CHECK(secp256k1_ecdsa_verify(msg32, 32, sigbder, sizeof(sigbder), pubkeyb, pubkeyblen) == 0);
+        CHECK(secp256k1_ecdsa_verify(msg32, sigbder, sizeof(sigbder), pubkeyb, pubkeyblen) == 0);
+        sigbder[7]--;
+        CHECK(secp256k1_ecdsa_verify(msg32, sigbder, 6, pubkeyb, pubkeyblen) == -2);
+        CHECK(secp256k1_ecdsa_verify(msg32, sigbder, sizeof(sigbder)-1, pubkeyb, pubkeyblen) == -2);
+        for(int i = 0; i<8; i++) {
+            unsigned char orig = sigbder[i];
+            /* Try every single-byte change: bytes 4 and 7 are the r and s values (expect 0); all other bytes are DER structure (expect -2). */
+            for (int c=0; c<256; c++) {
+                if (c == orig ) continue;
+                sigbder[i] = c;
+                CHECK(secp256k1_ecdsa_verify(msg32, sigbder, sizeof(sigbder), pubkeyb, pubkeyblen) ==
+                  ((i==4 || i==7) ? 0 : -2) ); /* Parentheses required: == binds tighter than ?:. */
+            }
+            sigbder[i] = orig;
+        }
     }
 
     /* Test the case where ECDSA recomputes a point that is infinity. */
@@ -1069,18 +1258,60 @@ void test_ecdsa_edge_cases(void) {
         };
         unsigned char pubkeyc[65];
         int pubkeyclen = 65;
-        CHECK(secp256k1_ecdsa_recover_compact(msg32, 32, sigc64, pubkeyc, &pubkeyclen, 0, 0) == 1);
-        CHECK(secp256k1_ecdsa_verify(msg32, 32, sigcder, sizeof(sigcder), pubkeyc, pubkeyclen) == 1);
+        CHECK(secp256k1_ecdsa_recover_compact(msg32, sigc64, pubkeyc, &pubkeyclen, 0, 0) == 1);
+        CHECK(secp256k1_ecdsa_verify(msg32, sigcder, sizeof(sigcder), pubkeyc, pubkeyclen) == 1);
         sigcder[4] = 0;
         sigc64[31] = 0;
-        CHECK(secp256k1_ecdsa_recover_compact(msg32, 32, sigc64, pubkeyb, &pubkeyblen, 1, 0) == 0);
-        CHECK(secp256k1_ecdsa_verify(msg32, 32, sigcder, sizeof(sigcder), pubkeyc, pubkeyclen) == 0);
+        CHECK(secp256k1_ecdsa_recover_compact(msg32, sigc64, pubkeyb, &pubkeyblen, 1, 0) == 0);
+        CHECK(secp256k1_ecdsa_verify(msg32, sigcder, sizeof(sigcder), pubkeyc, pubkeyclen) == 0);
         sigcder[4] = 1;
         sigcder[7] = 0;
         sigc64[31] = 1;
         sigc64[63] = 0;
-        CHECK(secp256k1_ecdsa_recover_compact(msg32, 32, sigc64, pubkeyb, &pubkeyblen, 1, 0) == 0);
-        CHECK(secp256k1_ecdsa_verify(msg32, 32, sigcder, sizeof(sigcder), pubkeyc, pubkeyclen) == 0);
+        CHECK(secp256k1_ecdsa_recover_compact(msg32, sigc64, pubkeyb, &pubkeyblen, 1, 0) == 0);
+        CHECK(secp256k1_ecdsa_verify(msg32, sigcder, sizeof(sigcder), pubkeyc, pubkeyclen) == 0);
+    }
+
+    /* Signature where s would be zero: signing must fail (return 0), then succeed (return 1) once the message is changed. */
+    {
+        const unsigned char nonce[32] = {
+            0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+            0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+            0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+            0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01,
+        };
+        const unsigned char key[32] = {
+            0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+            0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+            0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+            0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01,
+        };
+        unsigned char msg[32] = {
+            0x86, 0x41, 0x99, 0x81, 0x06, 0x23, 0x44, 0x53,
+            0xaa, 0x5f, 0x9d, 0x6a, 0x31, 0x78, 0xf4, 0xf7,
+            0xb8, 0x12, 0xe0, 0x0b, 0x81, 0x7a, 0x77, 0x62,
+            0x65, 0xdf, 0xdd, 0x31, 0xb9, 0x3e, 0x29, 0xa9,
+        };
+        unsigned char sig[72];
+        int siglen = 72;
+        CHECK(secp256k1_ecdsa_sign(msg, sig, &siglen, key, nonce) == 0);
+        msg[31] = 0xaa;
+        siglen = 72;
+        CHECK(secp256k1_ecdsa_sign(msg, sig, &siglen, key, nonce) == 1);
+    }
+
+    /* Privkey export where pubkey is the point at infinity. */
+    {
+        unsigned char privkey[300];
+        unsigned char seckey[32] = {
+            0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+            0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xfe,
+            0xba, 0xae, 0xdc, 0xe6, 0xaf, 0x48, 0xa0, 0x3b,
+            0xbf, 0xd2, 0x5e, 0x8c, 0xd0, 0x36, 0x41, 0x41,
+        };
+        int outlen = 300;
+        CHECK(!secp256k1_ec_privkey_export(seckey, privkey, &outlen, 0));
+        CHECK(!secp256k1_ec_privkey_export(seckey, privkey, &outlen, 1));
     }
 }
 
@@ -1185,8 +1416,8 @@ int main(int argc, char **argv) {
     /* field tests */
     run_field_inv();
     run_field_inv_var();
-    run_field_inv_all();
     run_field_inv_all_var();
+    run_field_misc();
     run_sqr();
     run_sqrt();
 
@@ -1199,6 +1430,7 @@ int main(int argc, char **argv) {
     run_ecmult_chain();
 
     /* ecdsa tests */
+    run_random_pubkeys();
     run_ecdsa_sign_verify();
     run_ecdsa_end_to_end();
     run_ecdsa_edge_cases();
diff --git a/src/util.h b/src/util.h
index 08b23a9d3..c3a8f3a42 100644
--- a/src/util.h
+++ b/src/util.h
@@ -61,6 +61,12 @@
 #define VERIFY_CHECK(cond) do { (void)(cond); } while(0)
 #endif
 
+static inline void *checked_malloc(size_t size) { /* malloc() wrapper: the result goes through CHECK(ret != NULL) before it is returned. */
+    void *ret = malloc(size);
+    CHECK(ret != NULL); /* NOTE(review): malloc(0) may legally return NULL on some platforms -- confirm no caller passes size 0. */
+    return ret;
+}
+
 /* Macro for restrict, when available and not in a VERIFY build. */
 #if defined(SECP256K1_BUILD) && defined(VERIFY)
 # define SECP256K1_RESTRICT