Squashed 'src/secp256k1/' changes from b0210a9..bccaf86

bccaf86 Merge pull request #150
2a53a47 Merge pull request #151
5f5a31f Merge pull request #149
3907277 Merge pull request #142
a3e0611 Enable tests in x86 travis builds
45da235 x86 builder
8bb0e93 Merge pull request #155
971fe81 build: fix openssl detection for cross builds
f22d73e Explicitly access %0..%2 as 64-bit so we use the right registers for x32 ABI
e66d4d6 Avoid the stack in assembly and use explicit registers
cf7b2b4 Fix ECDSA message hashes to 32 bytes
056ad31 Really compile with -O3 by default
74ad63a Merge pull request #146
9000458 Merge pull request #145
1f46b00 build: fix __builtin_expect detection for clang
aaba2e0 Merge pull request #136
8a0775c Merge pull request #144
ee1eaa7 Merge pull request #141
c88e2b8 Compile with -O3 by default
6558a26 Make the benchmarks print out stats
000bdf6 Rename bench_verify to bench_recovery
7c6fed2 Add a few more additional tests.
992e03b travis: add clang to the test matrix
b43b79a Merge pull request #143
e06a924 Include time.h header for time().
8d11164 Add some additional tests.
3545627 Merge pull request #118
6a9901e Merge pull request #137
376b28b Merge pull request #128
1728806 Merge pull request #138
a5759c5 Check return value of malloc
39bd94d Variable time normalize
ad86bdf Merge pull request #140
54b768c Another redundant secp256k1_fe_normalize
69dcaab Merge pull request #139
1c29f2e Remove redundant secp256k1_fe_normalize from secp256k1_gej_add_ge_var.
2b9388b Remove unused secp256k1_fe_inv_all
f461b76 Allocate precomputation arrays on the heap
b2c9681 Make {mul,sqr}_inner use the same argument order as {mul,sqr}
6793505 Convert YASM code into inline assembly
f048615 Rewrite field assembly to match the C version
3ce74b1 Tweak precomputed table size for G

git-subtree-dir: src/secp256k1
git-subtree-split: bccaf86caa9c44166e5a66600b742c516e03c3f0
10 years ago · ecae2acb06
29 changed files with 1198 additions and 845 deletions
--- a/.travis.yml
+++ b/.travis.yml
@ -1,12 +1,14 @@
-language: cpp
-compiler: gcc
+language: c
+compiler:
+  - clang
+  - gcc
 install:
  - sudo apt-get install -qq libssl-dev
-  - if [ "$BIGNUM" = "gmp" -o "$BIGNUM" = "auto" -o "$FIELD" = "gmp" ]; then sudo apt-get install -qq libgmp-dev; fi
-  - if [ "$FIELD" = "64bit_asm" ]; then sudo apt-get install -qq yasm; fi
+  - if [ "$BIGNUM" = "gmp" -o "$BIGNUM" = "auto" -o "$FIELD" = "gmp" ]; then sudo apt-get install --no-install-recommends --no-upgrade -qq libgmp-dev; fi
+  - if [ -n "$EXTRAPACKAGES" ]; then sudo apt-get update && sudo apt-get install --no-install-recommends --no-upgrade $EXTRAPACKAGES; fi
 env:
  global:
-    - FIELD=auto  BIGNUM=auto  SCALAR=auto  ENDOMORPHISM=no  BUILD=check  EXTRAFLAGS=
+    - FIELD=auto  BIGNUM=auto  SCALAR=auto  ENDOMORPHISM=no  BUILD=check  EXTRAFLAGS= HOST= EXTRAPACKAGES=
  matrix:
    - SCALAR=32bit
    - SCALAR=64bit
@ -22,6 +24,11 @@ env:
    - BIGNUM=none     ENDOMORPHISM=yes
    - BUILD=distcheck
    - EXTRAFLAGS=CFLAGS=-DDETERMINISTIC
+    - HOST=i686-linux-gnu EXTRAPACKAGES="gcc-multilib"
+    - HOST=i686-linux-gnu EXTRAPACKAGES="gcc-multilib" ENDOMORPHISM=yes
 before_script: ./autogen.sh
-script: ./configure --enable-endomorphism=$ENDOMORPHISM --with-field=$FIELD --with-bignum=$BIGNUM --with-scalar=$SCALAR $EXTRAFLAGS && make -j2 $BUILD
+script:
+ - if [ -n "$HOST" ]; then export USE_HOST="--host=$HOST"; fi
+ - if [ "x$HOST" = "xi686-linux-gnu" ]; then export CC="$CC -m32"; fi
+ - ./configure --enable-endomorphism=$ENDOMORPHISM --with-field=$FIELD --with-bignum=$BIGNUM --with-scalar=$SCALAR $EXTRAFLAGS $USE_HOST && make -j2 $BUILD
 os: linux
--- a/Makefile.am
+++ b/Makefile.am
@ -1,12 +1,6 @@
 ACLOCAL_AMFLAGS = -I build-aux/m4

 lib_LTLIBRARIES = libsecp256k1.la
-if USE_ASM
-COMMON_LIB = libsecp256k1_common.la
-else
-COMMON_LIB =
-endif
-noinst_LTLIBRARIES = $(COMMON_LIB)
 include_HEADERS = include/secp256k1.h
 noinst_HEADERS =
 noinst_HEADERS += src/scalar.h
@ -43,30 +37,30 @@ noinst_HEADERS += src/field_gmp.h
 noinst_HEADERS += src/field_gmp_impl.h
 noinst_HEADERS += src/field.h
 noinst_HEADERS += src/field_impl.h
+noinst_HEADERS += src/bench.h

 pkgconfigdir = $(libdir)/pkgconfig
 pkgconfig_DATA = libsecp256k1.pc

-if USE_ASM
-libsecp256k1_common_la_SOURCES = src/field_5x52_asm.asm
-endif
-
 libsecp256k1_la_SOURCES = src/secp256k1.c
 libsecp256k1_la_CPPFLAGS = -I$(top_srcdir)/include $(SECP_INCLUDES)
-libsecp256k1_la_LIBADD = $(COMMON_LIB) $(SECP_LIBS)
+libsecp256k1_la_LIBADD = $(SECP_LIBS)


 noinst_PROGRAMS =
 if USE_BENCHMARK
-noinst_PROGRAMS += bench_verify bench_sign bench_inv
+noinst_PROGRAMS += bench_verify bench_recover bench_sign bench_inv
 bench_verify_SOURCES = src/bench_verify.c
 bench_verify_LDADD = libsecp256k1.la $(SECP_LIBS)
 bench_verify_LDFLAGS = -static
+bench_recover_SOURCES = src/bench_recover.c
+bench_recover_LDADD = libsecp256k1.la $(SECP_LIBS)
+bench_recover_LDFLAGS = -static
 bench_sign_SOURCES = src/bench_sign.c
 bench_sign_LDADD = libsecp256k1.la $(SECP_LIBS)
 bench_sign_LDFLAGS = -static
 bench_inv_SOURCES = src/bench_inv.c
-bench_inv_LDADD = $(COMMON_LIB) $(SECP_LIBS)
+bench_inv_LDADD = $(SECP_LIBS)
 bench_inv_LDFLAGS = -static
 bench_inv_CPPFLAGS = $(SECP_INCLUDES)
 endif
@ -75,15 +69,9 @@ if USE_TESTS
 noinst_PROGRAMS += tests
 tests_SOURCES = src/tests.c
 tests_CPPFLAGS = -DVERIFY $(SECP_INCLUDES) $(SECP_TEST_INCLUDES)
-tests_LDADD = $(COMMON_LIB) $(SECP_LIBS) $(SECP_TEST_LIBS)
+tests_LDADD = $(SECP_LIBS) $(SECP_TEST_LIBS)
 tests_LDFLAGS = -static
 TESTS = tests
 endif

-EXTRA_DIST = autogen.sh nasm_lt.sh
-
-#x86_64 only
-if USE_ASM
-.asm.lo:
-	$(LIBTOOL) --mode=compile --tag YASM $(srcdir)/nasm_lt.sh $(YASM) -f $(YASM_BINFMT) $(YAFLAGS) -I$(srcdir) -I. $< -o $@
-endif
+EXTRA_DIST = autogen.sh
--- a/build-aux/m4/bitcoin_secp.m4
+++ b/build-aux/m4/bitcoin_secp.m4
@ -11,38 +11,16 @@ fi

 dnl 
 AC_DEFUN([SECP_64BIT_ASM_CHECK],[
-if test x"$host_cpu" == x"x86_64"; then
-  AC_CHECK_PROG(YASM, yasm, yasm)
-else
-  if test x"$set_field" = x"64bit_asm"; then
-    AC_MSG_ERROR([$set_field field support explicitly requested but is not compatible with this host])
-  fi
-fi
-if test x$YASM = x; then
-  if test x"$set_field" = x"64bit_asm"; then
-    AC_MSG_ERROR([$set_field field support explicitly requested but yasm was not found])
-  fi
-  has_64bit_asm=no
-else
-  case x"$host_os" in
-  xdarwin*)
-    YASM_BINFMT=macho64
-    ;;
-  x*-gnux32)
-    YASM_BINFMT=elfx32
-    ;;
-  *)
-    YASM_BINFMT=elf64
-    ;;
-  esac
-  if $YASM -f help | grep -q $YASM_BINFMT; then
-    has_64bit_asm=yes
-  else
-    if test x"$set_field" = x"64bit_asm"; then
-      AC_MSG_ERROR([$set_field field support explicitly requested but yasm doesn't support $YASM_BINFMT format])
-    fi
-    AC_MSG_WARN([yasm too old for $YASM_BINFMT format])
-    has_64bit_asm=no
+AC_MSG_CHECKING(for x86_64 assembly availability)
+AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[
+  #include <stdint.h>]],[[
+  uint64_t a = 11, tmp;
+  __asm__ __volatile__("movq $0x100000000,%1; mulq %%rsi" : "+a"(a) : "S"(tmp) : "cc", "%rdx");
+  ]])],[has_64bit_asm=yes],[has_64bit_asm=no])
+AC_MSG_RESULT([$has_64bit_asm])
+if test x"$set_field" == x"64bit_asm"; then
+  if test x"$has_64bit_asm" == x"no"; then
+    AC_MSG_ERROR([$set_field field support explicitly requested but no x86_64 assembly available])
  fi
 fi
 ])
@ -52,8 +30,13 @@ AC_DEFUN([SECP_OPENSSL_CHECK],[
 if test x"$use_pkgconfig" = x"yes"; then
    : #NOP
  m4_ifdef([PKG_CHECK_MODULES],[
-    PKG_CHECK_MODULES([CRYPTO], [libcrypto], [has_libcrypto=yes; AC_DEFINE(HAVE_LIBCRYPTO,1,[Define this symbol if libcrypto is installed])],[has_libcrypto=no])
-    : #NOP
+    PKG_CHECK_MODULES([CRYPTO], [libcrypto], [has_libcrypto=yes],[has_libcrypto=no])
+    if test x"$has_libcrypto" = x"yes"; then
+      TEMP_LIBS="$LIBS"
+      LIBS="$LIBS $CRYPTO_LIBS"
+      AC_CHECK_LIB(crypto, main,[AC_DEFINE(HAVE_LIBCRYPTO,1,[Define this symbol if libcrypto is installed])],[has_libcrypto=no])
+      LIBS="$TEMP_LIBS"
+    fi
  ])
 else
  AC_CHECK_HEADER(openssl/crypto.h,[AC_CHECK_LIB(crypto, main,[has_libcrypto=yes; CRYPTO_LIBS=-lcrypto; AC_DEFINE(HAVE_LIBCRYPTO,1,[Define this symbol if libcrypto is installed])]
--- a/configure.ac
+++ b/configure.ac
@ -18,6 +18,10 @@ AC_PATH_TOOL(AR, ar)
 AC_PATH_TOOL(RANLIB, ranlib)
 AC_PATH_TOOL(STRIP, strip)

+if test "x$CFLAGS" = "x"; then
+  CFLAGS="-O3 -g"
+fi
+
 AC_PROG_CC_C99
 if test x"$ac_cv_prog_cc_c99" == x"no"; then
  AC_MSG_ERROR([c99 compiler support required])
@ -103,7 +107,11 @@ AC_ARG_WITH([scalar], [AS_HELP_STRING([--with-scalar=64bit|32bit|auto],

 AC_CHECK_TYPES([__int128])

-AC_CHECK_DECL(__builtin_expect,AC_DEFINE(HAVE_BUILTIN_EXPECT,1,[Define this symbol if __builtin_expect is available]),,)
+AC_MSG_CHECKING([for __builtin_expect])
+AC_COMPILE_IFELSE([AC_LANG_SOURCE([[void myfunc() {__builtin_expect(0,0);}]])],
+    [ AC_MSG_RESULT([yes]);AC_DEFINE(HAVE_BUILTIN_EXPECT,1,[Define this symbol if __builtin_expect is available]) ],
+    [ AC_MSG_RESULT([no])
+    ])

 if test x"$req_field" = x"auto"; then
  SECP_64BIT_ASM_CHECK
@ -283,7 +291,6 @@ AC_SUBST(SECP_INCLUDES)
 AC_SUBST(SECP_LIBS)
 AC_SUBST(SECP_TEST_LIBS)
 AC_SUBST(SECP_TEST_INCLUDES)
-AC_SUBST(YASM_BINFMT)
 AM_CONDITIONAL([USE_ASM], [test x"$set_field" == x"64bit_asm"])
 AM_CONDITIONAL([USE_TESTS], [test x"$use_tests" != x"no"])
 AM_CONDITIONAL([USE_BENCHMARK], [test x"$use_benchmark" != x"no"])
--- a/include/secp256k1.h
+++ b/include/secp256k1.h
@ -62,8 +62,7 @@ void secp256k1_stop(void);
 *           0: incorrect signature
 *          -1: invalid public key
 *          -2: invalid signature
- * In:       msg:       the message being verified (cannot be NULL)
- *           msglen:    the length of the message (at most 32)
+ * In:       msg32:     the 32-byte message hash being verified (cannot be NULL)
 *           sig:       the signature being verified (cannot be NULL)
 *           siglen:    the length of the signature
 *           pubkey:    the public key to verify with (cannot be NULL)
@ -71,19 +70,17 @@ void secp256k1_stop(void);
 * Requires starting using SECP256K1_START_VERIFY.
 */
 SECP256K1_WARN_UNUSED_RESULT int secp256k1_ecdsa_verify(
-  const unsigned char *msg,
-  int msglen,
+  const unsigned char *msg32,
  const unsigned char *sig,
  int siglen,
  const unsigned char *pubkey,
  int pubkeylen
-) SECP256K1_ARG_NONNULL(1) SECP256K1_ARG_NONNULL(3) SECP256K1_ARG_NONNULL(5);
+) SECP256K1_ARG_NONNULL(1) SECP256K1_ARG_NONNULL(2) SECP256K1_ARG_NONNULL(4);

 /** Create an ECDSA signature.
 *  Returns: 1: signature created
 *           0: nonce invalid, try another one
- *  In:      msg:    the message being signed (cannot be NULL)
- *           msglen: the length of the message being signed (at most 32)
+ *  In:      msg32:  the 32-byte message hash being signed (cannot be NULL)
 *           seckey: pointer to a 32-byte secret key (cannot be NULL, assumed to be valid)
 *           nonce:  pointer to a 32-byte nonce (cannot be NULL, generated with a cryptographic PRNG)
 *  Out:     sig:    pointer to an array where the signature will be placed (cannot be NULL)
@ -92,19 +89,17 @@ SECP256K1_WARN_UNUSED_RESULT int secp256k1_ecdsa_verify(
 * Requires starting using SECP256K1_START_SIGN.
 */
 SECP256K1_WARN_UNUSED_RESULT int secp256k1_ecdsa_sign(
-  const unsigned char *msg,
-  int msglen,
+  const unsigned char *msg32,
  unsigned char *sig,
  int *siglen,
  const unsigned char *seckey,
  const unsigned char *nonce
-) SECP256K1_ARG_NONNULL(1) SECP256K1_ARG_NONNULL(3) SECP256K1_ARG_NONNULL(4) SECP256K1_ARG_NONNULL(5) SECP256K1_ARG_NONNULL(6);
+) SECP256K1_ARG_NONNULL(1) SECP256K1_ARG_NONNULL(2) SECP256K1_ARG_NONNULL(3) SECP256K1_ARG_NONNULL(4) SECP256K1_ARG_NONNULL(5);

 /** Create a compact ECDSA signature (64 byte + recovery id).
 *  Returns: 1: signature created
 *           0: nonce invalid, try another one
- *  In:      msg:    the message being signed (cannot be NULL)
- *           msglen: the length of the message being signed (at most 32)
+ *  In:      msg32:  the 32-byte message hash being signed (cannot be NULL)
 *           seckey: pointer to a 32-byte secret key (cannot be NULL, assumed to be valid)
 *           nonce:  pointer to a 32-byte nonce (cannot be NULL, generated with a cryptographic PRNG)
 *  Out:     sig:    pointer to a 64-byte array where the signature will be placed (cannot be NULL)
@ -112,19 +107,17 @@ SECP256K1_WARN_UNUSED_RESULT int secp256k1_ecdsa_sign(
 * Requires starting using SECP256K1_START_SIGN.
 */
 SECP256K1_WARN_UNUSED_RESULT int secp256k1_ecdsa_sign_compact(
-  const unsigned char *msg,
-  int msglen,
+  const unsigned char *msg32,
  unsigned char *sig64,
  const unsigned char *seckey,
  const unsigned char *nonce,
  int *recid
-) SECP256K1_ARG_NONNULL(1) SECP256K1_ARG_NONNULL(3) SECP256K1_ARG_NONNULL(4) SECP256K1_ARG_NONNULL(5);
+) SECP256K1_ARG_NONNULL(1) SECP256K1_ARG_NONNULL(2) SECP256K1_ARG_NONNULL(3) SECP256K1_ARG_NONNULL(4);

 /** Recover an ECDSA public key from a compact signature.
 *  Returns: 1: public key successfully recovered (which guarantees a correct signature).
 *           0: otherwise.
- *  In:      msg:        the message assumed to be signed (cannot be NULL)
- *           msglen:     the length of the message (at most 32)
+ *  In:      msg32:      the 32-byte message hash assumed to be signed (cannot be NULL)
 *           sig64:      signature as 64 byte array (cannot be NULL)
 *           compressed: whether to recover a compressed or uncompressed pubkey
 *           recid:      the recovery id (0-3, as returned by ecdsa_sign_compact)
@ -133,14 +126,13 @@ SECP256K1_WARN_UNUSED_RESULT int secp256k1_ecdsa_sign_compact(
 * Requires starting using SECP256K1_START_VERIFY.
 */
 SECP256K1_WARN_UNUSED_RESULT int secp256k1_ecdsa_recover_compact(
-  const unsigned char *msg,
-  int msglen,
+  const unsigned char *msg32,
  const unsigned char *sig64,
  unsigned char *pubkey,
  int *pubkeylen,
  int compressed,
  int recid
-) SECP256K1_ARG_NONNULL(1) SECP256K1_ARG_NONNULL(3) SECP256K1_ARG_NONNULL(4) SECP256K1_ARG_NONNULL(5);
+) SECP256K1_ARG_NONNULL(1) SECP256K1_ARG_NONNULL(2) SECP256K1_ARG_NONNULL(3) SECP256K1_ARG_NONNULL(4);

 /** Verify an ECDSA secret key.
 *  Returns: 1: secret key is valid
--- a/nasm_lt.sh
+++ b/nasm_lt.sh
@ -1,57 +0,0 @@
-#! /bin/sh
-command=""
-infile=""
-o_opt=no
-pic=no
-while [ $# -gt 0 ]; do
-    case "$1" in
-        -DPIC|-fPIC|-fpic|-Kpic|-KPIC)
-            if [ "$pic" != "yes" ] ; then
-                command="$command -DPIC"
-                pic=yes
-            fi
-            ;;
-        -f|-fbin|-faout|-faoutb|-fcoff|-felf|-felf64|-fas86| \
-        -fobj|-fwin32|-fwin64|-frdf|-fieee|-fmacho|-fmacho64)
-            # it's a file format specifier for nasm.
-            command="$command $1"
-            ;;
-        -f*)
-            # maybe a code-generation flag for gcc.
-            ;;
-        -[Ii]*)
-            incdir=`echo "$1" | sed 's/^-[Ii]//'`
-            if [ "x$incdir" = x -a "x$2" != x ] ; then
-                case "$2" in
-                    -*) ;;
-                    *) incdir="$2"; shift;;
-                esac
-            fi
-            if [ "x$incdir" != x ] ; then
-                # In the case of NASM, the trailing slash is necessary.
-                incdir=`echo "$incdir" | sed 's%/*$%/%'`
-                command="$command -I$incdir"
-            fi
-            ;;
-        -o*)
-            o_opt=yes
-            command="$command $1"
-            ;;
-        *.asm)
-            infile=$1
-            command="$command $1"
-            ;;
-        *)
-            command="$command $1"
-            ;;
-    esac
-    shift
-done
-if [ "$o_opt" != yes ] ; then
-    # By default, NASM creates an output file
-    # in the same directory as the input file.
-    outfile="-o `echo $infile | sed -e 's%^.*/%%' -e 's%\.[^.]*$%%'`.o"
-    command="$command $outfile"
-fi
-echo $command
-exec $command
--- a/src/bench.h
+++ b/src/bench.h
@ -0,0 +1,48 @
+/**********************************************************************
+ * Copyright (c) 2014 Pieter Wuille                                   *
+ * Distributed under the MIT software license, see the accompanying   *
+ * file COPYING or http://www.opensource.org/licenses/mit-license.php.*
+ **********************************************************************/
+
+#ifndef _SECP256K1_BENCH_H_
+#define _SECP256K1_BENCH_H_
+
+#include <stdio.h>
+#include <math.h>
+#include <sys/time.h>
+
+/** Return the current time in seconds, as reported by gettimeofday(). */
+static double gettimedouble(void) {
+    struct timeval tv;
+    gettimeofday(&tv, NULL);
+    return tv.tv_usec * 0.000001 + tv.tv_sec;
+}
+
+/** Time a benchmark over several runs and print min/avg/max microseconds per iteration.
+ *  In:  benchmark: function being timed; called once per run with data
+ *       setup:     if non-NULL, called with data before each run (outside the timed region)
+ *       teardown:  if non-NULL, called with data after each run (outside the timed region)
+ *       data:      opaque pointer handed unchanged to all three callbacks
+ *       count:     number of timed runs
+ *       iter:      number of iterations one benchmark() call performs; only used to scale the output
+ *  Declared static, like gettimedouble, so that including this header from more than one
+ *  translation unit cannot produce duplicate definitions at link time.
+ */
+static void run_benchmark(void (*benchmark)(void*), void (*setup)(void*), void (*teardown)(void*), void* data, int count, int iter) {
+    double min = HUGE_VAL;
+    double sum = 0.0;
+    double max = 0.0;
+    for (int i = 0; i < count; i++) {
+        if (setup) setup(data);
+        double begin = gettimedouble();
+        benchmark(data);
+        double total = gettimedouble() - begin;
+        if (teardown) teardown(data);
+        if (total < min) min = total;
+        if (total > max) max = total;
+        sum += total;
+    }
+    printf("min %.3fus / avg %.3fus / max %.3fus\n", min * 1000000.0 / iter, (sum / count) * 1000000.0 / iter, max * 1000000.0 / iter);
+}
+
+#endif
--- a/src/bench_inv.c
+++ b/src/bench_inv.c
@ -12,30 +12,41 @@
 #include "field_impl.h"
 #include "group_impl.h"
 #include "scalar_impl.h"
+#include "bench.h"
+
+typedef struct {
+    secp256k1_scalar_t base, x;
+} bench_inv_t;
+
+void bench_inv_setup(void* arg) {
+    bench_inv_t *data = (bench_inv_t*)arg;

-int main(void) {
    static const unsigned char init[32] = {
        0x02, 0x03, 0x05, 0x07, 0x0b, 0x0d, 0x11, 0x13,
        0x17, 0x1d, 0x1f, 0x25, 0x29, 0x2b, 0x2f, 0x35,
        0x3b, 0x3d, 0x43, 0x47, 0x49, 0x4f, 0x53, 0x59,
        0x61, 0x65, 0x67, 0x6b, 0x6d, 0x71, 0x7f, 0x83
    };
-    static const unsigned char fini[32] = {
-        0xba, 0x28, 0x58, 0xd8, 0xaa, 0x11, 0xd6, 0xf2,
-        0xfa, 0xce, 0x50, 0xb1, 0x67, 0x19, 0xb1, 0xa6,
-        0xe0, 0xaa, 0x84, 0x53, 0xf6, 0x80, 0xfc, 0x23,
-        0x88, 0x3c, 0xd6, 0x74, 0x9f, 0x27, 0x09, 0x03
-    };
-    secp256k1_ge_start();
-    secp256k1_scalar_t base, x;
-    secp256k1_scalar_set_b32(&base, init, NULL);
-    secp256k1_scalar_set_b32(&x, init, NULL);
-    for (int i=0; i<1000000; i++) {
-        secp256k1_scalar_inverse(&x, &x);
-        secp256k1_scalar_add(&x, &x, &base);
+
+    secp256k1_scalar_set_b32(&data->base, init, NULL);
+    secp256k1_scalar_set_b32(&data->x, init, NULL);
+}
+
+void bench_inv(void* arg) {
+    bench_inv_t *data = (bench_inv_t*)arg;
+
+    for (int i=0; i<20000; i++) {
+        secp256k1_scalar_inverse(&data->x, &data->x);
+        secp256k1_scalar_add(&data->x, &data->x, &data->base);
    }
-    unsigned char res[32];
-    secp256k1_scalar_get_b32(res, &x);
-    CHECK(memcmp(res, fini, 32) == 0);
+}
+
+int main(void) {
+    secp256k1_ge_start();
+
+    bench_inv_t data;
+    run_benchmark(bench_inv, bench_inv_setup, NULL, &data, 10, 20000);
+
+    secp256k1_ge_stop();
    return 0;
 }
--- a/src/bench_recover.c
+++ b/src/bench_recover.c
@ -0,0 +1,53 @
+/**********************************************************************
+ * Copyright (c) 2014 Pieter Wuille                                   *
+ * Distributed under the MIT software license, see the accompanying   *
+ * file COPYING or http://www.opensource.org/licenses/mit-license.php.*
+ **********************************************************************/
+
+#include "include/secp256k1.h"
+#include "util.h"
+#include "bench.h"
+
+/** Benchmark state: the 32-byte message and 64-byte compact signature used for the next recovery. */
+typedef struct {
+    unsigned char msg[32];
+    unsigned char sig[64];
+} bench_recover_t;
+
+/** Timed body: 20000 chained compact-signature public key recoveries.
+ *  Every recovery is wrapped in CHECK, and each round builds the next message and signature
+ *  from the previous round's message, R value and recovered key.
+ */
+static void bench_recover(void* arg) {
+    bench_recover_t *data = (bench_recover_t*)arg;
+
+    unsigned char pubkey[33];
+    for (int i=0; i<20000; i++) {
+        int pubkeylen = 33;
+        CHECK(secp256k1_ecdsa_recover_compact(data->msg, data->sig, pubkey, &pubkeylen, 1, i % 2));
+        for (int j = 0; j < 32; j++) {
+            data->sig[j + 32] = data->msg[j];    /* Move former message to S. */
+            data->msg[j] = data->sig[j];         /* Move former R to message. */
+            data->sig[j] = pubkey[j + 1];        /* Move recovered pubkey X coordinate to R (which must be a valid X coordinate). */
+        }
+    }
+}
+
+/** Per-run setup: reset the state to fixed byte patterns (msg = 1..32, sig = 65..128). */
+static void bench_recover_setup(void* arg) {
+    bench_recover_t *data = (bench_recover_t*)arg;
+
+    for (int i = 0; i < 32; i++) data->msg[i] = 1 + i;
+    for (int i = 0; i < 64; i++) data->sig[i] = 65 + i;
+}
+
+/** Start the library with SECP256K1_START_VERIFY, run 10 timed runs of the benchmark, then stop the library. */
+int main(void) {
+    secp256k1_start(SECP256K1_START_VERIFY);
+
+    bench_recover_t data;
+    run_benchmark(bench_recover, bench_recover_setup, NULL, &data, 10, 20000);
+
+    secp256k1_stop();
+    return 0;
+}
--- a/src/bench_sign.c
+++ b/src/bench_sign.c
@ -3,46 +3,45 @@
 * Distributed under the MIT software license, see the accompanying   *
 * file COPYING or http://www.opensource.org/licenses/mit-license.php.*
 **********************************************************************/
-#include <stdio.h>
-#include <string.h>

 #include "include/secp256k1.h"
 #include "util.h"
+#include "bench.h"

-int main(void) {
-    secp256k1_start(SECP256K1_START_SIGN);
-
+typedef struct {
    unsigned char msg[32];
    unsigned char nonce[32];
    unsigned char key[32];
+} bench_sign_t;

-    for (int i = 0; i < 32; i++) msg[i] = i + 1;
-    for (int i = 0; i < 32; i++) nonce[i] = i + 33;
-    for (int i = 0; i < 32; i++) key[i] = i + 65;
+static void bench_sign_setup(void* arg) {
+    bench_sign_t *data = (bench_sign_t*)arg;

-    unsigned char sig[64];
+    for (int i = 0; i < 32; i++) data->msg[i] = i + 1;
+    for (int i = 0; i < 32; i++) data->nonce[i] = i + 33;
+    for (int i = 0; i < 32; i++) data->key[i] = i + 65;
+}
+
+static void bench_sign(void* arg) {
+    bench_sign_t *data = (bench_sign_t*)arg;

-    for (int i=0; i<1000000; i++) {
+    unsigned char sig[64];
+    for (int i=0; i<20000; i++) {
        int recid = 0;
-        CHECK(secp256k1_ecdsa_sign_compact(msg, 32, sig, key, nonce, &recid));
+        CHECK(secp256k1_ecdsa_sign_compact(data->msg, sig, data->key, data->nonce, &recid));
        for (int j = 0; j < 32; j++) {
-            nonce[j] = key[j];     /* Move former key to nonce  */
-            msg[j] = sig[j];       /* Move former R to message. */
-            key[j] = sig[j + 32];  /* Move former S to key.     */
+            data->nonce[j] = data->key[j];     /* Move former key to nonce  */
+            data->msg[j] = sig[j];             /* Move former R to message. */
+            data->key[j] = sig[j + 32];        /* Move former S to key.     */
        }
    }
+}
+
+int main(void) {
+    secp256k1_start(SECP256K1_START_SIGN);

-    static const unsigned char fini[64] = {
-        0x92, 0x03, 0xef, 0xf1, 0x58, 0x0b, 0x49, 0x8d,
-        0x22, 0x3d, 0x49, 0x0e, 0xbf, 0x26, 0x50, 0x0e,
-        0x2d, 0x62, 0x90, 0xd7, 0x82, 0xbd, 0x3d, 0x5c,
-        0xa9, 0x10, 0xa5, 0x49, 0xb1, 0xd8, 0x8c, 0xc0,
-        0x5b, 0x5e, 0x9e, 0x68, 0x51, 0x3d, 0xe8, 0xec,
-        0x82, 0x30, 0x82, 0x88, 0x8c, 0xfd, 0xe7, 0x71,
-        0x15, 0x92, 0xfc, 0x14, 0x59, 0x78, 0x31, 0xb3,
-        0xf6, 0x07, 0x91, 0x18, 0x00, 0x8d, 0x4c, 0xb2
-    };
-    CHECK(memcmp(sig, fini, 64) == 0);
+    bench_sign_t data;
+    run_benchmark(bench_sign, bench_sign_setup, NULL, &data, 10, 20000);

    secp256k1_stop();
    return 0;
--- a/src/bench_verify.c
+++ b/src/bench_verify.c
@ -9,35 +9,46 @@

 #include "include/secp256k1.h"
 #include "util.h"
+#include "bench.h"

-int main(void) {
-    secp256k1_start(SECP256K1_START_VERIFY);
-
+typedef struct {
    unsigned char msg[32];
-    unsigned char sig[64];
-
-    for (int i = 0; i < 32; i++) msg[i] = 1 + i;
-    for (int i = 0; i < 64; i++) sig[i] = 65 + i;
-
+    unsigned char key[32];
+    unsigned char nonce[32];
+    unsigned char sig[72];
+    int siglen;
    unsigned char pubkey[33];
-    for (int i=0; i<1000000; i++) {
-        int pubkeylen = 33;
-        CHECK(secp256k1_ecdsa_recover_compact(msg, 32, sig, pubkey, &pubkeylen, 1, i % 2));
-        for (int j = 0; j < 32; j++) {
-            sig[j + 32] = msg[j];    /* Move former message to S. */
-            msg[j] = sig[j];         /* Move former R to message. */
-            sig[j] = pubkey[j + 1];  /* Move recovered pubkey X coordinate to R (which must be a valid X coordinate). */
-        }
+    int pubkeylen;
+} benchmark_verify_t;
+
+static void benchmark_verify(void* arg) { /* Timed body: 20000 calls to secp256k1_ecdsa_verify on data's msg, sig and pubkey. */
+    benchmark_verify_t* data = (benchmark_verify_t*)arg;
+
+    for (int i=0; i<20000; i++) {
+        data->sig[data->siglen - 1] ^= (i & 0xFF); /* XOR the loop counter into the last three signature bytes. */
+        data->sig[data->siglen - 2] ^= ((i >> 8) & 0xFF);
+        data->sig[data->siglen - 3] ^= ((i >> 16) & 0xFF);
+        CHECK(secp256k1_ecdsa_verify(data->msg, data->sig, data->siglen, data->pubkey, data->pubkeylen) == (i == 0)); /* Only i == 0 leaves the signature unmodified, so only that round is expected to verify. */
+        data->sig[data->siglen - 1] ^= (i & 0xFF); /* Apply the same XORs again to restore the original signature bytes. */
+        data->sig[data->siglen - 2] ^= ((i >> 8) & 0xFF);
+        data->sig[data->siglen - 3] ^= ((i >> 16) & 0xFF);
     }
+}
+
+int main(void) {
+    secp256k1_start(SECP256K1_START_VERIFY | SECP256K1_START_SIGN);
+
+    benchmark_verify_t data;
+
+    for (int i = 0; i < 32; i++) data.msg[i] = 1 + i;
+    for (int i = 0; i < 32; i++) data.key[i] = 33 + i;
+    for (int i = 0; i < 32; i++) data.nonce[i] = 65 + i;
+    data.siglen = 72;
+    CHECK(secp256k1_ecdsa_sign(data.msg, data.sig, &data.siglen, data.key, data.nonce));
+    data.pubkeylen = 33;
+    CHECK(secp256k1_ec_pubkey_create(data.pubkey, &data.pubkeylen, data.key, 1));

-    static const unsigned char fini[33] = {
-        0x02,
-        0x52, 0x63, 0xae, 0x9a, 0x9d, 0x47, 0x1f, 0x1a,
-        0xb2, 0x36, 0x65, 0x89, 0x11, 0xe7, 0xcc, 0x86,
-        0xa3, 0xab, 0x97, 0xb6, 0xf1, 0xaf, 0xfd, 0x8f,
-        0x9b, 0x38, 0xb6, 0x18, 0x55, 0xe5, 0xc2, 0x43
-    };
-    CHECK(memcmp(fini, pubkey, 33) == 0);
+    run_benchmark(benchmark_verify, NULL, NULL, &data, 10, 20000);

    secp256k1_stop();
    return 0;
--- a/src/ecdsa_impl.h
+++ b/src/ecdsa_impl.h
@ -27,7 +27,7 @@ static void secp256k1_ecdsa_start(void) {
        return;

    /* Allocate. */
-    secp256k1_ecdsa_consts_t *ret = (secp256k1_ecdsa_consts_t*)malloc(sizeof(secp256k1_ecdsa_consts_t));
+    secp256k1_ecdsa_consts_t *ret = (secp256k1_ecdsa_consts_t*)checked_malloc(sizeof(secp256k1_ecdsa_consts_t));

    static const unsigned char order[] = {
        0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
@ -38,7 +38,7 @@ static void secp256k1_ecdsa_start(void) {

    secp256k1_fe_set_b32(&ret->order_as_fe, order);
    secp256k1_fe_negate(&ret->p_minus_order, &ret->order_as_fe, 1);
-    secp256k1_fe_normalize(&ret->p_minus_order);
+    secp256k1_fe_normalize_var(&ret->p_minus_order);

    /* Set the global pointer. */
    secp256k1_ecdsa_consts = ret;
@ -122,7 +122,7 @@ static int secp256k1_ecdsa_sig_recompute(secp256k1_scalar_t *r2, const secp256k1
    secp256k1_gej_t pr; secp256k1_ecmult(&pr, &pubkeyj, &u2, &u1);
    if (!secp256k1_gej_is_infinity(&pr)) {
        secp256k1_fe_t xr; secp256k1_gej_get_x_var(&xr, &pr);
-        secp256k1_fe_normalize(&xr);
+        secp256k1_fe_normalize_var(&xr);
        unsigned char xrb[32]; secp256k1_fe_get_b32(xrb, &xr);
        secp256k1_scalar_set_b32(r2, xrb, NULL);
        ret = 1;
@ -144,7 +144,7 @@ static int secp256k1_ecdsa_sig_recover(const secp256k1_ecdsa_sig_t *sig, secp256
        secp256k1_fe_add(&fx, &secp256k1_ecdsa_consts->order_as_fe);
    }
    secp256k1_ge_t x;
-    if (!secp256k1_ge_set_xo(&x, &fx, recid & 1))
+    if (!secp256k1_ge_set_xo_var(&x, &fx, recid & 1))
        return 0;
    secp256k1_gej_t xj;
    secp256k1_gej_set_ge(&xj, &x);
--- a/src/eckey_impl.h
+++ b/src/eckey_impl.h
@ -17,7 +17,7 @@
 static int secp256k1_eckey_pubkey_parse(secp256k1_ge_t *elem, const unsigned char *pub, int size) {
    if (size == 33 && (pub[0] == 0x02 || pub[0] == 0x03)) {
        secp256k1_fe_t x;
-        return secp256k1_fe_set_b32(&x, pub+1) && secp256k1_ge_set_xo(elem, &x, pub[0] == 0x03);
+        return secp256k1_fe_set_b32(&x, pub+1) && secp256k1_ge_set_xo_var(elem, &x, pub[0] == 0x03);
    } else if (size == 65 && (pub[0] == 0x04 || pub[0] == 0x06 || pub[0] == 0x07)) {
        secp256k1_fe_t x, y;
        if (!secp256k1_fe_set_b32(&x, pub+1) || !secp256k1_fe_set_b32(&y, pub+33)) {
@ -26,7 +26,7 @@ static int secp256k1_eckey_pubkey_parse(secp256k1_ge_t *elem, const unsigned cha
        secp256k1_ge_set_xy(elem, &x, &y);
        if ((pub[0] == 0x06 || pub[0] == 0x07) && secp256k1_fe_is_odd(&y) != (pub[0] == 0x07))
            return 0;
-        return secp256k1_ge_is_valid(elem);
+        return secp256k1_ge_is_valid_var(elem);
    } else {
        return 0;
    }
@ -36,8 +36,8 @@ static int secp256k1_eckey_pubkey_serialize(secp256k1_ge_t *elem, unsigned char
    if (secp256k1_ge_is_infinity(elem)) {
        return 0;
    }
-    secp256k1_fe_normalize(&elem->x);
-    secp256k1_fe_normalize(&elem->y);
+    secp256k1_fe_normalize_var(&elem->x);
+    secp256k1_fe_normalize_var(&elem->y);
    secp256k1_fe_get_b32(&pub[1], &elem->x);
    if (compressed) {
        *size = 33;
--- a/src/ecmult_gen_impl.h
+++ b/src/ecmult_gen_impl.h
@ -34,7 +34,7 @@ static void secp256k1_ecmult_gen_start(void) {
        return;

    /* Allocate the precomputation table. */
-    secp256k1_ecmult_gen_consts_t *ret = (secp256k1_ecmult_gen_consts_t*)malloc(sizeof(secp256k1_ecmult_gen_consts_t));
+    secp256k1_ecmult_gen_consts_t *ret = (secp256k1_ecmult_gen_consts_t*)checked_malloc(sizeof(secp256k1_ecmult_gen_consts_t));

    /* get the generator */
    const secp256k1_ge_t *g = &secp256k1_ge_consts->g;
@ -47,7 +47,7 @@ static void secp256k1_ecmult_gen_start(void) {
        secp256k1_fe_t nums_x;
        VERIFY_CHECK(secp256k1_fe_set_b32(&nums_x, nums_b32));
        secp256k1_ge_t nums_ge;
-        VERIFY_CHECK(secp256k1_ge_set_xo(&nums_ge, &nums_x, 0));
+        VERIFY_CHECK(secp256k1_ge_set_xo_var(&nums_ge, &nums_x, 0));
        secp256k1_gej_set_ge(&nums_gej, &nums_ge);
        /* Add G to make the bits in x uniformly distributed. */
        secp256k1_gej_add_ge_var(&nums_gej, &nums_gej, g);
@ -73,7 +73,7 @@ static void secp256k1_ecmult_gen_start(void) {
            secp256k1_gej_double_var(&numsbase, &numsbase);
            if (j == 62) {
                /* In the last iteration, numsbase is (1 - 2^j) * nums instead. */
-                secp256k1_gej_neg(&numsbase, &numsbase);
+                secp256k1_gej_neg_var(&numsbase, &numsbase);
                secp256k1_gej_add_var(&numsbase, &numsbase, &nums_gej);
            }
        }
--- a/src/ecmult_impl.h
+++ b/src/ecmult_impl.h
@ -15,11 +15,13 @@
 #define WINDOW_A 5

 /** larger numbers may result in slightly better performance, at the cost of
-    exponentially larger precomputed tables. WINDOW_G == 14 results in 640 KiB. */
+    exponentially larger precomputed tables. */
 #ifdef USE_ENDOMORPHISM
-#define WINDOW_G 14
-#else
+/** Two tables for window size 15: 1.375 MiB. */
 #define WINDOW_G 15
+#else
+/** One table for window size 16: 1.375 MiB. */
+#define WINDOW_G 16
 #endif

 /** Fill a table 'pre' with precomputed odd multiples of a. W determines the size of the table.
@ -43,13 +45,14 @@ static void secp256k1_ecmult_table_precomp_gej_var(secp256k1_gej_t *pre, const s

 static void secp256k1_ecmult_table_precomp_ge_var(secp256k1_ge_t *pre, const secp256k1_gej_t *a, int w) {
    const int table_size = 1 << (w-2);
-    secp256k1_gej_t prej[table_size];
+    secp256k1_gej_t *prej = checked_malloc(sizeof(secp256k1_gej_t) * table_size);
    prej[0] = *a;
    secp256k1_gej_t d; secp256k1_gej_double_var(&d, a);
    for (int i=1; i<table_size; i++) {
        secp256k1_gej_add_var(&prej[i], &d, &prej[i-1]);
    }
    secp256k1_ge_set_all_gej_var(table_size, pre, prej);
+    free(prej);
 }

 /** The number of entries a table with precomputed multiples needs to have. */
@ -67,8 +70,8 @@ static void secp256k1_ecmult_table_precomp_ge_var(secp256k1_ge_t *pre, const sec
        (neg)((r), &(pre)[(-(n)-1)/2]); \
 } while(0)

-#define ECMULT_TABLE_GET_GEJ(r,pre,n,w) ECMULT_TABLE_GET((r),(pre),(n),(w),secp256k1_gej_neg)
-#define ECMULT_TABLE_GET_GE(r,pre,n,w)  ECMULT_TABLE_GET((r),(pre),(n),(w),secp256k1_ge_neg)
+#define ECMULT_TABLE_GET_GEJ(r,pre,n,w) ECMULT_TABLE_GET((r),(pre),(n),(w),secp256k1_gej_neg_var)
+#define ECMULT_TABLE_GET_GE(r,pre,n,w)  ECMULT_TABLE_GET((r),(pre),(n),(w),secp256k1_ge_neg_var)

 typedef struct {
    /* For accelerating the computation of a*P + b*G: */
@ -85,7 +88,7 @@ static void secp256k1_ecmult_start(void) {
        return;

    /* Allocate the precomputation table. */
-    secp256k1_ecmult_consts_t *ret = (secp256k1_ecmult_consts_t*)malloc(sizeof(secp256k1_ecmult_consts_t));
+    secp256k1_ecmult_consts_t *ret = (secp256k1_ecmult_consts_t*)checked_malloc(sizeof(secp256k1_ecmult_consts_t));

    /* get the generator */
    const secp256k1_ge_t *g = &secp256k1_ge_consts->g;
--- a/src/field.h
+++ b/src/field.h
@ -50,6 +50,9 @@ static void secp256k1_fe_stop(void);
 /** Normalize a field element. */
 static void secp256k1_fe_normalize(secp256k1_fe_t *r);

+/** Normalize a field element, without constant-time guarantee. */
+static void secp256k1_fe_normalize_var(secp256k1_fe_t *r);
+
 /** Set a field element equal to a small integer. Resulting field element is normalized. */
 static void secp256k1_fe_set_int(secp256k1_fe_t *r, int a);

@ -93,7 +96,7 @@ static void secp256k1_fe_sqr(secp256k1_fe_t *r, const secp256k1_fe_t *a);
 /** Sets a field element to be the (modular) square root (if any exist) of another. Requires the
 *  input's magnitude to be at most 8. The output magnitude is 1 (but not guaranteed to be
 *  normalized). Return value indicates whether a square root was found. */
-static int secp256k1_fe_sqrt(secp256k1_fe_t *r, const secp256k1_fe_t *a);
+static int secp256k1_fe_sqrt_var(secp256k1_fe_t *r, const secp256k1_fe_t *a);

 /** Sets a field element to be the (modular) inverse of another. Requires the input's magnitude to be
 *  at most 8. The output magnitude is 1 (but not guaranteed to be normalized). */
@ -105,9 +108,6 @@ static void secp256k1_fe_inv_var(secp256k1_fe_t *r, const secp256k1_fe_t *a);
 /** Calculate the (modular) inverses of a batch of field elements. Requires the inputs' magnitudes to be
 *  at most 8. The output magnitudes are 1 (but not guaranteed to be normalized). The inputs and
 *  outputs must not overlap in memory. */
-static void secp256k1_fe_inv_all(size_t len, secp256k1_fe_t r[len], const secp256k1_fe_t a[len]);
-
-/** Potentially faster version of secp256k1_fe_inv_all, without constant-time guarantee. */
 static void secp256k1_fe_inv_all_var(size_t len, secp256k1_fe_t r[len], const secp256k1_fe_t a[len]);

 /** Convert a field element to a hexadecimal string. */
--- a/src/field_10x26_impl.h
+++ b/src/field_10x26_impl.h
@ -103,6 +103,62 @@ static void secp256k1_fe_normalize(secp256k1_fe_t *r) {
 #endif
 }

+static void secp256k1_fe_normalize_var(secp256k1_fe_t *r) { /* normalizes r in place; not constant-time because of the branch on x below */
+    uint32_t t0 = r->n[0], t1 = r->n[1], t2 = r->n[2], t3 = r->n[3], t4 = r->n[4],
+             t5 = r->n[5], t6 = r->n[6], t7 = r->n[7], t8 = r->n[8], t9 = r->n[9];
+
+    /* Reduce t9 at the start so there will be at most a single carry from the first pass */
+    uint32_t x = t9 >> 22; t9 &= 0x03FFFFFUL;
+    uint32_t m; /* running AND of the masked limbs t2..t8; equals 0x3FFFFFF only if each of them is all-ones */
+
+    /* The first pass ensures the magnitude is 1, ... */
+    t0 += x * 0x3D1UL; t1 += (x << 6); /* together: adds x * 0x1000003D1 (limb 1 has weight 2^26, so x << 6 there is x * 2^32) */
+    t1 += (t0 >> 26); t0 &= 0x3FFFFFFUL;
+    t2 += (t1 >> 26); t1 &= 0x3FFFFFFUL;
+    t3 += (t2 >> 26); t2 &= 0x3FFFFFFUL; m = t2;
+    t4 += (t3 >> 26); t3 &= 0x3FFFFFFUL; m &= t3;
+    t5 += (t4 >> 26); t4 &= 0x3FFFFFFUL; m &= t4;
+    t6 += (t5 >> 26); t5 &= 0x3FFFFFFUL; m &= t5;
+    t7 += (t6 >> 26); t6 &= 0x3FFFFFFUL; m &= t6;
+    t8 += (t7 >> 26); t7 &= 0x3FFFFFFUL; m &= t7;
+    t9 += (t8 >> 26); t8 &= 0x3FFFFFFUL; m &= t8;
+
+    /* ... except for a possible carry at bit 22 of t9 (i.e. bit 256 of the field element) */
+    VERIFY_CHECK(t9 >> 23 == 0);
+
+    /* At most a single final reduction is needed; check if the value is >= the field characteristic */
+    x = (t9 >> 22) | ((t9 == 0x03FFFFFUL) & (m == 0x3FFFFFFUL)
+        & ((t1 + 0x40UL + ((t0 + 0x3D1UL) >> 26)) > 0x3FFFFFFUL)); /* last term: would adding 0x3D1 to t0 and 0x40 to t1 carry out of t1? */
+
+    if (x) { /* data-dependent branch: the second pass only runs when a reduction is needed */
+        t0 += 0x3D1UL; t1 += (x << 6); /* x is 0 or 1 given the check on t9 above, so it is 1 here */
+        t1 += (t0 >> 26); t0 &= 0x3FFFFFFUL;
+        t2 += (t1 >> 26); t1 &= 0x3FFFFFFUL;
+        t3 += (t2 >> 26); t2 &= 0x3FFFFFFUL;
+        t4 += (t3 >> 26); t3 &= 0x3FFFFFFUL;
+        t5 += (t4 >> 26); t4 &= 0x3FFFFFFUL;
+        t6 += (t5 >> 26); t5 &= 0x3FFFFFFUL;
+        t7 += (t6 >> 26); t6 &= 0x3FFFFFFUL;
+        t8 += (t7 >> 26); t7 &= 0x3FFFFFFUL;
+        t9 += (t8 >> 26); t8 &= 0x3FFFFFFUL;
+
+        /* If t9 didn't carry to bit 22 already, then it should have after any final reduction */
+        VERIFY_CHECK(t9 >> 22 == x);
+
+        /* Mask off the possible multiple of 2^256 from the final reduction */
+        t9 &= 0x03FFFFFUL;
+    }
+
+    r->n[0] = t0; r->n[1] = t1; r->n[2] = t2; r->n[3] = t3; r->n[4] = t4;
+    r->n[5] = t5; r->n[6] = t6; r->n[7] = t7; r->n[8] = t8; r->n[9] = t9;
+
+#ifdef VERIFY
+    r->magnitude = 1;
+    r->normalized = 1;
+    secp256k1_fe_verify(r);
+#endif
+}
+
 SECP256K1_INLINE static void secp256k1_fe_set_int(secp256k1_fe_t *r, int a) {
    r->n[0] = a;
    r->n[1] = r->n[2] = r->n[3] = r->n[4] = r->n[5] = r->n[6] = r->n[7] = r->n[8] = r->n[9] = 0;
@ -271,7 +327,7 @@ SECP256K1_INLINE static void secp256k1_fe_add(secp256k1_fe_t *r, const secp256k1
 #define VERIFY_BITS(x, n) do { } while(0)
 #endif

-SECP256K1_INLINE static void secp256k1_fe_mul_inner(const uint32_t *a, const uint32_t * SECP256K1_RESTRICT b, uint32_t *r) {
+SECP256K1_INLINE static void secp256k1_fe_mul_inner(uint32_t *r, const uint32_t *a, const uint32_t * SECP256K1_RESTRICT b) {
    VERIFY_BITS(a[0], 30);
    VERIFY_BITS(a[1], 30);
    VERIFY_BITS(a[2], 30);
@ -598,7 +654,7 @@ SECP256K1_INLINE static void secp256k1_fe_mul_inner(const uint32_t *a, const uin
    /* [r9 r8 r7 r6 r5 r4 r3 r2 r1 r0] = [p18 p17 p16 p15 p14 p13 p12 p11 p10 p9 p8 p7 p6 p5 p4 p3 p2 p1 p0] */
 }

-SECP256K1_INLINE static void secp256k1_fe_sqr_inner(const uint32_t *a, uint32_t *r) {
+SECP256K1_INLINE static void secp256k1_fe_sqr_inner(uint32_t *r, const uint32_t *a) {
    VERIFY_BITS(a[0], 30);
    VERIFY_BITS(a[1], 30);
    VERIFY_BITS(a[2], 30);
@ -879,7 +935,7 @@ static void secp256k1_fe_mul(secp256k1_fe_t *r, const secp256k1_fe_t *a, const s
    secp256k1_fe_verify(b);
    VERIFY_CHECK(r != b);
 #endif
-    secp256k1_fe_mul_inner(a->n, b->n, r->n);
+    secp256k1_fe_mul_inner(r->n, a->n, b->n);
 #ifdef VERIFY
    r->magnitude = 1;
    r->normalized = 0;
@ -892,7 +948,7 @@ static void secp256k1_fe_sqr(secp256k1_fe_t *r, const secp256k1_fe_t *a) {
    VERIFY_CHECK(a->magnitude <= 8);
    secp256k1_fe_verify(a);
 #endif
-    secp256k1_fe_sqr_inner(a->n, r->n);
+    secp256k1_fe_sqr_inner(r->n, a->n);
 #ifdef VERIFY
    r->magnitude = 1;
    r->normalized = 0;
--- a/src/field_5x52_asm.asm
+++ b/src/field_5x52_asm.asm
@ -1,469 +0,0 @@
-	;; Added by Diederik Huys, March 2013
-	;;
-	;; Provided public procedures:
-	;; 	secp256k1_fe_mul_inner
-	;; 	secp256k1_fe_sqr_inner
-	;;
-	;; Needed tools: YASM (http://yasm.tortall.net)
-	;;
-	;; 
-
-	BITS 64
-
-%ifidn   __OUTPUT_FORMAT__,macho64
-%define SYM(x) _ %+ x
-%else
-%define SYM(x) x
-%endif
-
-	;;  Procedure ExSetMult
-	;;  Register Layout:
-	;;  INPUT: 	rdi	= a->n
-	;; 	   	rsi  	= b->n
-	;; 	   	rdx  	= r->a
-	;; 
-	;;  INTERNAL:	rdx:rax  = multiplication accumulator
-	;; 		r9:r8    = c
-	;; 		r10-r13  = t0-t3
-	;; 		r14	 = b.n[0] / t4
-	;; 		r15	 = b.n[1] / t5
-	;; 		rbx	 = b.n[2] / t6
-	;; 		rcx	 = b.n[3] / t7
-	;; 		rbp	 = Constant 0FFFFFFFFFFFFFh / t8
-	;; 		rsi	 = b.n / b.n[4] / t9
-
-	GLOBAL SYM(secp256k1_fe_mul_inner)
-	ALIGN 32
-SYM(secp256k1_fe_mul_inner):
-	push rbp
-	push rbx
-	push r12
-	push r13
-	push r14
-	push r15
-	push rdx
-	mov r14,[rsi+8*0]	; preload b.n[0]. This will be the case until
-				; b.n[0] is no longer needed, then we reassign
-				; r14 to t4
-	;; c=a.n[0] * b.n[0]
-   	mov rax,[rdi+0*8]	; load a.n[0]
-	mov rbp,0FFFFFFFFFFFFFh
-	mul r14			; rdx:rax=a.n[0]*b.n[0]
-	mov r15,[rsi+1*8]
-	mov r10,rbp		; load modulus into target register for t0
-	mov r8,rax
-	and r10,rax		; only need lower qword of c
-	shrd r8,rdx,52
-	xor r9,r9		; c < 2^64, so we ditch the HO part 
-
-	;; c+=a.n[0] * b.n[1] + a.n[1] * b.n[0]
-	mov rax,[rdi+0*8]
-	mul r15			
-	add r8,rax
-	adc r9,rdx
-
-	mov rax,[rdi+1*8]
-	mul r14			
-	mov r11,rbp
-	mov rbx,[rsi+2*8]
-	add r8,rax
-	adc r9,rdx
-	and r11,r8
-	shrd r8,r9,52
-	xor r9,r9
-	
-	;; c+=a.n[0 1 2] * b.n[2 1 0]
-	mov rax,[rdi+0*8]
-	mul rbx			
-	add r8,rax
-	adc r9,rdx
-
-	mov rax,[rdi+1*8]
-	mul r15			
-	add r8,rax
-	adc r9,rdx
-
-	mov rax,[rdi+2*8]
-	mul r14
-	mov r12,rbp		
-	mov rcx,[rsi+3*8]
-	add r8,rax
-	adc r9,rdx
-	and r12,r8		
-	shrd r8,r9,52
-	xor r9,r9		
-
-	;; c+=a.n[0 1 2 3] * b.n[3 2 1 0]
-	mov rax,[rdi+0*8]
-	mul rcx			
-	add r8,rax
-	adc r9,rdx
-
-	mov rax,[rdi+1*8]
-	mul rbx			
-	add r8,rax
-	adc r9,rdx
-
-	mov rax,[rdi+2*8]
-	mul r15			
-	add r8,rax
-	adc r9,rdx
-	
-	mov rax,[rdi+3*8]
-	mul r14			
-	mov r13,rbp             
-	mov rsi,[rsi+4*8]	; load b.n[4] and destroy pointer
-	add r8,rax
-	adc r9,rdx
-	and r13,r8
-
-	shrd r8,r9,52
-	xor r9,r9		
-
-
-	;; c+=a.n[0 1 2 3 4] * b.n[4 3 2 1 0]
-	mov rax,[rdi+0*8]
-	mul rsi
-	add r8,rax
-	adc r9,rdx
-
-	mov rax,[rdi+1*8]
-	mul rcx
-	add r8,rax
-	adc r9,rdx
-
-	mov rax,[rdi+2*8]
-	mul rbx			
-	add r8,rax
-	adc r9,rdx
-
-	mov rax,[rdi+3*8]
-	mul r15			
-	add r8,rax
-	adc r9,rdx
-
-	mov rax,[rdi+4*8]
-	mul r14			
-	mov r14,rbp             ; load modulus into t4 and destroy a.n[0]
-	add r8,rax
-	adc r9,rdx
-	and r14,r8
-	shrd r8,r9,52
-	xor r9,r9		
-
-	;; c+=a.n[1 2 3 4] * b.n[4 3 2 1]
-	mov rax,[rdi+1*8]
-	mul rsi
-	add r8,rax
-	adc r9,rdx
-
-	mov rax,[rdi+2*8]
-	mul rcx
-	add r8,rax
-	adc r9,rdx
-
-	mov rax,[rdi+3*8]
-	mul rbx
-	add r8,rax
-	adc r9,rdx
-
-	mov rax,[rdi+4*8]
-	mul r15
-	mov r15,rbp		
-	add r8,rax
-	adc r9,rdx
-
-	and r15,r8
-	shrd r8,r9,52
-	xor r9,r9		
-
-	;; c+=a.n[2 3 4] * b.n[4 3 2]
-	mov rax,[rdi+2*8]
-	mul rsi
-	add r8,rax
-	adc r9,rdx
-
-	mov rax,[rdi+3*8]
-	mul rcx
-	add r8,rax
-	adc r9,rdx
-
-	mov rax,[rdi+4*8]
-	mul rbx
-	mov rbx,rbp		
-	add r8,rax
-	adc r9,rdx
-
-	and rbx,r8		
-	shrd r8,r9,52
-	xor r9,r9		
-
-	;; c+=a.n[3 4] * b.n[4 3]
-	mov rax,[rdi+3*8]
-	mul rsi
-	add r8,rax
-	adc r9,rdx
-
-	mov rax,[rdi+4*8]
-	mul rcx
-	mov rcx,rbp		
-	add r8,rax
-	adc r9,rdx
-	and rcx,r8		
-	shrd r8,r9,52
-	xor r9,r9		
-
-	;; c+=a.n[4] * b.n[4]
-	mov rax,[rdi+4*8]
-	mul rsi
-	;; mov rbp,rbp		; modulus already there!
-	add r8,rax
-	adc r9,rdx
-	and rbp,r8 
-	shrd r8,r9,52
-	xor r9,r9		
-
-	mov rsi,r8		; load c into t9 and destroy b.n[4]
-
-	;; *******************************************************
-common_exit_norm:
-	mov rdi,01000003D10h	; load constant
-
-	mov rax,r15		; get t5
-	mul rdi
-	add rax,r10    		; +t0
-	adc rdx,0
-	mov r10,0FFFFFFFFFFFFFh ; modulus. Sadly, we ran out of registers!
-	mov r8,rax		; +c
-	and r10,rax
-	shrd r8,rdx,52
-	xor r9,r9
-
-	mov rax,rbx		; get t6
-	mul rdi
-	add rax,r11		; +t1
-	adc rdx,0
-	mov r11,0FFFFFFFFFFFFFh ; modulus
-	add r8,rax		; +c
-	adc r9,rdx
-	and r11,r8
-	shrd r8,r9,52
-	xor r9,r9
-
-	mov rax,rcx    		; get t7
-	mul rdi
-	add rax,r12		; +t2
-	adc rdx,0
-	pop rbx			; retrieve pointer to this.n	
-	mov r12,0FFFFFFFFFFFFFh ; modulus
-	add r8,rax		; +c
-	adc r9,rdx
-	and r12,r8
-	mov [rbx+2*8],r12	; mov into this.n[2]
-	shrd r8,r9,52
-	xor r9,r9
-	
-	mov rax,rbp    		; get t8
-	mul rdi
-	add rax,r13    		; +t3
-	adc rdx,0
-	mov r13,0FFFFFFFFFFFFFh ; modulus
-	add r8,rax		; +c
-	adc r9,rdx
-	and r13,r8
-	mov [rbx+3*8],r13	; -> this.n[3]
-	shrd r8,r9,52
-	xor r9,r9
-	
-	mov rax,rsi    		; get t9
-	mul rdi
-	add rax,r14    		; +t4
-	adc rdx,0
-	mov r14,0FFFFFFFFFFFFh	; !!!
-	add r8,rax		; +c
-	adc r9,rdx
-	and r14,r8
-	mov [rbx+4*8],r14	; -> this.n[4]
-	shrd r8,r9,48		; !!!
-	xor r9,r9
-	
-	mov rax,01000003D1h
-	mul r8		
-	add rax,r10
-	adc rdx,0
-	mov r10,0FFFFFFFFFFFFFh ; modulus
-	mov r8,rax
-	and rax,r10
-	shrd r8,rdx,52
-	mov [rbx+0*8],rax	; -> this.n[0]
-	add r8,r11
-	mov [rbx+1*8],r8	; -> this.n[1]
-
-	pop r15
-	pop r14
-	pop r13
-	pop r12
-	pop rbx
-	pop rbp
-	ret
-
-	
-	;;  PROC ExSetSquare
-	;;  Register Layout:
-	;;  INPUT: 	rdi	 = a.n
-	;; 	   	rsi  	 = this.a
-	;;  INTERNAL:	rdx:rax  = multiplication accumulator
-	;; 		r9:r8    = c
-	;; 		r10-r13  = t0-t3
-	;; 		r14	 = a.n[0] / t4
-	;; 		r15	 = a.n[1] / t5
-	;; 		rbx	 = a.n[2] / t6
-	;; 		rcx	 = a.n[3] / t7
-	;; 		rbp	 = 0FFFFFFFFFFFFFh / t8
-	;; 		rsi	 = a.n[4] / t9
-	GLOBAL SYM(secp256k1_fe_sqr_inner)
-	ALIGN 32
-SYM(secp256k1_fe_sqr_inner):
-	push rbp
-	push rbx
-	push r12
-	push r13
-	push r14
-	push r15
-	push rsi
-	mov rbp,0FFFFFFFFFFFFFh
-	
-	;; c=a.n[0] * a.n[0]
-   	mov r14,[rdi+0*8]	; r14=a.n[0]
-	mov r10,rbp		; modulus 
-	mov rax,r14
-	mul rax
-	mov r15,[rdi+1*8]	; a.n[1]
-	add r14,r14		; r14=2*a.n[0]
-	mov r8,rax
-	and r10,rax		; only need lower qword
-	shrd r8,rdx,52
-	xor r9,r9
-
-	;; c+=2*a.n[0] * a.n[1]
-	mov rax,r14		; r14=2*a.n[0]
-	mul r15
-	mov rbx,[rdi+2*8]	; rbx=a.n[2]
-	mov r11,rbp 		; modulus
-	add r8,rax
-	adc r9,rdx
-	and r11,r8
-	shrd r8,r9,52
-	xor r9,r9
-	
-	;; c+=2*a.n[0]*a.n[2]+a.n[1]*a.n[1]
-	mov rax,r14
-	mul rbx
-	add r8,rax
-	adc r9,rdx
-
-	mov rax,r15
-	mov r12,rbp		; modulus
-	mul rax
-	mov rcx,[rdi+3*8]	; rcx=a.n[3]
-	add r15,r15		; r15=a.n[1]*2
-	add r8,rax
-	adc r9,rdx
-	and r12,r8		; only need lower dword
-	shrd r8,r9,52
-	xor r9,r9		
-
-	;; c+=2*a.n[0]*a.n[3]+2*a.n[1]*a.n[2]
-	mov rax,r14
-	mul rcx
-	add r8,rax
-	adc r9,rdx
-
-	mov rax,r15		; rax=2*a.n[1]
-	mov r13,rbp		; modulus
-	mul rbx
-	mov rsi,[rdi+4*8]	; rsi=a.n[4]
-	add r8,rax
-	adc r9,rdx
-	and r13,r8
-	shrd r8,r9,52
-	xor r9,r9		
-
-	;; c+=2*a.n[0]*a.n[4]+2*a.n[1]*a.n[3]+a.n[2]*a.n[2]
-	mov rax,r14		; last time we need 2*a.n[0]
-	mul rsi
-	add r8,rax
-	adc r9,rdx
-
-	mov rax,r15
-	mul rcx
-	mov r14,rbp		; modulus
-	add r8,rax
-	adc r9,rdx
-
-	mov rax,rbx
-	mul rax
-	add rbx,rbx		; rcx=2*a.n[2]
-	add r8,rax
-	adc r9,rdx
-	and r14,r8
-	shrd r8,r9,52
-	xor r9,r9		
-
-	;; c+=2*a.n[1]*a.n[4]+2*a.n[2]*a.n[3]
-	mov rax,r15		; last time we need 2*a.n[1]
-	mul rsi
-	add r8,rax
-	adc r9,rdx
-
-	mov rax,rbx
-	mul rcx
-	mov r15,rbp		; modulus
-	add r8,rax
-	adc r9,rdx
-	and r15,r8
-	shrd r8,r9,52
-	xor r9,r9		
-
-	;; c+=2*a.n[2]*a.n[4]+a.n[3]*a.n[3]
-	mov rax,rbx		; last time we need 2*a.n[2]
-	mul rsi
-	add r8,rax
-	adc r9,rdx
-
-	mov rax,rcx		; a.n[3]
-	mul rax
-	mov rbx,rbp		; modulus
-	add r8,rax
-	adc r9,rdx
-	and rbx,r8		; only need lower dword
-	lea rax,[2*rcx]
-	shrd r8,r9,52
-	xor r9,r9		
-
-	;; c+=2*a.n[3]*a.n[4]
-	mul rsi
-	mov rcx,rbp		; modulus
-	add r8,rax
-	adc r9,rdx
-	and rcx,r8		; only need lower dword
-	shrd r8,r9,52
-	xor r9,r9		
-
-	;; c+=a.n[4]*a.n[4]
-	mov rax,rsi
-	mul rax
-	;; mov rbp,rbp		; modulus is already there!
-	add r8,rax
-	adc r9,rdx
-	and rbp,r8 
-	shrd r8,r9,52
-	xor r9,r9		
-
-	mov rsi,r8
-
-	;; *******************************************************
-	jmp common_exit_norm
-	end
-
-	
--- a/src/field_5x52_asm_impl.h
+++ b/src/field_5x52_asm_impl.h
@ -1,13 +1,502 @@
 /**********************************************************************
- * Copyright (c) 2013 Pieter Wuille                                   *
+ * Copyright (c) 2013-2014 Diederik Huys, Pieter Wuille               *
 * Distributed under the MIT software license, see the accompanying   *
 * file COPYING or http://www.opensource.org/licenses/mit-license.php.*
 **********************************************************************/

+/**
+ * Changelog:
+ * - March 2013, Diederik Huys:    original version
+ * - November 2014, Pieter Wuille: updated to use Peter Dettman's parallel multiplication algorithm
+ * - December 2014, Pieter Wuille: converted from YASM to GCC inline assembly
+ */
+
 #ifndef _SECP256K1_FIELD_INNER5X52_IMPL_H_
 #define _SECP256K1_FIELD_INNER5X52_IMPL_H_

-void __attribute__ ((sysv_abi)) secp256k1_fe_mul_inner(const uint64_t *a, const uint64_t *b, uint64_t *r);
-void __attribute__ ((sysv_abi)) secp256k1_fe_sqr_inner(const uint64_t *a, uint64_t *r);
+SECP256K1_INLINE static void secp256k1_fe_mul_inner(uint64_t *r, const uint64_t *a, const uint64_t * SECP256K1_RESTRICT b) {
+/** Multiplies the 5-word inputs a and b into the 5-word output r. a is copied into registers up front; b is re-read from memory after r[0] has been stored, so r must not alias b.
+ * Registers: rdx:rax = multiplication accumulator
+ *            r9:r8   = c
+ *            r15:rcx = d
+ *            r10-r14 = a0-a4
+ *            rbx     = b
+ *            rdi     = r
+ *            rsi     = a / t?
+ * In the step comments below: M = 0xfffffffffffff, R = 0x1000003d10. */
+  uint64_t tmp1, tmp2, tmp3; /* spill slots written and read back by the asm: tmp1 = t3, tmp2 = masked t4, tmp3 = tx */
+__asm__ __volatile__(
+    "movq 0(%%rsi),%%r10\n"
+    "movq 8(%%rsi),%%r11\n"
+    "movq 16(%%rsi),%%r12\n"
+    "movq 24(%%rsi),%%r13\n"
+    "movq 32(%%rsi),%%r14\n"
+
+    /* d = a3 * b0 */
+    "movq 0(%%rbx),%%rax\n"
+    "mulq %%r13\n"
+    "movq %%rax,%%rcx\n"
+    "movq %%rdx,%%r15\n"
+    /* d += a2 * b1 */
+    "movq 8(%%rbx),%%rax\n"
+    "mulq %%r12\n"
+    "addq %%rax,%%rcx\n"
+    "adcq %%rdx,%%r15\n"
+    /* d += a1 * b2 */
+    "movq 16(%%rbx),%%rax\n"
+    "mulq %%r11\n"
+    "addq %%rax,%%rcx\n"
+    "adcq %%rdx,%%r15\n"
+    /* d += a0 * b3 */
+    "movq 24(%%rbx),%%rax\n"
+    "mulq %%r10\n"
+    "addq %%rax,%%rcx\n"
+    "adcq %%rdx,%%r15\n"
+    /* c = a4 * b4 */
+    "movq 32(%%rbx),%%rax\n"
+    "mulq %%r14\n"
+    "movq %%rax,%%r8\n"
+    "movq %%rdx,%%r9\n"
+    /* d += (c & M) * R */
+    "movq $0xfffffffffffff,%%rdx\n"
+    "andq %%rdx,%%rax\n"
+    "movq $0x1000003d10,%%rdx\n"
+    "mulq %%rdx\n"
+    "addq %%rax,%%rcx\n"
+    "adcq %%rdx,%%r15\n"
+    /* c >>= 52 (%%r8 only) */
+    "shrdq $52,%%r9,%%r8\n"
+    /* t3 (tmp1) = d & M */
+    "movq %%rcx,%%rsi\n"
+    "movq $0xfffffffffffff,%%rdx\n"
+    "andq %%rdx,%%rsi\n"
+    "movq %%rsi,%q1\n"
+    /* d >>= 52 */
+    "shrdq $52,%%r15,%%rcx\n"
+    "xorq %%r15,%%r15\n"
+    /* d += a4 * b0 */
+    "movq 0(%%rbx),%%rax\n"
+    "mulq %%r14\n"
+    "addq %%rax,%%rcx\n"
+    "adcq %%rdx,%%r15\n"
+    /* d += a3 * b1 */
+    "movq 8(%%rbx),%%rax\n"
+    "mulq %%r13\n"
+    "addq %%rax,%%rcx\n"
+    "adcq %%rdx,%%r15\n"
+    /* d += a2 * b2 */
+    "movq 16(%%rbx),%%rax\n"
+    "mulq %%r12\n"
+    "addq %%rax,%%rcx\n"
+    "adcq %%rdx,%%r15\n"
+    /* d += a1 * b3 */
+    "movq 24(%%rbx),%%rax\n"
+    "mulq %%r11\n"
+    "addq %%rax,%%rcx\n"
+    "adcq %%rdx,%%r15\n"
+    /* d += a0 * b4 */
+    "movq 32(%%rbx),%%rax\n"
+    "mulq %%r10\n"
+    "addq %%rax,%%rcx\n"
+    "adcq %%rdx,%%r15\n"
+    /* d += c * R */
+    "movq %%r8,%%rax\n"
+    "movq $0x1000003d10,%%rdx\n"
+    "mulq %%rdx\n"
+    "addq %%rax,%%rcx\n"
+    "adcq %%rdx,%%r15\n"
+    /* t4 = d & M (%%rsi) */
+    "movq %%rcx,%%rsi\n"
+    "movq $0xfffffffffffff,%%rdx\n"
+    "andq %%rdx,%%rsi\n"
+    /* d >>= 52 */
+    "shrdq $52,%%r15,%%rcx\n"
+    "xorq %%r15,%%r15\n"
+    /* tx = t4 >> 48 (tmp3) */
+    "movq %%rsi,%%rax\n"
+    "shrq $48,%%rax\n"
+    "movq %%rax,%q3\n"
+    /* t4 &= (M >> 4) (tmp2) */
+    "movq $0xffffffffffff,%%rax\n"
+    "andq %%rax,%%rsi\n"
+    "movq %%rsi,%q2\n"
+    /* c = a0 * b0 */
+    "movq 0(%%rbx),%%rax\n"
+    "mulq %%r10\n"
+    "movq %%rax,%%r8\n"
+    "movq %%rdx,%%r9\n"
+    /* d += a4 * b1 */
+    "movq 8(%%rbx),%%rax\n"
+    "mulq %%r14\n"
+    "addq %%rax,%%rcx\n"
+    "adcq %%rdx,%%r15\n"
+    /* d += a3 * b2 */
+    "movq 16(%%rbx),%%rax\n"
+    "mulq %%r13\n"
+    "addq %%rax,%%rcx\n"
+    "adcq %%rdx,%%r15\n"
+    /* d += a2 * b3 */
+    "movq 24(%%rbx),%%rax\n"
+    "mulq %%r12\n"
+    "addq %%rax,%%rcx\n"
+    "adcq %%rdx,%%r15\n"
+    /* d += a1 * b4 */
+    "movq 32(%%rbx),%%rax\n"
+    "mulq %%r11\n"
+    "addq %%rax,%%rcx\n"
+    "adcq %%rdx,%%r15\n"
+    /* u0 = d & M (%%rsi) */
+    "movq %%rcx,%%rsi\n"
+    "movq $0xfffffffffffff,%%rdx\n"
+    "andq %%rdx,%%rsi\n"
+    /* d >>= 52 */
+    "shrdq $52,%%r15,%%rcx\n"
+    "xorq %%r15,%%r15\n"
+    /* u0 = (u0 << 4) | tx (%%rsi) */
+    "shlq $4,%%rsi\n"
+    "movq %q3,%%rax\n"
+    "orq %%rax,%%rsi\n"
+    /* c += u0 * (R >> 4) */
+    "movq $0x1000003d1,%%rax\n"
+    "mulq %%rsi\n"
+    "addq %%rax,%%r8\n"
+    "adcq %%rdx,%%r9\n"
+    /* r[0] = c & M */
+    "movq %%r8,%%rax\n"
+    "movq $0xfffffffffffff,%%rdx\n"
+    "andq %%rdx,%%rax\n"
+    "movq %%rax,0(%%rdi)\n"
+    /* c >>= 52 */
+    "shrdq $52,%%r9,%%r8\n"
+    "xorq %%r9,%%r9\n"
+    /* c += a1 * b0 */
+    "movq 0(%%rbx),%%rax\n"
+    "mulq %%r11\n"
+    "addq %%rax,%%r8\n"
+    "adcq %%rdx,%%r9\n"
+    /* c += a0 * b1 */
+    "movq 8(%%rbx),%%rax\n"
+    "mulq %%r10\n"
+    "addq %%rax,%%r8\n"
+    "adcq %%rdx,%%r9\n"
+    /* d += a4 * b2 */
+    "movq 16(%%rbx),%%rax\n"
+    "mulq %%r14\n"
+    "addq %%rax,%%rcx\n"
+    "adcq %%rdx,%%r15\n"
+    /* d += a3 * b3 */
+    "movq 24(%%rbx),%%rax\n"
+    "mulq %%r13\n"
+    "addq %%rax,%%rcx\n"
+    "adcq %%rdx,%%r15\n"
+    /* d += a2 * b4 */
+    "movq 32(%%rbx),%%rax\n"
+    "mulq %%r12\n"
+    "addq %%rax,%%rcx\n"
+    "adcq %%rdx,%%r15\n"
+    /* c += (d & M) * R */
+    "movq %%rcx,%%rax\n"
+    "movq $0xfffffffffffff,%%rdx\n"
+    "andq %%rdx,%%rax\n"
+    "movq $0x1000003d10,%%rdx\n"
+    "mulq %%rdx\n"
+    "addq %%rax,%%r8\n"
+    "adcq %%rdx,%%r9\n"
+    /* d >>= 52 */
+    "shrdq $52,%%r15,%%rcx\n"
+    "xorq %%r15,%%r15\n"
+    /* r[1] = c & M */
+    "movq %%r8,%%rax\n"
+    "movq $0xfffffffffffff,%%rdx\n"
+    "andq %%rdx,%%rax\n"
+    "movq %%rax,8(%%rdi)\n"
+    /* c >>= 52 */
+    "shrdq $52,%%r9,%%r8\n"
+    "xorq %%r9,%%r9\n"
+    /* c += a2 * b0 */
+    "movq 0(%%rbx),%%rax\n"
+    "mulq %%r12\n"
+    "addq %%rax,%%r8\n"
+    "adcq %%rdx,%%r9\n"
+    /* c += a1 * b1 */
+    "movq 8(%%rbx),%%rax\n"
+    "mulq %%r11\n"
+    "addq %%rax,%%r8\n"
+    "adcq %%rdx,%%r9\n"
+    /* c += a0 * b2 (last use of %%r10 = a0) */
+    "movq 16(%%rbx),%%rax\n"
+    "mulq %%r10\n"
+    "addq %%rax,%%r8\n"
+    "adcq %%rdx,%%r9\n"
+    /* fetch t3 (%%r10, overwrites a0), t4 (%%rsi) */
+    "movq %q2,%%rsi\n"
+    "movq %q1,%%r10\n"
+    /* d += a4 * b3 */
+    "movq 24(%%rbx),%%rax\n"
+    "mulq %%r14\n"
+    "addq %%rax,%%rcx\n"
+    "adcq %%rdx,%%r15\n"
+    /* d += a3 * b4 */
+    "movq 32(%%rbx),%%rax\n"
+    "mulq %%r13\n"
+    "addq %%rax,%%rcx\n"
+    "adcq %%rdx,%%r15\n"
+    /* c += (d & M) * R */
+    "movq %%rcx,%%rax\n"
+    "movq $0xfffffffffffff,%%rdx\n"
+    "andq %%rdx,%%rax\n"
+    "movq $0x1000003d10,%%rdx\n"
+    "mulq %%rdx\n"
+    "addq %%rax,%%r8\n"
+    "adcq %%rdx,%%r9\n"
+    /* d >>= 52 (%%rcx only) */
+    "shrdq $52,%%r15,%%rcx\n"
+    /* r[2] = c & M */
+    "movq %%r8,%%rax\n"
+    "movq $0xfffffffffffff,%%rdx\n"
+    "andq %%rdx,%%rax\n"
+    "movq %%rax,16(%%rdi)\n"
+    /* c >>= 52 */
+    "shrdq $52,%%r9,%%r8\n"
+    "xorq %%r9,%%r9\n"
+    /* c += t3 */
+    "addq %%r10,%%r8\n"
+    /* c += d * R */
+    "movq %%rcx,%%rax\n"
+    "movq $0x1000003d10,%%rdx\n"
+    "mulq %%rdx\n"
+    "addq %%rax,%%r8\n"
+    "adcq %%rdx,%%r9\n"
+    /* r[3] = c & M */
+    "movq %%r8,%%rax\n"
+    "movq $0xfffffffffffff,%%rdx\n"
+    "andq %%rdx,%%rax\n"
+    "movq %%rax,24(%%rdi)\n"
+    /* c >>= 52 (%%r8 only) */
+    "shrdq $52,%%r9,%%r8\n"
+    /* c += t4 (%%r8 only) */
+    "addq %%rsi,%%r8\n"
+    /* r[4] = c */
+    "movq %%r8,32(%%rdi)\n"
+: "+S"(a), "=m"(tmp1), "=m"(tmp2), "=m"(tmp3) /* outputs: operand 0 = rsi (a on entry, overwritten as scratch); operands 1-3 = the memory spill slots */
+: "b"(b), "D"(r) /* inputs: b pinned to rbx, r pinned to rdi; neither register is written by the asm */
+: "%rax", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15", "cc", "memory"
+);
+}
+
+SECP256K1_INLINE static void secp256k1_fe_sqr_inner(uint64_t *r, const uint64_t *a) {
+/** Squares the 5-word input a into the 5-word output r. All five words of a are loaded into registers before the first store to r.
+ * Registers: rdx:rax = multiplication accumulator
+ *            r9:r8   = c
+ *            rcx:rbx = d
+ *            r10-r14 = a0-a4
+ *            r15     = M (0xfffffffffffff)
+ *            rdi     = r
+ *            rsi     = a / t?
+ * In the step comments below: R = 0x1000003d10. */
+  uint64_t tmp1, tmp2, tmp3; /* spill slots written and read back by the asm: tmp1 = t3, tmp2 = masked t4, tmp3 = tx */
+__asm__ __volatile__(
+    "movq 0(%%rsi),%%r10\n"
+    "movq 8(%%rsi),%%r11\n"
+    "movq 16(%%rsi),%%r12\n"
+    "movq 24(%%rsi),%%r13\n"
+    "movq 32(%%rsi),%%r14\n"
+    "movq $0xfffffffffffff,%%r15\n"
+
+    /* d = (a0*2) * a3 */
+    "leaq (%%r10,%%r10,1),%%rax\n"
+    "mulq %%r13\n"
+    "movq %%rax,%%rbx\n"
+    "movq %%rdx,%%rcx\n"
+    /* d += (a1*2) * a2 */
+    "leaq (%%r11,%%r11,1),%%rax\n"
+    "mulq %%r12\n"
+    "addq %%rax,%%rbx\n"
+    "adcq %%rdx,%%rcx\n"
+    /* c = a4 * a4 */
+    "movq %%r14,%%rax\n"
+    "mulq %%r14\n"
+    "movq %%rax,%%r8\n"
+    "movq %%rdx,%%r9\n"
+    /* d += (c & M) * R */
+    "andq %%r15,%%rax\n"
+    "movq $0x1000003d10,%%rdx\n"
+    "mulq %%rdx\n"
+    "addq %%rax,%%rbx\n"
+    "adcq %%rdx,%%rcx\n"
+    /* c >>= 52 (%%r8 only) */
+    "shrdq $52,%%r9,%%r8\n"
+    /* t3 (tmp1) = d & M */
+    "movq %%rbx,%%rsi\n"
+    "andq %%r15,%%rsi\n"
+    "movq %%rsi,%q1\n"
+    /* d >>= 52 */
+    "shrdq $52,%%rcx,%%rbx\n"
+    "xorq %%rcx,%%rcx\n"
+    /* a4 *= 2 (every later use of %%r14 sees the doubled value) */
+    "addq %%r14,%%r14\n"
+    /* d += a0 * a4 */
+    "movq %%r10,%%rax\n"
+    "mulq %%r14\n"
+    "addq %%rax,%%rbx\n"
+    "adcq %%rdx,%%rcx\n"
+    /* d += (a1*2) * a3 */
+    "leaq (%%r11,%%r11,1),%%rax\n"
+    "mulq %%r13\n"
+    "addq %%rax,%%rbx\n"
+    "adcq %%rdx,%%rcx\n"
+    /* d += a2 * a2 */
+    "movq %%r12,%%rax\n"
+    "mulq %%r12\n"
+    "addq %%rax,%%rbx\n"
+    "adcq %%rdx,%%rcx\n"
+    /* d += c * R */
+    "movq %%r8,%%rax\n"
+    "movq $0x1000003d10,%%rdx\n"
+    "mulq %%rdx\n"
+    "addq %%rax,%%rbx\n"
+    "adcq %%rdx,%%rcx\n"
+    /* t4 = d & M (%%rsi) */
+    "movq %%rbx,%%rsi\n"
+    "andq %%r15,%%rsi\n"
+    /* d >>= 52 */
+    "shrdq $52,%%rcx,%%rbx\n"
+    "xorq %%rcx,%%rcx\n"
+    /* tx = t4 >> 48 (tmp3) */
+    "movq %%rsi,%%rax\n"
+    "shrq $48,%%rax\n"
+    "movq %%rax,%q3\n"
+    /* t4 &= (M >> 4) (tmp2) */
+    "movq $0xffffffffffff,%%rax\n"
+    "andq %%rax,%%rsi\n"
+    "movq %%rsi,%q2\n"
+    /* c = a0 * a0 */
+    "movq %%r10,%%rax\n"
+    "mulq %%r10\n"
+    "movq %%rax,%%r8\n"
+    "movq %%rdx,%%r9\n"
+    /* d += a1 * a4 */
+    "movq %%r11,%%rax\n"
+    "mulq %%r14\n"
+    "addq %%rax,%%rbx\n"
+    "adcq %%rdx,%%rcx\n"
+    /* d += (a2*2) * a3 */
+    "leaq (%%r12,%%r12,1),%%rax\n"
+    "mulq %%r13\n"
+    "addq %%rax,%%rbx\n"
+    "adcq %%rdx,%%rcx\n"
+    /* u0 = d & M (%%rsi) */
+    "movq %%rbx,%%rsi\n"
+    "andq %%r15,%%rsi\n"
+    /* d >>= 52 */
+    "shrdq $52,%%rcx,%%rbx\n"
+    "xorq %%rcx,%%rcx\n"
+    /* u0 = (u0 << 4) | tx (%%rsi) */
+    "shlq $4,%%rsi\n"
+    "movq %q3,%%rax\n"
+    "orq %%rax,%%rsi\n"
+    /* c += u0 * (R >> 4) */
+    "movq $0x1000003d1,%%rax\n"
+    "mulq %%rsi\n"
+    "addq %%rax,%%r8\n"
+    "adcq %%rdx,%%r9\n"
+    /* r[0] = c & M */
+    "movq %%r8,%%rax\n"
+    "andq %%r15,%%rax\n"
+    "movq %%rax,0(%%rdi)\n"
+    /* c >>= 52 */
+    "shrdq $52,%%r9,%%r8\n"
+    "xorq %%r9,%%r9\n"
+    /* a0 *= 2 (every later use of %%r10 sees the doubled value) */
+    "addq %%r10,%%r10\n"
+    /* c += a0 * a1 */
+    "movq %%r10,%%rax\n"
+    "mulq %%r11\n"
+    "addq %%rax,%%r8\n"
+    "adcq %%rdx,%%r9\n"
+    /* d += a2 * a4 */
+    "movq %%r12,%%rax\n"
+    "mulq %%r14\n"
+    "addq %%rax,%%rbx\n"
+    "adcq %%rdx,%%rcx\n"
+    /* d += a3 * a3 */
+    "movq %%r13,%%rax\n"
+    "mulq %%r13\n"
+    "addq %%rax,%%rbx\n"
+    "adcq %%rdx,%%rcx\n"
+    /* c += (d & M) * R */
+    "movq %%rbx,%%rax\n"
+    "andq %%r15,%%rax\n"
+    "movq $0x1000003d10,%%rdx\n"
+    "mulq %%rdx\n"
+    "addq %%rax,%%r8\n"
+    "adcq %%rdx,%%r9\n"
+    /* d >>= 52 */
+    "shrdq $52,%%rcx,%%rbx\n"
+    "xorq %%rcx,%%rcx\n"
+    /* r[1] = c & M */
+    "movq %%r8,%%rax\n"
+    "andq %%r15,%%rax\n"
+    "movq %%rax,8(%%rdi)\n"
+    /* c >>= 52 */
+    "shrdq $52,%%r9,%%r8\n"
+    "xorq %%r9,%%r9\n"
+    /* c += a0 * a2 (last use of %%r10) */
+    "movq %%r10,%%rax\n"
+    "mulq %%r12\n"
+    "addq %%rax,%%r8\n"
+    "adcq %%rdx,%%r9\n"
+    /* fetch t3 (%%r10, overwrites a0),t4 (%%rsi) */
+    "movq %q2,%%rsi\n"
+    "movq %q1,%%r10\n"
+    /* c += a1 * a1 */
+    "movq %%r11,%%rax\n"
+    "mulq %%r11\n"
+    "addq %%rax,%%r8\n"
+    "adcq %%rdx,%%r9\n"
+    /* d += a3 * a4 */
+    "movq %%r13,%%rax\n"
+    "mulq %%r14\n"
+    "addq %%rax,%%rbx\n"
+    "adcq %%rdx,%%rcx\n"
+    /* c += (d & M) * R */
+    "movq %%rbx,%%rax\n"
+    "andq %%r15,%%rax\n"
+    "movq $0x1000003d10,%%rdx\n"
+    "mulq %%rdx\n"
+    "addq %%rax,%%r8\n"
+    "adcq %%rdx,%%r9\n"
+    /* d >>= 52 (%%rbx only) */
+    "shrdq $52,%%rcx,%%rbx\n"
+    /* r[2] = c & M */
+    "movq %%r8,%%rax\n"
+    "andq %%r15,%%rax\n"
+    "movq %%rax,16(%%rdi)\n"
+    /* c >>= 52 */
+    "shrdq $52,%%r9,%%r8\n"
+    "xorq %%r9,%%r9\n"
+    /* c += t3 */
+    "addq %%r10,%%r8\n"
+    /* c += d * R */
+    "movq %%rbx,%%rax\n"
+    "movq $0x1000003d10,%%rdx\n"
+    "mulq %%rdx\n"
+    "addq %%rax,%%r8\n"
+    "adcq %%rdx,%%r9\n"
+    /* r[3] = c & M */
+    "movq %%r8,%%rax\n"
+    "andq %%r15,%%rax\n"
+    "movq %%rax,24(%%rdi)\n"
+    /* c >>= 52 (%%r8 only) */
+    "shrdq $52,%%r9,%%r8\n"
+    /* c += t4 (%%r8 only) */
+    "addq %%rsi,%%r8\n"
+    /* r[4] = c */
+    "movq %%r8,32(%%rdi)\n"
+: "+S"(a), "=m"(tmp1), "=m"(tmp2), "=m"(tmp3) /* outputs: operand 0 = rsi (a on entry, overwritten as scratch); operands 1-3 = the memory spill slots */
+: "D"(r) /* input: r pinned to rdi, which the asm never writes */
+: "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15", "cc", "memory"
+);
+}

 #endif
--- a/src/field_5x52_impl.h
+++ b/src/field_5x52_impl.h
@ -102,6 +102,50 @@ static void secp256k1_fe_normalize(secp256k1_fe_t *r) {
 #endif
 }

+static void secp256k1_fe_normalize_var(secp256k1_fe_t *r) { /* normalizes r in place; not constant-time because of the branch on x below */
+    uint64_t t0 = r->n[0], t1 = r->n[1], t2 = r->n[2], t3 = r->n[3], t4 = r->n[4];
+
+    /* Reduce t4 at the start so there will be at most a single carry from the first pass */
+    uint64_t x = t4 >> 48; t4 &= 0x0FFFFFFFFFFFFULL;
+    uint64_t m; /* running AND of the masked limbs t1..t3; equals 0xFFFFFFFFFFFFF only if each of them is all-ones */
+
+    /* The first pass ensures the magnitude is 1, ... */
+    t0 += x * 0x1000003D1ULL;
+    t1 += (t0 >> 52); t0 &= 0xFFFFFFFFFFFFFULL;
+    t2 += (t1 >> 52); t1 &= 0xFFFFFFFFFFFFFULL; m = t1;
+    t3 += (t2 >> 52); t2 &= 0xFFFFFFFFFFFFFULL; m &= t2;
+    t4 += (t3 >> 52); t3 &= 0xFFFFFFFFFFFFFULL; m &= t3;
+
+    /* ... except for a possible carry at bit 48 of t4 (i.e. bit 256 of the field element) */
+    VERIFY_CHECK(t4 >> 49 == 0);
+
+    /* At most a single final reduction is needed; check if the value is >= the field characteristic */
+    x = (t4 >> 48) | ((t4 == 0x0FFFFFFFFFFFFULL) & (m == 0xFFFFFFFFFFFFFULL)
+        & (t0 >= 0xFFFFEFFFFFC2FULL)); /* 0xFFFFEFFFFFC2F is 2^52 - 0x1000003D1: at or above it, adding 0x1000003D1 carries out of t0 */
+
+    if (x) { /* data-dependent branch: the second pass only runs when a reduction is needed */
+        t0 += 0x1000003D1ULL;
+        t1 += (t0 >> 52); t0 &= 0xFFFFFFFFFFFFFULL;
+        t2 += (t1 >> 52); t1 &= 0xFFFFFFFFFFFFFULL;
+        t3 += (t2 >> 52); t2 &= 0xFFFFFFFFFFFFFULL;
+        t4 += (t3 >> 52); t3 &= 0xFFFFFFFFFFFFFULL;
+
+        /* If t4 didn't carry to bit 48 already, then it should have after any final reduction */
+        VERIFY_CHECK(t4 >> 48 == x);
+
+        /* Mask off the possible multiple of 2^256 from the final reduction */
+        t4 &= 0x0FFFFFFFFFFFFULL;
+    }
+
+    r->n[0] = t0; r->n[1] = t1; r->n[2] = t2; r->n[3] = t3; r->n[4] = t4;
+
+#ifdef VERIFY
+    r->magnitude = 1;
+    r->normalized = 1;
+    secp256k1_fe_verify(r);
+#endif
+}
+
 SECP256K1_INLINE static void secp256k1_fe_set_int(secp256k1_fe_t *r, int a) {
    r->n[0] = a;
    r->n[1] = r->n[2] = r->n[3] = r->n[4] = 0;
@ -255,7 +299,7 @@ static void secp256k1_fe_mul(secp256k1_fe_t *r, const secp256k1_fe_t *a, const s
    secp256k1_fe_verify(b);
    VERIFY_CHECK(r != b);
 #endif
-    secp256k1_fe_mul_inner(a->n, b->n, r->n);
+    secp256k1_fe_mul_inner(r->n, a->n, b->n);
 #ifdef VERIFY
    r->magnitude = 1;
    r->normalized = 0;
@ -268,7 +312,7 @@ static void secp256k1_fe_sqr(secp256k1_fe_t *r, const secp256k1_fe_t *a) {
    VERIFY_CHECK(a->magnitude <= 8);
    secp256k1_fe_verify(a);
 #endif
-    secp256k1_fe_sqr_inner(a->n, r->n);
+    secp256k1_fe_sqr_inner(r->n, a->n);
 #ifdef VERIFY
    r->magnitude = 1;
    r->normalized = 0;
--- a/src/field_5x52_int128_impl.h
+++ b/src/field_5x52_int128_impl.h
@ -15,7 +15,7 @@
 #define VERIFY_BITS(x, n) do { } while(0)
 #endif

-SECP256K1_INLINE static void secp256k1_fe_mul_inner(const uint64_t *a, const uint64_t * SECP256K1_RESTRICT b, uint64_t *r) {
+SECP256K1_INLINE static void secp256k1_fe_mul_inner(uint64_t *r, const uint64_t *a, const uint64_t * SECP256K1_RESTRICT b) {
    VERIFY_BITS(a[0], 56);
    VERIFY_BITS(a[1], 56);
    VERIFY_BITS(a[2], 56);
@ -152,7 +152,7 @@ SECP256K1_INLINE static void secp256k1_fe_mul_inner(const uint64_t *a, const uin
    /* [r4 r3 r2 r1 r0] = [p8 p7 p6 p5 p4 p3 p2 p1 p0] */
 }

-SECP256K1_INLINE static void secp256k1_fe_sqr_inner(const uint64_t *a, uint64_t *r) {
+SECP256K1_INLINE static void secp256k1_fe_sqr_inner(uint64_t *r, const uint64_t *a) {
    VERIFY_BITS(a[0], 56);
    VERIFY_BITS(a[1], 56);
    VERIFY_BITS(a[2], 56);
--- a/src/field_gmp_impl.h
+++ b/src/field_gmp_impl.h
@ -46,6 +46,10 @@ static void secp256k1_fe_normalize(secp256k1_fe_t *r) {
        mpn_sub(r->n, r->n, FIELD_LIMBS, secp256k1_field_p, FIELD_LIMBS);
 }

+/* The _var (variable-time) flavour of normalize for this GMP-based field
+ * backend. Here it adds nothing of its own: it forwards r straight to
+ * secp256k1_fe_normalize. */
+static void secp256k1_fe_normalize_var(secp256k1_fe_t *r) {
+    secp256k1_fe_normalize(r);
+}
+
 SECP256K1_INLINE static void secp256k1_fe_set_int(secp256k1_fe_t *r, int a) {
    r->n[0] = a;
    for (int i=1; i<FIELD_LIMBS+1; i++)
--- a/src/field_impl.h
+++ b/src/field_impl.h
@ -66,7 +66,7 @@ static int secp256k1_fe_set_hex(secp256k1_fe_t *r, const char *a, int alen) {
    return secp256k1_fe_set_b32(r, tmp);
 }

-static int secp256k1_fe_sqrt(secp256k1_fe_t *r, const secp256k1_fe_t *a) {
+static int secp256k1_fe_sqrt_var(secp256k1_fe_t *r, const secp256k1_fe_t *a) {

    /** The binary representation of (p + 1)/4 has 3 blocks of 1s, with lengths in
     *  { 2, 22, 223 }. Use an addition chain to calculate 2^n - 1 for each block:
@ -132,7 +132,7 @@ static int secp256k1_fe_sqrt(secp256k1_fe_t *r, const secp256k1_fe_t *a) {
    secp256k1_fe_sqr(&t1, r);
    secp256k1_fe_negate(&t1, &t1, 1);
    secp256k1_fe_add(&t1, a);
-    secp256k1_fe_normalize(&t1);
+    secp256k1_fe_normalize_var(&t1);
    return secp256k1_fe_is_zero(&t1);
 }

@ -206,7 +206,7 @@ static void secp256k1_fe_inv_var(secp256k1_fe_t *r, const secp256k1_fe_t *a) {
 #elif defined(USE_FIELD_INV_NUM)
    unsigned char b[32];
    secp256k1_fe_t c = *a;
-    secp256k1_fe_normalize(&c);
+    secp256k1_fe_normalize_var(&c);
    secp256k1_fe_get_b32(b, &c);
    secp256k1_num_t n;
    secp256k1_num_set_bin(&n, b, 32);
@ -218,30 +218,6 @@ static void secp256k1_fe_inv_var(secp256k1_fe_t *r, const secp256k1_fe_t *a) {
 #endif
 }

-static void secp256k1_fe_inv_all(size_t len, secp256k1_fe_t r[len], const secp256k1_fe_t a[len]) {
-    if (len < 1)
-        return;
-
-    VERIFY_CHECK((r + len <= a) || (a + len <= r));
-
-    r[0] = a[0];
-
-    size_t i = 0;
-    while (++i < len) {
-        secp256k1_fe_mul(&r[i], &r[i - 1], &a[i]);
-    }
-
-    secp256k1_fe_t u; secp256k1_fe_inv(&u, &r[--i]);
-
-    while (i > 0) {
-        int j = i--;
-        secp256k1_fe_mul(&r[j], &r[i], &u);
-        secp256k1_fe_mul(&u, &u, &a[j]);
-    }
-
-    r[0] = u;
-}
-
 static void secp256k1_fe_inv_all_var(size_t len, secp256k1_fe_t r[len], const secp256k1_fe_t a[len]) {
    if (len < 1)
        return;
@ -277,7 +253,7 @@ static void secp256k1_fe_start(void) {
 #endif
    if (secp256k1_fe_consts == NULL) {
        secp256k1_fe_inner_start();
-        secp256k1_fe_consts_t *ret = (secp256k1_fe_consts_t*)malloc(sizeof(secp256k1_fe_consts_t));
+        secp256k1_fe_consts_t *ret = (secp256k1_fe_consts_t*)checked_malloc(sizeof(secp256k1_fe_consts_t));
 #ifndef USE_NUM_NONE
        secp256k1_num_set_bin(&ret->p, secp256k1_fe_consts_p, sizeof(secp256k1_fe_consts_p));
 #endif
--- a/src/group.h
+++ b/src/group.h
@ -51,15 +51,16 @@ static void secp256k1_ge_set_xy(secp256k1_ge_t *r, const secp256k1_fe_t *x, cons

 /** Set a group element (affine) equal to the point with the given X coordinate, and given oddness
 *  for Y. Return value indicates whether the result is valid. */
-static int secp256k1_ge_set_xo(secp256k1_ge_t *r, const secp256k1_fe_t *x, int odd);
+static int secp256k1_ge_set_xo_var(secp256k1_ge_t *r, const secp256k1_fe_t *x, int odd);

 /** Check whether a group element is the point at infinity. */
 static int secp256k1_ge_is_infinity(const secp256k1_ge_t *a);

 /** Check whether a group element is valid (i.e., on the curve). */
-static int secp256k1_ge_is_valid(const secp256k1_ge_t *a);
+static int secp256k1_ge_is_valid_var(const secp256k1_ge_t *a);

 static void secp256k1_ge_neg(secp256k1_ge_t *r, const secp256k1_ge_t *a);
+static void secp256k1_ge_neg_var(secp256k1_ge_t *r, const secp256k1_ge_t *a);

 /** Get a hex representation of a point. *rlen will be overwritten with the real length. */
 static void secp256k1_ge_get_hex(char *r, int *rlen, const secp256k1_ge_t *a);
@ -84,7 +85,7 @@ static void secp256k1_gej_set_ge(secp256k1_gej_t *r, const secp256k1_ge_t *a);
 static void secp256k1_gej_get_x_var(secp256k1_fe_t *r, const secp256k1_gej_t *a);

 /** Set r equal to the inverse of a (i.e., mirrored around the X axis) */
-static void secp256k1_gej_neg(secp256k1_gej_t *r, const secp256k1_gej_t *a);
+static void secp256k1_gej_neg_var(secp256k1_gej_t *r, const secp256k1_gej_t *a);

 /** Check whether a group element is the point at infinity. */
 static int secp256k1_gej_is_infinity(const secp256k1_gej_t *a);
--- a/src/group_impl.h
+++ b/src/group_impl.h
@ -28,13 +28,17 @@ static int secp256k1_ge_is_infinity(const secp256k1_ge_t *a) {
 }

+/* Set r to a with its Y coordinate negated: a is copied into r as a whole,
+ * r->y is normalized, and then r->y is negated in place.
+ * NOTE(review): the trailing 1 passed to secp256k1_fe_negate is presumably the
+ * magnitude bound of its input, which would be why y is normalized first --
+ * confirm against the field API. */
 static void secp256k1_ge_neg(secp256k1_ge_t *r, const secp256k1_ge_t *a) {
-    r->infinity = a->infinity;
-    r->x = a->x;
-    r->y = a->y;
+    *r = *a;
    secp256k1_fe_normalize(&r->y);
    secp256k1_fe_negate(&r->y, &r->y, 1);
 }

+/* Same operation as secp256k1_ge_neg, but the Y coordinate is normalized with
+ * secp256k1_fe_normalize_var: r becomes a copy of a whose y has been
+ * normalized and then negated in place. */
+static void secp256k1_ge_neg_var(secp256k1_ge_t *r, const secp256k1_ge_t *a) {
+    secp256k1_fe_t *y = &r->y;
+    *r = *a;
+    secp256k1_fe_normalize_var(y);
+    secp256k1_fe_negate(y, y, 1);
+}
+
 static void secp256k1_ge_get_hex(char *r, int *rlen, const secp256k1_ge_t *a) {
    char cx[65]; int lx=65;
    char cy[65]; int ly=65;
@ -85,15 +89,16 @@ static void secp256k1_ge_set_gej_var(secp256k1_ge_t *r, secp256k1_gej_t *a) {

 static void secp256k1_ge_set_all_gej_var(size_t len, secp256k1_ge_t r[len], const secp256k1_gej_t a[len]) {
    size_t count = 0;
-    secp256k1_fe_t az[len];
+    secp256k1_fe_t *az = checked_malloc(sizeof(secp256k1_fe_t) * len);
    for (size_t i=0; i<len; i++) {
        if (!a[i].infinity) {
            az[count++] = a[i].z;
        }
    }

-    secp256k1_fe_t azi[count];
+    secp256k1_fe_t *azi = checked_malloc(sizeof(secp256k1_fe_t) * count);
    secp256k1_fe_inv_all_var(count, azi, az);
+    free(az);

    count = 0;
    for (size_t i=0; i<len; i++) {
@ -106,6 +111,7 @@ static void secp256k1_ge_set_all_gej_var(size_t len, secp256k1_ge_t r[len], cons
            secp256k1_fe_mul(&r[i].y, &a[i].y, &zi3);
        }
    }
+    free(azi);
 }

 static void secp256k1_gej_set_infinity(secp256k1_gej_t *r) {
@ -135,16 +141,16 @@ static void secp256k1_ge_clear(secp256k1_ge_t *r) {
    secp256k1_fe_clear(&r->y);
 }

-static int secp256k1_ge_set_xo(secp256k1_ge_t *r, const secp256k1_fe_t *x, int odd) {
+static int secp256k1_ge_set_xo_var(secp256k1_ge_t *r, const secp256k1_fe_t *x, int odd) {
    r->x = *x;
    secp256k1_fe_t x2; secp256k1_fe_sqr(&x2, x);
    secp256k1_fe_t x3; secp256k1_fe_mul(&x3, x, &x2);
    r->infinity = 0;
    secp256k1_fe_t c; secp256k1_fe_set_int(&c, 7);
    secp256k1_fe_add(&c, &x3);
-    if (!secp256k1_fe_sqrt(&r->y, &c))
+    if (!secp256k1_fe_sqrt_var(&r->y, &c))
        return 0;
-    secp256k1_fe_normalize(&r->y);
+    secp256k1_fe_normalize_var(&r->y);
    if (secp256k1_fe_is_odd(&r->y) != odd)
        secp256k1_fe_negate(&r->y, &r->y, 1);
    return 1;
@ -162,12 +168,12 @@ static void secp256k1_gej_get_x_var(secp256k1_fe_t *r, const secp256k1_gej_t *a)
    secp256k1_fe_mul(r, &a->x, &zi2);
 }

-static void secp256k1_gej_neg(secp256k1_gej_t *r, const secp256k1_gej_t *a) {
+/* Set r to a with its Y coordinate negated. The infinity flag and the x, y, z
+ * fields are copied one by one, then r->y is normalized with the _var routine
+ * and negated in place. */
+static void secp256k1_gej_neg_var(secp256k1_gej_t *r, const secp256k1_gej_t *a) {
    r->infinity = a->infinity;
    r->x = a->x;
    r->y = a->y;
    r->z = a->z;
-    secp256k1_fe_normalize(&r->y);
+    secp256k1_fe_normalize_var(&r->y);
    secp256k1_fe_negate(&r->y, &r->y, 1);
 }

@ -175,7 +181,7 @@ static int secp256k1_gej_is_infinity(const secp256k1_gej_t *a) {
    return a->infinity;
 }

-static int secp256k1_gej_is_valid(const secp256k1_gej_t *a) {
+static int secp256k1_gej_is_valid_var(const secp256k1_gej_t *a) {
    if (a->infinity)
        return 0;
    /** y^2 = x^3 + 7
@ -189,12 +195,12 @@ static int secp256k1_gej_is_valid(const secp256k1_gej_t *a) {
    secp256k1_fe_t z6; secp256k1_fe_sqr(&z6, &z2); secp256k1_fe_mul(&z6, &z6, &z2);
    secp256k1_fe_mul_int(&z6, 7);
    secp256k1_fe_add(&x3, &z6);
-    secp256k1_fe_normalize(&y2);
-    secp256k1_fe_normalize(&x3);
+    secp256k1_fe_normalize_var(&y2);
+    secp256k1_fe_normalize_var(&x3);
    return secp256k1_fe_equal(&y2, &x3);
 }

-static int secp256k1_ge_is_valid(const secp256k1_ge_t *a) {
+static int secp256k1_ge_is_valid_var(const secp256k1_ge_t *a) {
    if (a->infinity)
        return 0;
    /* y^2 = x^3 + 7 */
@ -202,8 +208,8 @@ static int secp256k1_ge_is_valid(const secp256k1_ge_t *a) {
    secp256k1_fe_t x3; secp256k1_fe_sqr(&x3, &a->x); secp256k1_fe_mul(&x3, &x3, &a->x);
    secp256k1_fe_t c; secp256k1_fe_set_int(&c, 7);
    secp256k1_fe_add(&x3, &c);
-    secp256k1_fe_normalize(&y2);
-    secp256k1_fe_normalize(&x3);
+    secp256k1_fe_normalize_var(&y2);
+    secp256k1_fe_normalize_var(&x3);
    return secp256k1_fe_equal(&y2, &x3);
 }

@ -255,11 +261,11 @@ static void secp256k1_gej_add_var(secp256k1_gej_t *r, const secp256k1_gej_t *a,
    secp256k1_fe_t u2; secp256k1_fe_mul(&u2, &b->x, &z12);
    secp256k1_fe_t s1; secp256k1_fe_mul(&s1, &a->y, &z22); secp256k1_fe_mul(&s1, &s1, &b->z);
    secp256k1_fe_t s2; secp256k1_fe_mul(&s2, &b->y, &z12); secp256k1_fe_mul(&s2, &s2, &a->z);
-    secp256k1_fe_normalize(&u1);
-    secp256k1_fe_normalize(&u2);
+    secp256k1_fe_normalize_var(&u1);
+    secp256k1_fe_normalize_var(&u2);
    if (secp256k1_fe_equal(&u1, &u2)) {
-        secp256k1_fe_normalize(&s1);
-        secp256k1_fe_normalize(&s2);
+        secp256k1_fe_normalize_var(&s1);
+        secp256k1_fe_normalize_var(&s2);
        if (secp256k1_fe_equal(&s1, &s2)) {
            secp256k1_gej_double_var(r, a);
        } else {
@ -294,15 +300,14 @@ static void secp256k1_gej_add_ge_var(secp256k1_gej_t *r, const secp256k1_gej_t *
    }
    r->infinity = 0;
    secp256k1_fe_t z12; secp256k1_fe_sqr(&z12, &a->z);
-    secp256k1_fe_t u1 = a->x; secp256k1_fe_normalize(&u1);
+    secp256k1_fe_t u1 = a->x;
    secp256k1_fe_t u2; secp256k1_fe_mul(&u2, &b->x, &z12);
-    secp256k1_fe_t s1 = a->y; secp256k1_fe_normalize(&s1);
+    secp256k1_fe_t s1 = a->y; secp256k1_fe_normalize_var(&s1);
    secp256k1_fe_t s2; secp256k1_fe_mul(&s2, &b->y, &z12); secp256k1_fe_mul(&s2, &s2, &a->z);
-    secp256k1_fe_normalize(&u1);
-    secp256k1_fe_normalize(&u2);
+    secp256k1_fe_normalize_var(&u1);
+    secp256k1_fe_normalize_var(&u2);
    if (secp256k1_fe_equal(&u1, &u2)) {
-        secp256k1_fe_normalize(&s1);
-        secp256k1_fe_normalize(&s2);
+        secp256k1_fe_normalize_var(&s2);
        if (secp256k1_fe_equal(&s1, &s2)) {
            secp256k1_gej_double_var(r, a);
        } else {
@ -434,7 +439,7 @@ static void secp256k1_ge_start(void) {
    };
 #endif
    if (secp256k1_ge_consts == NULL) {
-        secp256k1_ge_consts_t *ret = (secp256k1_ge_consts_t*)malloc(sizeof(secp256k1_ge_consts_t));
+        secp256k1_ge_consts_t *ret = (secp256k1_ge_consts_t*)checked_malloc(sizeof(secp256k1_ge_consts_t));
 #ifdef USE_ENDOMORPHISM
        VERIFY_CHECK(secp256k1_fe_set_b32(&ret->beta, secp256k1_ge_consts_beta));
 #endif
--- a/src/scalar_impl.h
+++ b/src/scalar_impl.h
@ -40,7 +40,7 @@ static void secp256k1_scalar_start(void) {
        return;

    /* Allocate. */
-    secp256k1_scalar_consts_t *ret = (secp256k1_scalar_consts_t*)malloc(sizeof(secp256k1_scalar_consts_t));
+    secp256k1_scalar_consts_t *ret = (secp256k1_scalar_consts_t*)checked_malloc(sizeof(secp256k1_scalar_consts_t));

 #ifndef USE_NUM_NONE
    static const unsigned char secp256k1_scalar_consts_order[] = {
--- a/src/secp256k1.c
+++ b/src/secp256k1.c
@ -40,15 +40,12 @@ void secp256k1_stop(void) {
    secp256k1_fe_stop();
 }

-int secp256k1_ecdsa_verify(const unsigned char *msg, int msglen, const unsigned char *sig, int siglen, const unsigned char *pubkey, int pubkeylen) {
+int secp256k1_ecdsa_verify(const unsigned char *msg32, const unsigned char *sig, int siglen, const unsigned char *pubkey, int pubkeylen) {
    DEBUG_CHECK(secp256k1_ecmult_consts != NULL);
-    DEBUG_CHECK(msg != NULL);
-    DEBUG_CHECK(msglen <= 32);
+    DEBUG_CHECK(msg32 != NULL);
    DEBUG_CHECK(sig != NULL);
    DEBUG_CHECK(pubkey != NULL);

-    unsigned char msg32[32] = {0};
-    memcpy(msg32 + 32 - msglen, msg, msglen);
    int ret = -3;
    secp256k1_scalar_t m;
    secp256k1_ecdsa_sig_t s;
@ -72,10 +69,9 @@ end:
    return ret;
 }

-int secp256k1_ecdsa_sign(const unsigned char *message, int messagelen, unsigned char *signature, int *signaturelen, const unsigned char *seckey, const unsigned char *nonce) {
+int secp256k1_ecdsa_sign(const unsigned char *msg32, unsigned char *signature, int *signaturelen, const unsigned char *seckey, const unsigned char *nonce) {
    DEBUG_CHECK(secp256k1_ecmult_gen_consts != NULL);
-    DEBUG_CHECK(message != NULL);
-    DEBUG_CHECK(messagelen <= 32);
+    DEBUG_CHECK(msg32 != NULL);
    DEBUG_CHECK(signature != NULL);
    DEBUG_CHECK(signaturelen != NULL);
    DEBUG_CHECK(seckey != NULL);
@ -85,12 +81,7 @@ int secp256k1_ecdsa_sign(const unsigned char *message, int messagelen, unsigned
    secp256k1_scalar_set_b32(&sec, seckey, NULL);
    int overflow = 0;
    secp256k1_scalar_set_b32(&non, nonce, &overflow);
-    {
-        unsigned char c[32] = {0};
-        memcpy(c + 32 - messagelen, message, messagelen);
-        secp256k1_scalar_set_b32(&msg, c, NULL);
-        memset(c, 0, 32);
-    }
+    secp256k1_scalar_set_b32(&msg, msg32, NULL);
    int ret = !secp256k1_scalar_is_zero(&non) && !overflow;
    secp256k1_ecdsa_sig_t sig;
    if (ret) {
@ -105,10 +96,9 @@ int secp256k1_ecdsa_sign(const unsigned char *message, int messagelen, unsigned
    return ret;
 }

-int secp256k1_ecdsa_sign_compact(const unsigned char *message, int messagelen, unsigned char *sig64, const unsigned char *seckey, const unsigned char *nonce, int *recid) {
+int secp256k1_ecdsa_sign_compact(const unsigned char *msg32, unsigned char *sig64, const unsigned char *seckey, const unsigned char *nonce, int *recid) {
    DEBUG_CHECK(secp256k1_ecmult_gen_consts != NULL);
-    DEBUG_CHECK(message != NULL);
-    DEBUG_CHECK(messagelen <= 32);
+    DEBUG_CHECK(msg32 != NULL);
    DEBUG_CHECK(sig64 != NULL);
    DEBUG_CHECK(seckey != NULL);
    DEBUG_CHECK(nonce != NULL);
@ -117,12 +107,7 @@ int secp256k1_ecdsa_sign_compact(const unsigned char *message, int messagelen, u
    secp256k1_scalar_set_b32(&sec, seckey, NULL);
    int overflow = 0;
    secp256k1_scalar_set_b32(&non, nonce, &overflow);
-    {
-        unsigned char c[32] = {0};
-        memcpy(c + 32 - messagelen, message, messagelen);
-        secp256k1_scalar_set_b32(&msg, c, NULL);
-        memset(c, 0, 32);
-    }
+    secp256k1_scalar_set_b32(&msg, msg32, NULL);
    int ret = !secp256k1_scalar_is_zero(&non) && !overflow;
    secp256k1_ecdsa_sig_t sig;
    if (ret) {
@ -138,18 +123,15 @@ int secp256k1_ecdsa_sign_compact(const unsigned char *message, int messagelen, u
    return ret;
 }

-int secp256k1_ecdsa_recover_compact(const unsigned char *msg, int msglen, const unsigned char *sig64, unsigned char *pubkey, int *pubkeylen, int compressed, int recid) {
+int secp256k1_ecdsa_recover_compact(const unsigned char *msg32, const unsigned char *sig64, unsigned char *pubkey, int *pubkeylen, int compressed, int recid) {
    DEBUG_CHECK(secp256k1_ecmult_consts != NULL);
-    DEBUG_CHECK(msg != NULL);
-    DEBUG_CHECK(msglen <= 32);
+    DEBUG_CHECK(msg32 != NULL);
    DEBUG_CHECK(sig64 != NULL);
    DEBUG_CHECK(pubkey != NULL);
    DEBUG_CHECK(pubkeylen != NULL);
    DEBUG_CHECK(recid >= 0 && recid <= 3);

    int ret = 0;
-    unsigned char msg32[32] = {0};
-    memcpy(msg32 + 32 - msglen, msg, msglen);
    secp256k1_scalar_t m;
    secp256k1_ecdsa_sig_t sig;
    int overflow = 0;
--- a/src/tests.c
+++ b/src/tests.c
@ -11,6 +11,8 @@
 #include <stdio.h>
 #include <stdlib.h>

+#include <time.h>
+
 #include "secp256k1.c"
 #include "testrand_impl.h"

@ -46,7 +48,7 @@ void random_group_element_test(secp256k1_ge_t *ge) {
    secp256k1_fe_t fe;
    do {
        random_field_element_test(&fe);
-        if (secp256k1_ge_set_xo(ge, &fe, secp256k1_rand32() & 1))
+        if (secp256k1_ge_set_xo_var(ge, &fe, secp256k1_rand32() & 1))
            break;
    } while(1);
 }
@ -400,6 +402,30 @@ void scalar_test(void) {
        CHECK(secp256k1_scalar_eq(&r1, &r2));
    }

+    {
+        /* Test multiplicative identity. */
+        secp256k1_scalar_t r1, v1;
+        secp256k1_scalar_set_int(&v1,1);
+        secp256k1_scalar_mul(&r1, &s1, &v1);
+        CHECK(secp256k1_scalar_eq(&r1, &s1));
+    }
+
+    {
+        /* Test additive identity. */
+        secp256k1_scalar_t r1, v0;
+        secp256k1_scalar_set_int(&v0,0);
+        secp256k1_scalar_add(&r1, &s1, &v0);
+        CHECK(secp256k1_scalar_eq(&r1, &s1));
+    }
+
+    {
+        /* Test zero product property. */
+        secp256k1_scalar_t r1, v0;
+        secp256k1_scalar_set_int(&v0,0);
+        secp256k1_scalar_mul(&r1, &s1, &v0);
+        CHECK(secp256k1_scalar_eq(&r1, &v0));
+    }
+
 }

 void run_scalar_tests(void) {
@ -411,9 +437,12 @@ void run_scalar_tests(void) {
        /* (-1)+1 should be zero. */
        secp256k1_scalar_t s, o;
        secp256k1_scalar_set_int(&s, 1);
+        CHECK(secp256k1_scalar_is_one(&s));
        secp256k1_scalar_negate(&o, &s);
        secp256k1_scalar_add(&o, &o, &s);
        CHECK(secp256k1_scalar_is_zero(&o));
+        secp256k1_scalar_negate(&o, &o);
+        CHECK(secp256k1_scalar_is_zero(&o));
    }

 #ifndef USE_NUM_NONE
@ -459,14 +488,14 @@ void random_fe_non_zero(secp256k1_fe_t *nz) {
 void random_fe_non_square(secp256k1_fe_t *ns) {
    random_fe_non_zero(ns);
    secp256k1_fe_t r;
-    if (secp256k1_fe_sqrt(&r, ns)) {
+    if (secp256k1_fe_sqrt_var(&r, ns)) {
        secp256k1_fe_negate(ns, ns, 1);
    }
 }

+/* Compare two field elements by value. Local copies are normalized first: the
+ * copy of a goes through secp256k1_fe_normalize and the copy of b through
+ * secp256k1_fe_normalize_var, so both normalization routines are called.
+ * Returns whatever secp256k1_fe_equal reports for the two normalized copies;
+ * the inputs themselves are not modified. */
 int check_fe_equal(const secp256k1_fe_t *a, const secp256k1_fe_t *b) {
    secp256k1_fe_t an = *a; secp256k1_fe_normalize(&an);
-    secp256k1_fe_t bn = *b; secp256k1_fe_normalize(&bn);
+    secp256k1_fe_t bn = *b; secp256k1_fe_normalize_var(&bn);
    return secp256k1_fe_equal(&an, &bn);
 }

@ -476,6 +505,55 @@ int check_fe_inverse(const secp256k1_fe_t *a, const secp256k1_fe_t *ai) {
    return check_fe_equal(&x, &one);
 }

+/* Miscellaneous field checks on random elements: equality and comparison,
+ * conditional move, and agreement between add, mul_int and mul. The loop body
+ * runs 5*count times. */
+void run_field_misc(void) {
+    /* 32 bytes, all zero except a final 0x05. Loaded into fe5 below and used as
+     * the multiplier whose product is compared against mul_int(.., 5). */
+    const unsigned char f32_5[32] = {
+        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x05,
+    };
+    secp256k1_fe_t x;
+    secp256k1_fe_t y;
+    secp256k1_fe_t z;
+    secp256k1_fe_t q;
+    secp256k1_fe_t fe5;
+    CHECK(secp256k1_fe_set_b32(&fe5, f32_5));
+    for (int i=0; i<5*count; i++) {
+        random_fe(&x);
+        random_fe_non_zero(&y);
+        /* Test the fe equality and comparison operations. */
+        CHECK(secp256k1_fe_cmp_var(&x, &x) == 0);
+        CHECK(secp256k1_fe_equal(&x, &x));
+        /* z = x + y. y comes from random_fe_non_zero, and the checks below
+         * expect z to differ from x. */
+        z = x;
+        secp256k1_fe_add(&z,&y);
+        secp256k1_fe_normalize(&z);
+        /* Test the conditional move. */
+        /* Flag 0: z is expected to stay different from x. */
+        secp256k1_fe_cmov(&z, &x, 0);
+        CHECK(secp256k1_fe_equal(&x, &z) == 0);
+        CHECK(secp256k1_fe_cmp_var(&x, &z) != 0);
+        /* Flag 1: y is expected to compare equal to x afterwards. */
+        secp256k1_fe_cmov(&y, &x, 1);
+        CHECK(secp256k1_fe_equal(&x, &y));
+        /* Test that mul_int, mul, and add agree. */
+        /* y = x + x + x (y equals x at this point) versus z = 3 * x. */
+        secp256k1_fe_add(&y, &x);
+        secp256k1_fe_add(&y, &x);
+        z = x;
+        secp256k1_fe_mul_int(&z, 3);
+        CHECK(check_fe_equal(&y, &z));
+        /* One more x on each side: both now hold 4 * x. */
+        secp256k1_fe_add(&y, &x);
+        secp256k1_fe_add(&z, &x);
+        CHECK(check_fe_equal(&z, &y));
+        /* z = 5 * x through mul_int, q = x * fe5 through mul. */
+        z = x;
+        secp256k1_fe_mul_int(&z, 5);
+        secp256k1_fe_mul(&q, &x, &fe5);
+        CHECK(check_fe_equal(&z, &q));
+        /* Negate x and add it to z and q; both are then compared with y,
+         * which still holds 4 * (original x). */
+        secp256k1_fe_negate(&x, &x, 1);
+        secp256k1_fe_add(&z, &x);
+        secp256k1_fe_add(&q, &x);
+        CHECK(check_fe_equal(&y, &z));
+        CHECK(check_fe_equal(&q, &y));
+    }
+}
+
 void run_field_inv(void) {
    secp256k1_fe_t x, xi, xii;
    for (int i=0; i<10*count; i++) {
@ -498,23 +576,6 @@ void run_field_inv_var(void) {
    }
 }

-void run_field_inv_all(void) {
-    secp256k1_fe_t x[16], xi[16], xii[16];
-    /* Check it's safe to call for 0 elements */
-    secp256k1_fe_inv_all(0, xi, x);
-    for (int i=0; i<count; i++) {
-        size_t len = (secp256k1_rand32() & 15) + 1;
-        for (size_t j=0; j<len; j++)
-            random_fe_non_zero(&x[j]);
-        secp256k1_fe_inv_all(len, xi, x);
-        for (size_t j=0; j<len; j++)
-            CHECK(check_fe_inverse(&x[j], &xi[j]));
-        secp256k1_fe_inv_all(len, xii, xi);
-        for (size_t j=0; j<len; j++)
-            CHECK(check_fe_equal(&x[j], &xii[j]));
-    }
-}
-
 void run_field_inv_all_var(void) {
    secp256k1_fe_t x[16], xi[16], xii[16];
    /* Check it's safe to call for 0 elements */
@ -549,7 +610,7 @@ void run_sqr(void) {

 void test_sqrt(const secp256k1_fe_t *a, const secp256k1_fe_t *k) {
    secp256k1_fe_t r1, r2;
-    int v = secp256k1_fe_sqrt(&r1, a);
+    int v = secp256k1_fe_sqrt_var(&r1, a);
    CHECK((v == 0) == (k == NULL));

    if (k != NULL) {
@ -769,6 +830,7 @@ void run_ecmult_chain(void) {
 }

 void test_point_times_order(const secp256k1_gej_t *point) {
+    unsigned char pub[65];
    /* X * (point + G) + (order-X) * (point + G) = 0 */
    secp256k1_scalar_t x;
    random_scalar_order_test(&x);
@ -779,27 +841,36 @@ void test_point_times_order(const secp256k1_gej_t *point) {
    secp256k1_ecmult(&res2, point, &nx, &nx); /* calc res2 = (order - x) * point + (order - x) * G; */
    secp256k1_gej_add_var(&res1, &res1, &res2);
    CHECK(secp256k1_gej_is_infinity(&res1));
-    CHECK(secp256k1_gej_is_valid(&res1) == 0);
+    CHECK(secp256k1_gej_is_valid_var(&res1) == 0);
    secp256k1_ge_t res3;
    secp256k1_ge_set_gej(&res3, &res1);
    CHECK(secp256k1_ge_is_infinity(&res3));
-    CHECK(secp256k1_ge_is_valid(&res3) == 0);
+    CHECK(secp256k1_ge_is_valid_var(&res3) == 0);
+    int psize = 65;
+    CHECK(secp256k1_eckey_pubkey_serialize(&res3, pub, &psize, 0) == 0);
+    psize = 65;
+    CHECK(secp256k1_eckey_pubkey_serialize(&res3, pub, &psize, 1) == 0);
 }

+/* Starting from x = 2 (hex "02") and squaring x on each of 500 iterations,
+ * run test_point_times_order on every x that secp256k1_ge_set_xo_var accepts
+ * as an X coordinate (requested Y oddness: 1). Afterwards check
+ * secp256k1_fe_get_hex on the final x, first with a too-short length and then
+ * against a fixed expected string. */
 void run_point_times_order(void) {
+    /* NOTE(review): the initializing secp256k1_fe_set_hex call sits inside
+     * VERIFY_CHECK; confirm that macro still evaluates its argument in builds
+     * without VERIFY, otherwise x would start uninitialized. */
    secp256k1_fe_t x; VERIFY_CHECK(secp256k1_fe_set_hex(&x, "02", 2));
    for (int i=0; i<500; i++) {
        secp256k1_ge_t p;
-        if (secp256k1_ge_set_xo(&p, &x, 1)) {
-            CHECK(secp256k1_ge_is_valid(&p));
+        if (secp256k1_ge_set_xo_var(&p, &x, 1)) {
+            CHECK(secp256k1_ge_is_valid_var(&p));
            secp256k1_gej_t j;
            secp256k1_gej_set_ge(&j, &p);
-            CHECK(secp256k1_gej_is_valid(&j));
+            CHECK(secp256k1_gej_is_valid_var(&j));
            test_point_times_order(&j);
        }
        secp256k1_fe_sqr(&x, &x);
    }
-    char c[65]; int cl=65;
+    /* c[1] holds a sentinel: with cl = 1 the first get_hex call is expected to
+     * leave it untouched. */
+    char c[65];
+    int cl = 1;
+    c[1] = 123;
+    secp256k1_fe_get_hex(c, &cl, &x); /* Check that fe_get_hex handles a too short input. */
+    CHECK(c[1] == 123);
+    cl = 65;
    secp256k1_fe_get_hex(c, &cl, &x);
    CHECK(strcmp(c, "7603CB59B0EF6C63FE6084792A0C378CDB3233A80F8A9A09A877DEAD31B38C45") == 0);
 }
@ -894,7 +965,10 @@ void test_ecdsa_end_to_end(void) {
    /* Construct and verify corresponding public key. */
    CHECK(secp256k1_ec_seckey_verify(privkey) == 1);
    unsigned char pubkey[65]; int pubkeylen = 65;
-    CHECK(secp256k1_ec_pubkey_create(pubkey, &pubkeylen, privkey, secp256k1_rand32() % 2) == 1);
+    CHECK(secp256k1_ec_pubkey_create(pubkey, &pubkeylen, privkey, (secp256k1_rand32() & 3) != 0) == 1);
+    if (secp256k1_rand32() & 1) {
+        CHECK(secp256k1_ec_pubkey_decompress(pubkey, &pubkeylen));
+    }
    CHECK(secp256k1_ec_pubkey_verify(pubkey, pubkeylen));

    /* Verify private key import and export. */
@ -935,38 +1009,96 @@ void test_ecdsa_end_to_end(void) {
    while(1) {
        unsigned char rnd[32];
        secp256k1_rand256_test(rnd);
-        if (secp256k1_ecdsa_sign(message, 32, signature, &signaturelen, privkey, rnd) == 1) {
+        if (secp256k1_ecdsa_sign(message, signature, &signaturelen, privkey, rnd) == 1) {
            break;
        }
    }
    /* Verify. */
-    CHECK(secp256k1_ecdsa_verify(message, 32, signature, signaturelen, pubkey, pubkeylen) == 1);
+    CHECK(secp256k1_ecdsa_verify(message, signature, signaturelen, pubkey, pubkeylen) == 1);
    /* Destroy signature and verify again. */
    signature[signaturelen - 1 - secp256k1_rand32() % 20] += 1 + (secp256k1_rand32() % 255);
-    CHECK(secp256k1_ecdsa_verify(message, 32, signature, signaturelen, pubkey, pubkeylen) != 1);
+    CHECK(secp256k1_ecdsa_verify(message, signature, signaturelen, pubkey, pubkeylen) != 1);

    /* Compact sign. */
    unsigned char csignature[64]; int recid = 0;
    while(1) {
        unsigned char rnd[32];
        secp256k1_rand256_test(rnd);
-        if (secp256k1_ecdsa_sign_compact(message, 32, csignature, privkey, rnd, &recid) == 1) {
+        if (secp256k1_ecdsa_sign_compact(message, csignature, privkey, rnd, &recid) == 1) {
            break;
        }
    }
    /* Recover. */
    unsigned char recpubkey[65]; int recpubkeylen = 0;
-    CHECK(secp256k1_ecdsa_recover_compact(message, 32, csignature, recpubkey, &recpubkeylen, pubkeylen == 33, recid) == 1);
+    CHECK(secp256k1_ecdsa_recover_compact(message, csignature, recpubkey, &recpubkeylen, pubkeylen == 33, recid) == 1);
    CHECK(recpubkeylen == pubkeylen);
    CHECK(memcmp(pubkey, recpubkey, pubkeylen) == 0);
    /* Destroy signature and verify again. */
    csignature[secp256k1_rand32() % 64] += 1 + (secp256k1_rand32() % 255);
-    CHECK(secp256k1_ecdsa_recover_compact(message, 32, csignature, recpubkey, &recpubkeylen, pubkeylen == 33, recid) != 1 ||
+    CHECK(secp256k1_ecdsa_recover_compact(message, csignature, recpubkey, &recpubkeylen, pubkeylen == 33, recid) != 1 ||
          memcmp(pubkey, recpubkey, pubkeylen) != 0);
    CHECK(recpubkeylen == pubkeylen);

 }

+/* Build a pseudo-random public key encoding (random length, prefix byte and
+ * payload). If secp256k1_eckey_pubkey_parse accepts it, check that
+ * serialization round-trips and that hybrid prefixes (6/7) are validated. */
+void test_random_pubkeys(void) {
+    unsigned char in[65];
+    /* Generate some randomly sized pubkeys. */
+    /* The length, prefix and hybrid-type choices below are all carved out of
+     * this single 32-bit draw, consuming bits from the low end. */
+    uint32_t r = secp256k1_rand32();
+    /* Low two bits zero -> 65 bytes, otherwise 33. */
+    int len = (r & 3) == 0 ? 65 : 33;
+    r>>=2;
+    /* Next two bits zero -> replace len with a value in 0..31. */
+    if ((r & 3) == 0) len = (r & 252) >> 3;
+    r>>=8;
+    /* Prefix byte: 4, 6 or 7 for 65-byte inputs, 2 or 3 for every other length. */
+    if (len == 65) {
+      in[0] = (r & 2) ? 4 : (r & 1? 6 : 7);
+    } else {
+      in[0] = (r & 1) ? 2 : 3;
+    }
+    r>>=2;
+    /* Low three bits zero -> overwrite the prefix with an arbitrary byte (0..255). */
+    if ((r & 7) == 0) in[0] = (r & 2040) >> 3;
+    r>>=11;
+    /* Random payload after the prefix byte (presumably 32 bytes per
+     * secp256k1_rand256 call -- confirm against testrand). */
+    if (len > 1) secp256k1_rand256(&in[1]);
+    if (len > 33) secp256k1_rand256(&in[33]);
+    secp256k1_ge_t elem;
+    secp256k1_ge_t elem2;
+    if (secp256k1_eckey_pubkey_parse(&elem, in, len)) {
+        unsigned char out[65];
+        unsigned char firstb;
+        int res;
+        int size = len;
+        /* Remember the original prefix: in[] is overwritten by the
+         * uncompressed serialization further down. */
+        firstb = in[0];
+        /* If the pubkey can be parsed, it should round-trip... */
+        CHECK(secp256k1_eckey_pubkey_serialize(&elem, out, &size, len == 33));
+        CHECK(size == len);
+        CHECK(memcmp(&in[1], &out[1], len-1) == 0);
+        /* ... except for the type of hybrid inputs. */
+        if ((in[0] != 6) && (in[0] != 7)) CHECK(in[0] == out[0]);
+        /* Re-serialize uncompressed into in[] and check that it parses back to
+         * the same element. */
+        size = 65;
+        CHECK(secp256k1_eckey_pubkey_serialize(&elem, in, &size, 0));
+        CHECK(size == 65);
+        CHECK(secp256k1_eckey_pubkey_parse(&elem2, in, size));
+        CHECK(ge_equals_ge(&elem,&elem2));
+        /* Check that the X9.62 hybrid type is checked. */
+        in[0] = (r & 1) ? 6 : 7;
+        res = secp256k1_eckey_pubkey_parse(&elem2, in, size);
+        /* The outcome is only pinned when the original prefix was 2 or 3: the
+         * hybrid prefix must then equal firstb + 4 to be accepted. */
+        if (firstb == 2 || firstb == 3) {
+            if (in[0] == firstb + 4) CHECK(res);
+            else CHECK(!res);
+        }
+        /* An accepted hybrid encoding must decode to the same element, and the
+         * uncompressed serialization must share its 64 payload bytes. */
+        if (res) {
+            CHECK(ge_equals_ge(&elem,&elem2));
+            CHECK(secp256k1_eckey_pubkey_serialize(&elem, out, &size, 0));
+            CHECK(memcmp(&in[1], &out[1], 64) == 0);
+        }
+    }
+}
+
+/* Driver for test_random_pubkeys: invokes it 10*count times in a row. */
+void run_random_pubkeys(void) {
+    int round = 0;
+    while (round < 10*count) {
+        test_random_pubkeys();
+        round++;
+    }
+}
+
 void run_ecdsa_end_to_end(void) {
    for (int i=0; i<64*count; i++) {
        test_ecdsa_end_to_end();
@ -995,10 +1127,10 @@ void test_ecdsa_edge_cases(void) {
    };
    unsigned char pubkey[65];
    int pubkeylen = 65;
-    CHECK(!secp256k1_ecdsa_recover_compact(msg32, 32, sig64, pubkey, &pubkeylen, 0, 0));
-    CHECK(secp256k1_ecdsa_recover_compact(msg32, 32, sig64, pubkey, &pubkeylen, 0, 1));
-    CHECK(!secp256k1_ecdsa_recover_compact(msg32, 32, sig64, pubkey, &pubkeylen, 0, 2));
-    CHECK(!secp256k1_ecdsa_recover_compact(msg32, 32, sig64, pubkey, &pubkeylen, 0, 3));
+    CHECK(!secp256k1_ecdsa_recover_compact(msg32, sig64, pubkey, &pubkeylen, 0, 0));
+    CHECK(secp256k1_ecdsa_recover_compact(msg32, sig64, pubkey, &pubkeylen, 0, 1));
+    CHECK(!secp256k1_ecdsa_recover_compact(msg32, sig64, pubkey, &pubkeylen, 0, 2));
+    CHECK(!secp256k1_ecdsa_recover_compact(msg32, sig64, pubkey, &pubkeylen, 0, 3));

    /* signature (r,s) = (4,4), which can be recovered with all 4 recids. */
    const unsigned char sigb64[64] = {
@ -1016,6 +1148,36 @@ void test_ecdsa_edge_cases(void) {
    for (int recid = 0; recid < 4; recid++) {
        /* (4,4) encoded in DER. */
        unsigned char sigbder[8] = {0x30, 0x06, 0x02, 0x01, 0x04, 0x02, 0x01, 0x04};
+        unsigned char sigcder_zr[7] = {0x30, 0x05, 0x02, 0x00, 0x02, 0x01, 0x01};
+        unsigned char sigcder_zs[7] = {0x30, 0x05, 0x02, 0x01, 0x01, 0x02, 0x00};
+        unsigned char sigbderalt1[39] = {
+            0x30, 0x25, 0x02, 0x20, 0x00, 0x00, 0x00, 0x00,
+            0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+            0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+            0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+            0x00, 0x00, 0x00, 0x04, 0x02, 0x01, 0x04,
+        };
+        unsigned char sigbderalt2[39] = {
+            0x30, 0x25, 0x02, 0x01, 0x04, 0x02, 0x20, 0x00,
+            0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+            0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+            0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+            0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x04,
+        };
+        unsigned char sigbderalt3[40] = {
+            0x30, 0x26, 0x02, 0x21, 0x00, 0x00, 0x00, 0x00,
+            0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+            0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+            0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+            0x00, 0x00, 0x00, 0x00, 0x04, 0x02, 0x01, 0x04,
+        };
+        unsigned char sigbderalt4[40] = {
+            0x30, 0x26, 0x02, 0x01, 0x04, 0x02, 0x21, 0x00,
+            0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+            0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+            0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+            0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x04,
+        };
        /* (order + r,4) encoded in DER. */
        unsigned char sigbderlong[40] = {
            0x30, 0x26, 0x02, 0x21, 0x00, 0xFF, 0xFF, 0xFF,
@ -1024,18 +1186,45 @@ void test_ecdsa_edge_cases(void) {
            0xE6, 0xAF, 0x48, 0xA0, 0x3B, 0xBF, 0xD2, 0x5E,
            0x8C, 0xD0, 0x36, 0x41, 0x45, 0x02, 0x01, 0x04
        };
-        CHECK(secp256k1_ecdsa_recover_compact(msg32, 32, sigb64, pubkeyb, &pubkeyblen, 1, recid));
-        CHECK(secp256k1_ecdsa_verify(msg32, 32, sigbder, sizeof(sigbder), pubkeyb, pubkeyblen) == 1);
+        CHECK(secp256k1_ecdsa_recover_compact(msg32, sigb64, pubkeyb, &pubkeyblen, 1, recid));
+        CHECK(secp256k1_ecdsa_verify(msg32, sigbder, sizeof(sigbder), pubkeyb, pubkeyblen) == 1);
        for (int recid2 = 0; recid2 < 4; recid2++) {
            unsigned char pubkey2b[33];
            int pubkey2blen = 33;
-            CHECK(secp256k1_ecdsa_recover_compact(msg32, 32, sigb64, pubkey2b, &pubkey2blen, 1, recid2));
+            CHECK(secp256k1_ecdsa_recover_compact(msg32, sigb64, pubkey2b, &pubkey2blen, 1, recid2));
            /* Verifying with (order + r,4) should always fail. */
-            CHECK(secp256k1_ecdsa_verify(msg32, 32, sigbderlong, sizeof(sigbderlong), pubkey2b, pubkey2blen) != 1);
+            CHECK(secp256k1_ecdsa_verify(msg32, sigbderlong, sizeof(sigbderlong), pubkey2b, pubkey2blen) != 1);
        }
+        /* DER parsing tests. */
+        /* Zero length r/s. */
+        CHECK(secp256k1_ecdsa_verify(msg32, sigcder_zr, sizeof(sigcder_zr), pubkeyb, pubkeyblen) == -2);
+        CHECK(secp256k1_ecdsa_verify(msg32, sigcder_zs, sizeof(sigcder_zs), pubkeyb, pubkeyblen) == -2);
+        /* Leading zeros. */
+        CHECK(secp256k1_ecdsa_verify(msg32, sigbderalt1, sizeof(sigbderalt1), pubkeyb, pubkeyblen) == 1);
+        CHECK(secp256k1_ecdsa_verify(msg32, sigbderalt2, sizeof(sigbderalt2), pubkeyb, pubkeyblen) == 1);
+        CHECK(secp256k1_ecdsa_verify(msg32, sigbderalt3, sizeof(sigbderalt3), pubkeyb, pubkeyblen) == 1);
+        CHECK(secp256k1_ecdsa_verify(msg32, sigbderalt4, sizeof(sigbderalt4), pubkeyb, pubkeyblen) == 1);
+        sigbderalt3[4] = 1;
+        CHECK(secp256k1_ecdsa_verify(msg32, sigbderalt3, sizeof(sigbderalt3), pubkeyb, pubkeyblen) == -2);
+        sigbderalt4[7] = 1;
+        CHECK(secp256k1_ecdsa_verify(msg32, sigbderalt4, sizeof(sigbderalt4), pubkeyb, pubkeyblen) == -2);
        /* Damage signature. */
        sigbder[7]++;
-        CHECK(secp256k1_ecdsa_verify(msg32, 32, sigbder, sizeof(sigbder), pubkeyb, pubkeyblen) == 0);
+        CHECK(secp256k1_ecdsa_verify(msg32, sigbder, sizeof(sigbder), pubkeyb, pubkeyblen) == 0);
+        sigbder[7]--;
+        CHECK(secp256k1_ecdsa_verify(msg32, sigbder, 6, pubkeyb, pubkeyblen) == -2);
+        CHECK(secp256k1_ecdsa_verify(msg32, sigbder, sizeof(sigbder)-1, pubkeyb, pubkeyblen) == -2);
+        for(int i = 0; i<8; i++) {
+            unsigned char orig = sigbder[i];
+            /* Try every single-byte change. Bytes 4 and 7 of sigbder carry the r and s
+             * values of the (4,4) signature, so altering them is expected to return 0;
+             * altering any other byte is expected to return -2. */
+            for (int c=0; c<256; c++) {
+                if (c == orig ) continue;
+                sigbder[i] = c;
+                /* The conditional must be parenthesized: == binds tighter than ?:, so
+                 * without the parentheses CHECK would only ever see the constants 0 or -2
+                 * and the return value would never be compared with the expected code. */
+                CHECK(secp256k1_ecdsa_verify(msg32, sigbder, sizeof(sigbder), pubkeyb, pubkeyblen) ==
+                  ((i==4 || i==7) ? 0 : -2));
+            }
+            /* Restore the byte so the next position starts from the valid signature. */
+            sigbder[i] = orig;
+        }
    }

    /* Test the case where ECDSA recomputes a point that is infinity. */
@ -1069,18 +1258,60 @@ void test_ecdsa_edge_cases(void) {
        };
        unsigned char pubkeyc[65];
        int pubkeyclen = 65;
-        CHECK(secp256k1_ecdsa_recover_compact(msg32, 32, sigc64, pubkeyc, &pubkeyclen, 0, 0) == 1);
-        CHECK(secp256k1_ecdsa_verify(msg32, 32, sigcder, sizeof(sigcder), pubkeyc, pubkeyclen) == 1);
+        CHECK(secp256k1_ecdsa_recover_compact(msg32, sigc64, pubkeyc, &pubkeyclen, 0, 0) == 1);
+        CHECK(secp256k1_ecdsa_verify(msg32, sigcder, sizeof(sigcder), pubkeyc, pubkeyclen) == 1);
        sigcder[4] = 0;
        sigc64[31] = 0;
-        CHECK(secp256k1_ecdsa_recover_compact(msg32, 32, sigc64, pubkeyb, &pubkeyblen, 1, 0) == 0);
-        CHECK(secp256k1_ecdsa_verify(msg32, 32, sigcder, sizeof(sigcder), pubkeyc, pubkeyclen) == 0);
+        CHECK(secp256k1_ecdsa_recover_compact(msg32, sigc64, pubkeyb, &pubkeyblen, 1, 0) == 0);
+        CHECK(secp256k1_ecdsa_verify(msg32, sigcder, sizeof(sigcder), pubkeyc, pubkeyclen) == 0);
        sigcder[4] = 1;
        sigcder[7] = 0;
        sigc64[31] = 1;
        sigc64[63] = 0;
-        CHECK(secp256k1_ecdsa_recover_compact(msg32, 32, sigc64, pubkeyb, &pubkeyblen, 1, 0) == 0);
-        CHECK(secp256k1_ecdsa_verify(msg32, 32, sigcder, sizeof(sigcder), pubkeyc, pubkeyclen) == 0);
+        CHECK(secp256k1_ecdsa_recover_compact(msg32, sigc64, pubkeyb, &pubkeyblen, 1, 0) == 0);
+        CHECK(secp256k1_ecdsa_verify(msg32, sigcder, sizeof(sigcder), pubkeyc, pubkeyclen) == 0);
+    }
+
+    /*Signature where s would be zero.*/
+    {
+        const unsigned char nonce[32] = {
+            0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+            0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+            0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+            0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01,
+        };
+        const unsigned char key[32] = {
+            0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+            0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+            0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+            0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01,
+        };
+        unsigned char msg[32] = {
+            0x86, 0x41, 0x99, 0x81, 0x06, 0x23, 0x44, 0x53,
+            0xaa, 0x5f, 0x9d, 0x6a, 0x31, 0x78, 0xf4, 0xf7,
+            0xb8, 0x12, 0xe0, 0x0b, 0x81, 0x7a, 0x77, 0x62,
+            0x65, 0xdf, 0xdd, 0x31, 0xb9, 0x3e, 0x29, 0xa9,
+        };
+        unsigned char sig[72];
+        int siglen = 72;
+        CHECK(secp256k1_ecdsa_sign(msg, sig, &siglen, key, nonce) == 0);
+        msg[31] = 0xaa;
+        siglen = 72;
+        CHECK(secp256k1_ecdsa_sign(msg, sig, &siglen, key, nonce) == 1);
+    }
+
+    /* Privkey export where pubkey is the point at infinity. */
+    {
+        unsigned char privkey[300];
+        unsigned char seckey[32] = {
+            0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+            0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xfe,
+            0xba, 0xae, 0xdc, 0xe6, 0xaf, 0x48, 0xa0, 0x3b,
+            0xbf, 0xd2, 0x5e, 0x8c, 0xd0, 0x36, 0x41, 0x41,
+        };
+        int outlen = 300;
+        CHECK(!secp256k1_ec_privkey_export(seckey, privkey, &outlen, 0));
+        CHECK(!secp256k1_ec_privkey_export(seckey, privkey, &outlen, 1));
    }
 }

@ -1185,8 +1416,8 @@ int main(int argc, char **argv) {
    /* field tests */
    run_field_inv();
    run_field_inv_var();
-    run_field_inv_all();
    run_field_inv_all_var();
+    run_field_misc();
    run_sqr();
    run_sqrt();

@ -1199,6 +1430,7 @@ int main(int argc, char **argv) {
    run_ecmult_chain();

    /* ecdsa tests */
+    run_random_pubkeys();
    run_ecdsa_sign_verify();
    run_ecdsa_end_to_end();
    run_ecdsa_edge_cases();
--- a/src/util.h
+++ b/src/util.h
@ -61,6 +61,12 @@
 #define VERIFY_CHECK(cond) do { (void)(cond); } while(0)
 #endif

+/* Allocate size bytes with malloc() and return the resulting pointer.
+ * The pointer is passed through CHECK(ret != NULL) before it is returned.
+ * NOTE(review): CHECK is defined outside this hunk; presumably it aborts on
+ * failure, so callers never see NULL - confirm against its definition.
+ * NOTE(review): malloc(0) may legitimately return NULL on some platforms;
+ * presumably no caller passes a zero size - confirm. */
+static inline void *checked_malloc(size_t size) {
+    void *ret = malloc(size);
+    CHECK(ret != NULL);
+    return ret;
+}
+
 /* Macro for restrict, when available and not in a VERIFY build. */
 #if defined(SECP256K1_BUILD) && defined(VERIFY)
 # define SECP256K1_RESTRICT