split from mk3

2021-06-17 10:36:09 -04:00 · 2021-06-17 10:36:09 -04:00 · c82412489f
commit c82412489f
parent b733614146
15 changed files with 28864 additions and 1 deletions
--- a/stm32/mk4-bootloader/micro-ecc
+++ b/stm32/mk4-bootloader/micro-ecc
@ -1 +0,0 @@
-../bootloader/micro-ecc
--- a/stm32/mk4-bootloader/micro-ecc/LICENSE.txt
+++ b/stm32/mk4-bootloader/micro-ecc/LICENSE.txt
@ -0,0 +1,21 @@
+Copyright (c) 2014, Kenneth MacKay
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without modification,
+are permitted provided that the following conditions are met:
+ * Redistributions of source code must retain the above copyright notice, this
+   list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright notice,
+   this list of conditions and the following disclaimer in the documentation
+   and/or other materials provided with the distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
--- a/stm32/mk4-bootloader/micro-ecc/README.md
+++ b/stm32/mk4-bootloader/micro-ecc/README.md
@ -0,0 +1,41 @@
+micro-ecc
+==========
+
+A small and fast ECDH and ECDSA implementation for 8-bit, 32-bit, and 64-bit processors.
+
+The static version of micro-ecc (ie, where the curve was selected at compile-time) can be found in the "static" branch.
+
+Features
+--------
+
+ * Resistant to known side-channel attacks.
+ * Written in C, with optional GCC inline assembly for AVR, ARM and Thumb platforms.
+ * Supports 8, 32, and 64-bit architectures.
+ * Small code size.
+ * No dynamic memory allocation.
+ * Support for 5 standard curves: secp160r1, secp192r1, secp224r1, secp256r1, and secp256k1.
+ * BSD 2-clause license.
+
+Usage Notes
+-----------
+### Point Representation ###
+Compressed points are represented in the standard format as defined in http://www.secg.org/collateral/sec1_final.pdf; uncompressed points are represented in standard format, but without the `0x04` prefix. All functions except `uECC_compress()` only accept uncompressed points; use `uECC_compress()` and `uECC_decompress()` to convert between compressed and uncompressed point representations.
+
+Private keys are represented in the standard format.
+
+### Using the Code ###
+
+I recommend just copying (or symlinking) the uECC files into your project. Then just `#include "uECC.h"` to use the micro-ecc functions.
+
+For use with Arduino, you can just create a symlink to the `uECC` directory in your Arduino `libraries` directory. You can then use uECC just like any other Arduino library (uECC should show up in the **Sketch**=>**Import Library** submenu).
+
+See uECC.h for documentation for each function.
+
+### Compilation Notes ###
+
+ * Should compile with any C/C++ compiler that supports stdint.h (this includes Visual Studio 2013).
+ * If you want to change the defaults for any of the uECC compile-time options (such as `uECC_OPTIMIZATION_LEVEL`), you must change them in your Makefile or similar so that uECC.c is compiled with the desired values (ie, compile uECC.c with `-DuECC_OPTIMIZATION_LEVEL=3` or whatever).
+ * When compiling for a Thumb-1 platform, you must use the `-fomit-frame-pointer` GCC option (this is enabled by default when compiling with `-O1` or higher).
+ * When compiling for an ARM/Thumb-2 platform with `uECC_OPTIMIZATION_LEVEL` >= 3, you must use the `-fomit-frame-pointer` GCC option (this is enabled by default when compiling with `-O1` or higher).
+ * When compiling for AVR, you must have optimizations enabled (compile with `-O1` or higher).
+ * When building for Windows, you will need to link in the `advapi32.lib` system library.
--- a/stm32/mk4-bootloader/micro-ecc/asm_arm.inc
+++ b/stm32/mk4-bootloader/micro-ecc/asm_arm.inc
--- a/stm32/mk4-bootloader/micro-ecc/asm_arm_mult_square.inc
+++ b/stm32/mk4-bootloader/micro-ecc/asm_arm_mult_square.inc
--- a/stm32/mk4-bootloader/micro-ecc/asm_avr.inc
+++ b/stm32/mk4-bootloader/micro-ecc/asm_avr.inc
@ -0,0 +1,960 @@
+/* Copyright 2015, Kenneth MacKay. Licensed under the BSD 2-clause license. */
+
+#ifndef _UECC_ASM_AVR_H_
+#define _UECC_ASM_AVR_H_
+
+/* Indirect-jump mnemonic used by the computed jumps in this file:
+   "eijmp" when __AVR_HAVE_EIJMP_EICALL__ is set, "ijmp" otherwise. */
+#if __AVR_HAVE_EIJMP_EICALL__
+    #define IJMP "eijmp \n\t"
+#else
+    #define IJMP "ijmp \n\t"
+#endif
+
+#if (uECC_OPTIMIZATION_LEVEL >= 2)
+
+/* Zeroes the value at vli with an unrolled run of "st x+, __zero_reg__"
+ * stores (one byte per store; assumes 8-bit uECC_word_t -- TODO confirm).
+ *
+ * vli       - buffer to clear; addressed through X.
+ * num_words - number of stores that must execute.
+ *
+ * When uECC_MAX_WORDS != uECC_MIN_WORDS, Z is loaded with the program address
+ * of label 1 minus num_words and the indirect jump enters the unrolled run so
+ * that only the last num_words stores execute. Otherwise all uECC_MAX_WORDS
+ * stores run.
+ */
+uECC_VLI_API void uECC_vli_clear(uECC_word_t *vli, wordcount_t num_words) {
+    volatile uECC_word_t *v = vli;
+    __asm__ volatile (
+    #if (uECC_MAX_WORDS != uECC_MIN_WORDS)
+        "ldi r30, pm_lo8(1f) \n\t"
+        "ldi r31, pm_hi8(1f) \n\t"
+        "sub r30, %[num] \n\t"
+        "sbc r31, __zero_reg__ \n\t"
+        IJMP
+    #endif
+        
+        REPEAT(uECC_MAX_WORDS, "st x+, __zero_reg__ \n\t")
+        "1: \n\t"
+        : "+x" (v)
+        : [num] "r" (num_words)
+        : 
+    #if (uECC_MAX_WORDS != uECC_MIN_WORDS)
+          "r30", "r31", "cc"
+    #endif
+    );
+}
+#define asm_clear 1
+
+/* Copies src to dest with an unrolled run of "ld r0, y+ / st x+, r0" pairs
+ * (one byte per pair; assumes 8-bit uECC_word_t -- TODO confirm).
+ *
+ * dest      - destination buffer; addressed through X.
+ * src       - source buffer; addressed through Y.
+ * num_words - number of load/store pairs that must execute.
+ *
+ * When uECC_MAX_WORDS != uECC_MIN_WORDS, Z is loaded with the program address
+ * of label 1 minus 2 * num_words (two instructions per pair) and the indirect
+ * jump skips the pairs that are not needed.
+ */
+uECC_VLI_API void uECC_vli_set(uECC_word_t *dest, const uECC_word_t *src, wordcount_t num_words) {
+    volatile uECC_word_t *d = dest;
+    __asm__ volatile (
+    #if (uECC_MAX_WORDS != uECC_MIN_WORDS)
+        "ldi r30, pm_lo8(1f) \n\t"
+        "ldi r31, pm_hi8(1f) \n\t"
+        "sub r30, %[num] \n\t"
+        "sbc r31, __zero_reg__ \n\t"
+        IJMP
+    #endif
+        
+        REPEAT(uECC_MAX_WORDS,
+            "ld r0, y+ \n\t"
+            "st x+, r0 \n\t")
+        "1: \n\t"
+        : "+x" (d), "+y" (src)
+        : [num] "r" ((uint8_t)(num_words * 2))
+        /* The separating comma lives inside the #if so that the clobber list
+           does not end in a dangling comma in the fixed-size build. */
+        : "r0"
+    #if (uECC_MAX_WORDS != uECC_MIN_WORDS)
+          , "r30", "r31", "cc"
+    #endif
+    );
+}
+#define asm_set 1
+
+/* Shifts the value at vli right by one bit, in place.
+ *
+ * vli       - buffer to shift; addressed through X.
+ * num_words - added to X so that X starts just past the top of the value.
+ *
+ * The top byte is shifted with lsr; every lower byte is rotated with ror so
+ * the bit shifted out of one byte enters the next lower one through the carry
+ * flag. When uECC_MAX_WORDS != uECC_MIN_WORDS, Z is loaded with the program
+ * address of label 1 minus 3 * (num_words - 1) (three instructions per ror
+ * group) and the indirect jump skips the groups that are not needed.
+ */
+uECC_VLI_API void uECC_vli_rshift1(uECC_word_t *vli, wordcount_t num_words) {
+    volatile uECC_word_t *v = vli;
+    __asm__ volatile (
+    #if (uECC_MAX_WORDS != uECC_MIN_WORDS)
+        "ldi r30, pm_lo8(1f) \n\t"
+        "ldi r31, pm_hi8(1f) \n\t"
+        "sub r30, %[jump] \n\t"
+        "sbc r31, __zero_reg__ \n\t"
+    #endif
+        
+        "add r26, %[num] \n\t"
+        "adc r27, __zero_reg__ \n\t"
+        "ld r0, -x \n\t"
+        "lsr r0 \n\t"
+        "st x, r0 \n\t"
+    #if (uECC_MAX_WORDS != uECC_MIN_WORDS)
+        IJMP
+    #endif
+        
+        REPEAT(DEC(uECC_MAX_WORDS),
+            "ld r0, -x \n\t"
+            "ror r0 \n\t"
+            "st x, r0 \n\t")
+        "1: \n\t"
+        : "+x" (v)
+    #if (uECC_MAX_WORDS != uECC_MIN_WORDS)
+        : [num] "r" (num_words), [jump] "r" ((uint8_t)(3 * (num_words - 1)))
+        : "r0", "r30", "r31", "cc"
+    #else
+        : [num] "r" (num_words)
+        : "r0", "cc"
+    #endif
+    );
+}
+#define asm_rshift1 1
+
+/* Jump-table entry N: point Z at result, then jump to add step N. */
+#define ADD_RJPM_TABLE(N)       \
+    "movw r30, %A[result] \n\t" \
+    "rjmp add_%=_" #N " \n\t"
+
+/* Add step N: load one byte from left (X) and right (Y), add with carry and
+   store the sum through Z. */
+#define ADD_RJPM_DEST(N)     \
+    "add_%=_" #N ":"         \
+    "ld %[clb], x+ \n\t"     \
+    "ld %[rb], y+ \n\t"      \
+    "adc %[clb], %[rb] \n\t" \
+    "st z+, %[clb] \n\t"
+
+/* Unrolled multi-byte addition: result = left + right.
+ *
+ * result    - destination buffer; written through Z.
+ * left      - first operand; addressed through X.
+ * right     - second operand; addressed through Y.
+ * num_words - number of add steps that must execute.
+ *
+ * Returns the carry out of the final add step (0 or 1).
+ *
+ * When uECC_MAX_WORDS != uECC_MIN_WORDS, Z first holds a program address
+ * 2 * num_words below the add_<MAX> label; the indirect jump lands on a table
+ * entry (two instructions each) which reloads Z with result and jumps to the
+ * matching add step. In the fixed-size build no table is emitted, so Z is
+ * loaded with result directly before falling into the add steps.
+ */
+uECC_VLI_API uECC_word_t uECC_vli_add(uECC_word_t *result,
+                                      const uECC_word_t *left,
+                                      const uECC_word_t *right,
+                                      wordcount_t num_words) {
+    volatile uECC_word_t *r = result;
+    uint8_t carry;
+    uint8_t right_byte;
+
+    __asm__ volatile (
+    #if (uECC_MAX_WORDS != uECC_MIN_WORDS)
+        "ldi r30, pm_lo8(add_%=_" STR(uECC_MAX_WORDS) ") \n\t"
+        "ldi r31, pm_hi8(add_%=_" STR(uECC_MAX_WORDS) ") \n\t"
+        "sub r30, %[num] \n\t"
+        "sbc r31, __zero_reg__ \n\t"
+    #else
+        "movw r30, %A[result] \n\t" /* no jump table here: set Z = result */
+    #endif
+    
+        "clc \n\t"
+    #if (uECC_MAX_WORDS != uECC_MIN_WORDS)
+        IJMP
+        REPEATM(uECC_MAX_WORDS, ADD_RJPM_TABLE)
+    #endif
+    
+        REPEATM(uECC_MAX_WORDS, ADD_RJPM_DEST)
+        
+        "mov %[clb], __zero_reg__ \n\t"
+        "adc %[clb], %[clb] \n\t" /* Store carry bit. */
+
+        : "+x" (left), "+y" (right),
+          [clb] "=&r" (carry), [rb] "=&r" (right_byte)
+        : [result] "r" (r), [num] "r" ((uint8_t)(num_words * 2))
+        : "r30", "r31", "cc"
+    );
+    return carry;
+}
+#define asm_add 1
+
+/* Jump-table entry N: point Z at result, then jump to subtract step N. */
+#define SUB_RJPM_TABLE(N)       \
+    "movw r30, %A[result] \n\t" \
+    "rjmp sub_%=_" #N " \n\t"
+
+/* Subtract step N: load one byte from left (X) and right (Y), subtract with
+   carry and store the difference through Z. */
+#define SUB_RJPM_DEST(N)     \
+    "sub_%=_" #N ":"         \
+    "ld %[clb], x+ \n\t"     \
+    "ld %[rb], y+ \n\t"      \
+    "sbc %[clb], %[rb] \n\t" \
+    "st z+, %[clb] \n\t"
+
+/* Unrolled multi-byte subtraction: result = left - right.
+ *
+ * result    - destination buffer; written through Z.
+ * left      - minuend; addressed through X.
+ * right     - subtrahend; addressed through Y.
+ * num_words - number of subtract steps that must execute.
+ *
+ * Returns the carry flag left by the final sbc (0 or 1), i.e. the borrow.
+ *
+ * When uECC_MAX_WORDS != uECC_MIN_WORDS, Z first holds a program address
+ * 2 * num_words below the sub_<MAX> label; the indirect jump lands on a table
+ * entry (two instructions each) which reloads Z with result and jumps to the
+ * matching subtract step. In the fixed-size build no table is emitted, so Z
+ * is loaded with result directly before falling into the subtract steps.
+ */
+uECC_VLI_API uECC_word_t uECC_vli_sub(uECC_word_t *result,
+                                      const uECC_word_t *left,
+                                      const uECC_word_t *right,
+                                      wordcount_t num_words) {
+    volatile uECC_word_t *r = result;
+    uint8_t carry;
+    uint8_t right_byte;
+
+    __asm__ volatile (
+    #if (uECC_MAX_WORDS != uECC_MIN_WORDS)
+        "ldi r30, pm_lo8(sub_%=_" STR(uECC_MAX_WORDS) ") \n\t"
+        "ldi r31, pm_hi8(sub_%=_" STR(uECC_MAX_WORDS) ") \n\t"
+        "sub r30, %[num] \n\t"
+        "sbc r31, __zero_reg__ \n\t"
+    #else
+        "movw r30, %A[result] \n\t" /* no jump table here: set Z = result */
+    #endif
+    
+        "clc \n\t"
+    #if (uECC_MAX_WORDS != uECC_MIN_WORDS)
+        IJMP
+        REPEATM(uECC_MAX_WORDS, SUB_RJPM_TABLE)
+    #endif
+    
+        REPEATM(uECC_MAX_WORDS, SUB_RJPM_DEST)
+        
+        "mov %[clb], __zero_reg__ \n\t"
+        "adc %[clb], %[clb] \n\t" /* Store carry bit. */
+
+        : "+x" (left), "+y" (right),
+          [clb] "=&r" (carry), [rb] "=&r" (right_byte)
+        : [result] "r" (r), [num] "r" ((uint8_t)(num_words * 2))
+        : "r30", "r31", "cc"
+    );
+    return carry;
+}
+#define asm_sub 1
+
+#if uECC_SUPPORTS_secp160r1
+static const struct uECC_Curve_t curve_secp160r1;
+/* Fast modular reduction for secp160r1, written as AVR inline assembly.
+ *
+ * result  - receives 20 bytes; addressed through Y.
+ * product - 40-byte input, addressed through X. Its upper bytes are reused as
+ *           scratch space and overwritten.
+ *
+ * The asm lowers the stack pointer by 24 bytes to obtain a temporary buffer
+ * (tmp) and restores it before finishing; SREG is saved and interrupts are
+ * disabled around each SP update. tmp is filled from the upper 20 bytes of
+ * product (the bit-shifted copy placed 4 bytes up, with the unshifted bytes
+ * added in). The low 20 bytes of tmp plus the low 20 bytes of product go to
+ * result; the same shift-and-add is then applied to the remaining 4 bytes of
+ * tmp and added into result. Carries out of the two additions into result
+ * are accumulated in `carry`.
+ *
+ * After the asm, curve_secp160r1.p is subtracted from result once for each
+ * accumulated carry (at most twice), and once more if uECC_vli_cmp_unsafe()
+ * reports that result is still greater than p.
+ */
+static void vli_mmod_fast_secp160r1(uECC_word_t *result, uECC_word_t *product) {
+    uint8_t carry = 0;
+    __asm__ volatile (
+        "in r30, __SP_L__ \n\t"
+    	"in r31, __SP_H__ \n\t"
+    	"sbiw r30, 24 \n\t"
+    	"in r0, __SREG__ \n\t"
+    	"cli \n\t"
+    	"out __SP_H__, r31 \n\t"
+    	"out __SREG__, r0 \n\t"
+    	"out __SP_L__, r30 \n\t"
+    	
+    	"adiw r30, 25 \n\t" /* we are shifting by 31 bits, so shift over 4 bytes
+    	                       (+ 1 since z initially points below the stack) */
+        "adiw r26, 40 \n\t" /* end of product */
+        "ld r18, -x \n\t"   /* Load word. */
+        "lsr r18 \n\t"      /* Shift. */
+        "st -z, r18 \n\t"   /* Store the first result word. */
+
+        /* Now we just do the remaining words with the carry bit (using ROR) */
+        REPEAT(19,
+            "ld r18, -x \n\t"
+            "ror r18 \n\t"
+            "st -z, r18 \n\t")
+
+        "eor r18, r18 \n\t" /* r18 = 0 */
+        "ror r18 \n\t"      /* get last bit */
+        "st -z, r18 \n\t"   /* store it */
+
+        "sbiw r30, 3 \n\t" /* move z back to point at tmp */
+        /* now we add right */
+        "ld r18, x+ \n\t"
+        "st z+, r18 \n\t" /* the first 3 bytes do not need to be added */
+        "ld r18, x+ \n\t"
+        "st z+, r18 \n\t"
+        "ld r18, x+ \n\t"
+        "st z+, r18 \n\t"
+
+        "ld r18, x+ \n\t"
+        "ld r19, z \n\t"
+        "add r18, r19 \n\t"
+        "st z+, r18 \n\t"
+
+        /* Now we just do the remaining words with the carry bit (using ADC) */
+        REPEAT(16,
+            "ld r18, x+ \n\t"
+            "ld r19, z \n\t"
+            "adc r18, r19 \n\t"
+            "st z+, r18 \n\t")
+
+        /* Propagate over the remaining bytes of result */
+        "ld r18, z \n\t"
+        "adc r18, r1 \n\t"
+        "st z+, r18 \n\t"
+
+        "ld r18, z \n\t"
+        "adc r18, r1 \n\t"
+        "st z+, r18 \n\t"
+
+        "ld r18, z \n\t"
+        "adc r18, r1 \n\t"
+        "st z+, r18 \n\t"
+
+        "ld r18, z \n\t"
+        "adc r18, r1 \n\t"
+        "st z+, r18 \n\t"
+        
+        "sbiw r30, 24 \n\t" /* move z back to point at tmp */
+        "sbiw r26, 40 \n\t" /* move x back to point at product */
+        
+        /* add low bytes of tmp to product, storing in result */
+        "ld r18, z+ \n\t"
+        "ld r19, x+ \n\t"
+        "add r18, r19 \n\t"
+        "st y+, r18 \n\t"
+        REPEAT(19,
+            "ld r18, z+ \n\t"
+            "ld r19, x+ \n\t"
+            "adc r18, r19 \n\t"
+            "st y+, r18 \n\t")
+        "adc %[carry], __zero_reg__ \n\t" /* Store carry bit (carry flag is cleared). */
+        /* at this point x is at the end of product, y is at the end of result,
+           z is 20 bytes into tmp */
+        "sbiw r28, 20 \n\t" /* move y back to point at result */
+        "adiw r30, 4 \n\t"  /* move z to point to the end of tmp */
+        
+        /* do omega_mult again with the 4 relevant bytes */
+        /* z points to the end of tmp, x points to the end of product */
+        "ld r18, -z \n\t" /* Load word. */
+        "lsr r18 \n\t"    /* Shift. */
+        "st -x, r18 \n\t" /* Store the first result word. */
+        
+        "ld r18, -z \n\t"
+        "ror r18 \n\t"
+        "st -x, r18 \n\t"
+        "ld r18, -z \n\t"
+        "ror r18 \n\t"
+        "st -x, r18 \n\t"
+        "ld r18, -z \n\t"
+        "ror r18 \n\t"
+        "st -x, r18 \n\t"
+        
+        "eor r18, r18 \n\t" /* r18 = 0 */
+        "ror r18 \n\t"      /* get last bit */
+        "st -x, r18 \n\t"   /* store it */
+        
+        "sbiw r26, 3 \n\t" /* move x back to point at beginning */
+        /* now we add a copy of the 4 bytes */
+        "ld r18, z+ \n\t"
+        "st x+, r18 \n\t" /* the first 3 bytes do not need to be added */
+        "ld r18, z+ \n\t"
+        "st x+, r18 \n\t"
+        "ld r18, z+ \n\t"
+        "st x+, r18 \n\t"
+        
+        "ld r18, z+ \n\t"
+        "ld r19, x \n\t"
+        "add r18, r19 \n\t"
+        "st x+, r18 \n\t"
+        
+        /* Propagate over the remaining bytes */
+        "ld r18, x \n\t"
+        "adc r18, r1 \n\t"
+        "st x+, r18 \n\t"
+        
+        "ld r18, x \n\t"
+        "adc r18, r1 \n\t"
+        "st x+, r18 \n\t"
+        
+        "ld r18, x \n\t"
+        "adc r18, r1 \n\t"
+        "st x+, r18 \n\t"
+        
+        "ld r18, x \n\t"
+        "adc r18, r1 \n\t"
+        "st x+, r18 \n\t"
+        
+        /* now z points to the end of tmp, x points to the end of product
+           (y still points at result) */
+        "sbiw r26, 8 \n\t" /* move x back to point at beginning of actual data */
+        /* add into result */
+        "ld r18, x+ \n\t"
+        "ld r19, y \n\t"
+        "add r18, r19 \n\t"
+        "st y+, r18 \n\t"
+        REPEAT(7,
+            "ld r18, x+ \n\t"
+            "ld r19, y \n\t"
+            "adc r18, r19 \n\t"
+            "st y+, r18 \n\t")
+        
+        /* Done adding, now propagate carry bit */
+        REPEAT(12,
+            "ld r18, y \n\t"
+            "adc r18, __zero_reg__ \n\t"
+            "st y+, r18 \n\t")
+        
+        "adc %[carry], __zero_reg__ \n\t" /* Store carry bit (carry flag is cleared). */
+        "sbiw r28, 20 \n\t" /* move y back to point at result */
+        
+        "sbiw r30, 1 \n\t" /* fix stack pointer */
+    	"in r0, __SREG__ \n\t"
+    	"cli \n\t"
+    	"out __SP_H__, r31 \n\t"
+    	"out __SREG__, r0 \n\t"
+    	"out __SP_L__, r30 \n\t"
+        
+        : "+x" (product), [carry] "+r" (carry)
+        : "y" (result)
+        : "r0", "r18", "r19", "r30", "r31", "cc", "memory"
+    );
+
+    /* Final correction: bring result back below p. */
+    if (carry > 0) {
+        --carry;
+        uECC_vli_sub(result, result, curve_secp160r1.p, 20);
+    }
+    if (carry > 0) {
+        uECC_vli_sub(result, result, curve_secp160r1.p, 20);
+    }
+    if (uECC_vli_cmp_unsafe(result, curve_secp160r1.p, 20) > 0) {
+        uECC_vli_sub(result, result, curve_secp160r1.p, 20);
+    }
+}
+#define asm_mmod_fast_secp160r1 1
+#endif /* uECC_SUPPORTS_secp160r1 */
+
+/* Fast modular reduction for secp256k1, written as AVR inline assembly.
+ *
+ * The routine multiplies the upper half of the product by the 16-bit constant
+ * 0x03D1 (held in r25:r24) and adds a copy shifted up by 32 bits, i.e. it
+ * multiplies by 2^32 + 0x3D1. That matches p = 2^256 - 2^32 - 0x3D1, the
+ * secp256k1 prime, so the guard, the curve reference, the function name and
+ * the asm_mmod_fast_* flag all use the secp256k1 names.
+ *
+ * result  - receives 32 bytes; addressed through Y.
+ * product - 64-byte input, addressed through X. Its upper bytes are reused as
+ *           scratch space and overwritten.
+ *
+ * The asm lowers the stack pointer by 37 bytes to obtain a temporary buffer
+ * (tmp) and restores it before finishing; SREG is saved and interrupts are
+ * disabled around each SP update. tmp receives the 37-byte multiple of the
+ * upper 32 bytes of product. Its low 32 bytes are added to the low 32 bytes
+ * of product into result; the remaining 5 bytes are multiplied the same way
+ * (10-byte output written into the product buffer) and added into result.
+ * Carries out of the two additions into result are accumulated in `carry`.
+ * r1 (__zero_reg__) is clobbered by mul and cleared again after each
+ * multiply phase.
+ *
+ * After the asm, curve_secp256k1.p is subtracted from result once for each
+ * accumulated carry (at most twice), and once more if uECC_vli_cmp_unsafe()
+ * reports that result is still greater than p.
+ */
+#if uECC_SUPPORTS_secp256k1
+static const struct uECC_Curve_t curve_secp256k1;
+static void vli_mmod_fast_secp256k1(uECC_word_t *result, uECC_word_t *product) {
+    uint8_t carry = 0;
+    __asm__ volatile (
+        "in r30, __SP_L__ \n\t"
+    	"in r31, __SP_H__ \n\t"
+    	"sbiw r30, 37 \n\t"
+    	"in r0, __SREG__ \n\t"
+    	"cli \n\t"
+    	"out __SP_H__, r31 \n\t"
+    	"out __SREG__, r0 \n\t"
+    	"out __SP_L__, r30 \n\t"
+    	
+    	"adiw r30, 1 \n\t"  /* add 1 since z initially points below the stack */
+        "adiw r26, 32 \n\t" /* product + uECC_WORDS */
+        "ldi r25, 0x03 \n\t"
+        "ldi r24, 0xD1 \n\t"
+        "ld r18, x+ \n\t"
+        "ld r19, x+ \n\t"
+        "ld r20, x+ \n\t"
+        "ld r21, x+ \n\t"
+        
+        "mul r24, r18 \n\t"
+        "st z+, r0 \n\t"
+        "mov r22, r1 \n\t"
+        "ldi r23, 0 \n\t"
+        
+        "mul r24, r19 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t" /* can't overflow */
+        "mul r25, r18 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t" /* can't overflow */
+        "st z+, r22 \n\t"
+        "ldi r22, 0 \n\t"
+        
+        "mul r24, r20 \n\t"
+        "add r23, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "mul r25, r19 \n\t"
+        "add r23, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "st z+, r23 \n\t"
+        "ldi r23, 0 \n\t"
+        
+        "mul r24, r21 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "mul r25, r20 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "st z+, r22 \n\t"
+        "ldi r22, 0 \n\t"
+        
+        /* now we start adding the 2^32 part as well */
+        "add r23, r18 \n\t" // 28
+        "adc r22, r22 \n\t"
+        "ld r18, x+ \n\t"
+        "mul r24, r18 \n\t"
+        "add r23, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "mul r25, r21 \n\t"
+        "add r23, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "st z+, r23 \n\t"
+        "ldi r23, 0 \n\t"
+        
+        "add r22, r19 \n\t" // 27
+        "adc r23, r23 \n\t"
+        "ld r19, x+ \n\t"
+        "mul r24, r19 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "mul r25, r18 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "st z+, r22 \n\t"
+        "ldi r22, 0 \n\t"
+        
+        REPEAT(6, // 26 - 3
+            "add r23, r20 \n\t"
+            "adc r22, r22 \n\t"
+            "ld r20, x+ \n\t"
+            "mul r24, r20 \n\t"
+            "add r23, r0 \n\t"
+            "adc r22, r1 \n\t"
+            "mul r25, r19 \n\t"
+            "add r23, r0 \n\t"
+            "adc r22, r1 \n\t"
+            "st z+, r23 \n\t"
+            "ldi r23, 0 \n\t"
+            
+            "add r22, r21 \n\t"
+            "adc r23, r23 \n\t"
+            "ld r21, x+ \n\t"
+            "mul r24, r21 \n\t"
+            "add r22, r0 \n\t"
+            "adc r23, r1 \n\t"
+            "mul r25, r20 \n\t"
+            "add r22, r0 \n\t"
+            "adc r23, r1 \n\t"
+            "st z+, r22 \n\t"
+            "ldi r22, 0 \n\t"
+            
+            "add r23, r18 \n\t"
+            "adc r22, r22 \n\t"
+            "ld r18, x+ \n\t"
+            "mul r24, r18 \n\t"
+            "add r23, r0 \n\t"
+            "adc r22, r1 \n\t"
+            "mul r25, r21 \n\t"
+            "add r23, r0 \n\t"
+            "adc r22, r1 \n\t"
+            "st z+, r23 \n\t"
+            "ldi r23, 0 \n\t"
+            
+            "add r22, r19 \n\t"
+            "adc r23, r23 \n\t"
+            "ld r19, x+ \n\t"
+            "mul r24, r19 \n\t"
+            "add r22, r0 \n\t"
+            "adc r23, r1 \n\t"
+            "mul r25, r18 \n\t"
+            "add r22, r0 \n\t"
+            "adc r23, r1 \n\t"
+            "st z+, r22 \n\t"
+            "ldi r22, 0 \n\t")
+
+        "add r23, r20 \n\t" // 2
+        "adc r22, r22 \n\t"
+        "ld r20, x+ \n\t"
+        "mul r24, r20 \n\t"
+        "add r23, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "mul r25, r19 \n\t"
+        "add r23, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "st z+, r23 \n\t"
+        "ldi r23, 0 \n\t"
+        
+        "add r22, r21 \n\t" // 1
+        "adc r23, r23 \n\t"
+        "ld r21, x+ \n\t"
+        "mul r24, r21 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "mul r25, r20 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "st z+, r22 \n\t"
+        "ldi r22, 0 \n\t"
+        
+        /* Now finish the carries etc */
+        "add r23, r18 \n\t"
+        "adc r22, r22 \n\t"
+        "mul r25, r21 \n\t"
+        "add r23, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "st z+, r23 \n\t"
+        "ldi r23, 0 \n\t"
+        
+        "add r22, r19 \n\t"
+        "adc r23, r23 \n\t"
+        "st z+, r22 \n\t"
+        "ldi r22, 0 \n\t"
+        
+        "add r23, r20 \n\t"
+        "adc r22, r22 \n\t"
+        "st z+, r23 \n\t"
+        "ldi r23, 0 \n\t"
+        
+        "add r22, r21 \n\t"
+        "adc r23, r23 \n\t"
+        "st z+, r22 \n\t"
+        "st z+, r23 \n\t"
+        "eor r1, r1 \n\t" /* make r1 be 0 again */
+        
+        "sbiw r30, 37 \n\t" /* move z back to point at tmp */
+        "subi r26, 64 \n\t" /* move x back to point at product */
+        "sbc r27, __zero_reg__ \n\t"
+        
+        /* add low bytes of tmp to product, storing in result */
+        "ld r18, z+ \n\t"
+        "ld r19, x+ \n\t"
+        "add r18, r19 \n\t"
+        "st y+, r18 \n\t"
+        REPEAT(31,
+            "ld r18, z+ \n\t"
+            "ld r19, x+ \n\t"
+            "adc r18, r19 \n\t"
+            "st y+, r18 \n\t")
+        
+        "adc %[carry], __zero_reg__ \n\t" /* Store carry bit (carry flag is cleared). */
+        /* at this point x is at the end of product, y is at the end of result,
+           z is 32 bytes into tmp */
+        "sbiw r28, 32 \n\t" /* move y back to point at result */
+
+        /* do omega_mult again with the 5 relevant bytes */
+        /* z points to tmp + uECC_WORDS, x points to the end of product */
+        "sbiw r26, 32 \n\t" /* shift x back to point into the product buffer
+                               (we can overwrite it now) */
+        "ld r18, z+ \n\t"
+        "ld r19, z+ \n\t"
+        "ld r20, z+ \n\t"
+        "ld r21, z+ \n\t"
+        
+        "mul r24, r18 \n\t"
+        "st x+, r0 \n\t"
+        "mov r22, r1 \n\t"
+        "ldi r23, 0 \n\t"
+        
+        "mul r24, r19 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t" /* can't overflow */
+        "mul r25, r18 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t" /* can't overflow */
+        "st x+, r22 \n\t"
+        "ldi r22, 0 \n\t"
+        
+        "mul r24, r20 \n\t"
+        "add r23, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "mul r25, r19 \n\t"
+        "add r23, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "st x+, r23 \n\t"
+        "ldi r23, 0 \n\t"
+        
+        "mul r24, r21 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "mul r25, r20 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "st x+, r22 \n\t"
+        "ldi r22, 0 \n\t"
+        
+        "add r23, r18 \n\t"
+        "adc r22, r22 \n\t"
+        "ld r18, z+ \n\t"
+        "mul r24, r18 \n\t"
+        "add r23, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "mul r25, r21 \n\t"
+        "add r23, r0 \n\t"
+        "adc r22, r1 \n\t"
+        "st x+, r23 \n\t"
+        "ldi r23, 0 \n\t"
+        
+        /* Now finish the carries etc */
+        "add r22, r19 \n\t"
+        "adc r23, r23 \n\t"
+        "mul r25, r18 \n\t"
+        "add r22, r0 \n\t"
+        "adc r23, r1 \n\t"
+        "st x+, r22 \n\t"
+        "ldi r22, 0 \n\t"
+        
+        "add r23, r20 \n\t"
+        "adc r22, r22 \n\t"
+        "st x+, r23 \n\t"
+        "ldi r23, 0 \n\t"
+        
+        "add r22, r21 \n\t"
+        "adc r23, r23 \n\t"
+        "st x+, r22 \n\t"
+        "ldi r22, 0 \n\t"
+        
+        "add r23, r18 \n\t"
+        "adc r22, r22 \n\t"
+        "st x+, r23 \n\t"
+        "st x+, r22 \n\t"
+        "eor r1, r1 \n\t" /* make r1 be 0 again */
+        
+        /* now z points to the end of tmp, x points to the end of product
+           (y still points at result) */
+        "sbiw r26, 10 \n\t" /* move x back to point at beginning of actual data */
+        /* add into result */
+        "ld r18, x+ \n\t"
+        "ld r19, y \n\t"
+        "add r18, r19 \n\t"
+        "st y+, r18 \n\t"
+        REPEAT(9,
+            "ld r18, x+ \n\t"
+            "ld r19, y \n\t"
+            "adc r18, r19 \n\t"
+            "st y+, r18 \n\t")
+        
+        /* Done adding, now propagate carry bit */
+        REPEAT(22,
+            "ld r18, y \n\t"
+            "adc r18, __zero_reg__ \n\t"
+            "st y+, r18 \n\t")
+        
+        "adc %[carry], __zero_reg__ \n\t"    /* Store carry bit (carry flag is cleared). */
+        "sbiw r28, 32 \n\t" /* move y back to point at result */
+        
+        "sbiw r30, 1 \n\t" /* fix stack pointer */
+    	"in r0, __SREG__ \n\t"
+    	"cli \n\t"
+    	"out __SP_H__, r31 \n\t"
+    	"out __SREG__, r0 \n\t"
+    	"out __SP_L__, r30 \n\t"
+        
+        : "+x" (product), [carry] "+r" (carry)
+        : "y" (result)
+        : "r0", "r18", "r19", "r20", "r21", "r22", "r23", "r24", "r25", "r30", "r31", "cc", "memory"
+    );
+    
+    /* Final correction: bring result back below p. */
+    if (carry > 0) {
+        --carry;
+        uECC_vli_sub(result, result, curve_secp256k1.p, 32);
+    }
+    if (carry > 0) {
+        uECC_vli_sub(result, result, curve_secp256k1.p, 32);
+    }
+    if (uECC_vli_cmp_unsafe(result, curve_secp256k1.p, 32) > 0) {
+        uECC_vli_sub(result, result, curve_secp256k1.p, 32);
+    }
+}
+#define asm_mmod_fast_secp256k1 1
+#endif /* uECC_SUPPORTS_secp256k1 */
+
+#endif /* (uECC_OPTIMIZATION_LEVEL >= 2) */
+
+#if !asm_add
+/* Looping multi-byte addition, compiled only when no unrolled uECC_vli_add
+ * was defined above (asm_add unset): result = left + right.
+ *
+ * result    - destination buffer; written through Z.
+ * left      - first operand; read through X.
+ * right     - second operand; read through Y.
+ * num_words - loop count; must be non-zero, since the loop body runs before
+ *             the counter is tested.
+ *
+ * Returns the carry out of the last byte (0 or 1).
+ */
+uECC_VLI_API uECC_word_t uECC_vli_add(uECC_word_t *result,
+                                      const uECC_word_t *left,
+                                      const uECC_word_t *right,
+                                      wordcount_t num_words) {
+    volatile uECC_word_t *r = result;
+    uint8_t carry = 0;
+    uint8_t left_byte;
+    uint8_t right_byte;
+
+    __asm__ volatile (
+        "clc \n\t"
+        
+        "1: \n\t"
+        "ld %[left], x+ \n\t"  /* Load left byte. */
+        "ld %[right], y+ \n\t" /* Load right byte. */
+        "adc %[left], %[right] \n\t" /* Add. */
+        "st z+, %[left] \n\t"  /* Store the result. */
+        "dec %[i] \n\t"
+        "brne 1b \n\t"
+        
+        "adc %[carry], %[carry] \n\t" /* Store carry bit. */
+
+        : "+z" (r), "+x" (left), "+y" (right), [i] "+r" (num_words),
+            [carry] "+r" (carry), [left] "=&r" (left_byte), [right] "=&r" (right_byte)
+        : 
+        : "cc"
+    );
+    return carry;
+}
+#define asm_add 1
+#endif
+
+#if !asm_sub
+/* Looping multi-byte subtraction, compiled only when no unrolled uECC_vli_sub
+ * was defined above (asm_sub unset): result = left - right.
+ *
+ * result    - destination buffer; written through Z.
+ * left      - minuend; read through X.
+ * right     - subtrahend; read through Y.
+ * num_words - loop count; must be non-zero, since the loop body runs before
+ *             the counter is tested.
+ *
+ * Returns the carry flag left by the last sbc (0 or 1), i.e. the borrow.
+ */
+uECC_VLI_API uECC_word_t uECC_vli_sub(uECC_word_t *result,
+                                      const uECC_word_t *left,
+                                      const uECC_word_t *right,
+                                      wordcount_t num_words) {
+    volatile uECC_word_t *r = result;
+    uint8_t borrow = 0;
+    uint8_t left_byte;
+    uint8_t right_byte;
+
+    __asm__ volatile (
+        "clc \n\t"
+        
+        "1: \n\t"
+        "ld %[left], x+ \n\t"  /* Load left byte. */
+        "ld %[right], y+ \n\t" /* Load right byte. */
+        "sbc %[left], %[right] \n\t" /* Subtract. */
+        "st z+, %[left] \n\t"  /* Store the result. */
+        "dec %[i] \n\t"
+        "brne 1b \n\t"
+        
+        "adc %[borrow], %[borrow] \n\t" /* Store carry bit in borrow. */
+
+        /* The loop counter operand is num_words itself (counted down in
+           place), matching the looping uECC_vli_add. */
+        : "+z" (r), "+x" (left), "+y" (right), [i] "+r" (num_words),
+            [borrow] "+r" (borrow), [left] "=&r" (left_byte), [right] "=&r" (right_byte)
+        :
+        : "cc"
+    );
+    return borrow;
+}
+#define asm_sub 1
+#endif
+
+#if !asm_mult
+/* Column-wise multiplication: result = left * right.
+ *
+ * result    - destination buffer; receives 2 * num_words bytes through Z.
+ * left      - first operand; read forwards through X.
+ * right     - second operand; read backwards through Y.
+ * num_words - operand length.
+ *
+ * For every output column the partial products are accumulated into the
+ * three-byte accumulator r2:r1:r0 (the C locals, not the hardware registers);
+ * the low byte is then stored and the accumulator shifted down by one byte.
+ * The first outer loop handles columns with k = 1 .. num_words - 1 products,
+ * the second handles k = num_words .. 1. The hardware r1 is clobbered by mul
+ * and cleared again before the asm ends.
+ *
+ * k and i are initialised with ldi, which only accepts r16-r31, so they use
+ * the "d" constraint rather than the unrestricted "r".
+ */
+__attribute((noinline))
+uECC_VLI_API void uECC_vli_mult(uECC_word_t *result,
+                                const uECC_word_t *left,
+                                const uECC_word_t *right,
+                                wordcount_t num_words) {
+    volatile uECC_word_t *r = result;
+    uint8_t r0 = 0;
+    uint8_t r1 = 0;
+    uint8_t r2 = 0;
+    uint8_t zero = 0;
+    uint8_t k, i;
+    
+    __asm__ volatile (
+        "ldi %[k], 1 \n\t" /* k = 1; k < num_words; ++k */
+        
+        "1: \n\t"
+        "ldi %[i], 0 \n\t"  /* i = 0; i < k; ++i */
+        
+        "add r28, %[k] \n\t" /* pre-add right ptr */
+        "adc r29, %[zero] \n\t"
+        
+        "2: \n\t"
+        "ld r0, x+ \n\t"
+        "ld r1, -y \n\t"
+        "mul r0, r1 \n\t"
+        
+        "add %[r0], r0 \n\t"
+        "adc %[r1], r1 \n\t"
+        "adc %[r2], %[zero] \n\t"
+        
+        "inc %[i] \n\t"
+        "cp %[i], %[k] \n\t"
+        "brlo 2b \n\t" /* loop if i < k */
+        
+        "sub r26, %[k] \n\t" /* fix up left ptr */
+        "sbc r27, %[zero] \n\t"
+        
+        "st z+, %[r0] \n\t"  /* Store the result. */
+        "mov %[r0], %[r1] \n\t"
+        "mov %[r1], %[r2] \n\t"
+        "mov %[r2], %[zero] \n\t"
+        
+        "inc %[k] \n\t"
+        "cp %[k], %[num] \n\t"
+        "brlo 1b \n\t" /* loop if k < num_words */
+        
+        /* second half */
+        "mov %[k], %[num] \n\t" /* k = num_words; k > 0; --k */
+        "add r28, %[num] \n\t" /* move right ptr to point at the end of right */
+        "adc r29, %[zero] \n\t"
+        
+        "1: \n\t"
+        "ldi %[i], 0 \n\t" /* i = 0; i < k; ++i */
+        
+        "2: \n\t"
+        "ld r0, x+ \n\t"
+        "ld r1, -y \n\t"
+        "mul r0, r1 \n\t"
+        
+        "add %[r0], r0 \n\t"
+        "adc %[r1], r1 \n\t"
+        "adc %[r2], %[zero] \n\t"
+        
+        "inc %[i] \n\t"
+        "cp %[i], %[k] \n\t"
+        "brlo 2b \n\t" /* loop if i < k */
+        
+        "add r28, %[k] \n\t" /* fix up right ptr */
+        "adc r29, %[zero] \n\t"
+        
+        "st z+, %[r0] \n\t"  /* Store the result. */
+        "mov %[r0], %[r1] \n\t"
+        "mov %[r1], %[r2] \n\t"
+        "mov %[r2], %[zero] \n\t"
+        
+        "dec %[k] \n\t"
+        "sub r26, %[k] \n\t" /* fix up left ptr (after k is decremented, so next time
+                                we start 1 higher) */
+        "sbc r27, %[zero] \n\t"
+        
+        "cp %[k], %[zero] \n\t"
+        "brne 1b \n\t" /* loop if k > 0 */
+        
+        "st z+, %[r0] \n\t"  /* Store last result byte. */
+        "eor r1, r1 \n\t" /* fix r1 to be 0 again */
+    
+        : "+z" (r), "+x" (left), "+y" (right),
+          [r0] "+r" (r0), [r1] "+r" (r1), [r2] "+r" (r2),
+          [zero] "+r" (zero), [num] "+r" (num_words),
+          [k] "=&d" (k), [i] "=&d" (i)
+        : 
+        : "r0", "cc"
+    );
+}
+#define asm_mult 1
+#endif
+
+#if (uECC_SQUARE_FUNC && !asm_square)
+/* Column-wise squaring: result = left * left.
+ *
+ * result    - destination buffer; receives 2 * num_words bytes.
+ * left      - operand; copied into both X (walking up) and Z (walking down)
+ *             for every output column.
+ * num_words - operand length.
+ *
+ * For each column k the two pointers move towards each other. Products of two
+ * different bytes are accumulated twice into the three-byte accumulator
+ * r2:r1:r0 (the C locals); the product at the meeting point, if any, is
+ * accumulated once. Because Z doubles as the 'right' pointer, the output
+ * position is kept in the [result] operand and moved into Z only for the
+ * store. The hardware r1 is clobbered by mul and cleared again before the asm
+ * ends.
+ */
+uECC_VLI_API void uECC_vli_square(uECC_word_t *result,
+                                  const uECC_word_t *left,
+                                  wordcount_t num_words) {
+    volatile uECC_word_t *r = result;
+    uint8_t r0 = 0;
+    uint8_t r1 = 0;
+    uint8_t r2 = 0;
+    uint8_t zero = 0;
+    uint8_t k;
+    
+    __asm__ volatile (
+        "ldi %[k], 1 \n\t" /* k = 1; k < num_words * 2; ++k */
+        
+        "1: \n\t"
+        
+        "movw r26, %[orig] \n\t"  /* copy orig ptr to 'left' ptr */
+        "movw r30, %[orig] \n\t"  /* copy orig ptr to 'right' ptr */
+        "cp %[k], %[num] \n\t"
+        "brlo 2f \n\t"
+        "breq 2f \n\t"
+        
+        /* when k > num_words, we start from (k - num_words) on the 'left' ptr */
+        "add r26, %[k] \n\t"
+        "adc r27, %[zero] \n\t"
+        "sub r26, %[num] \n\t"
+        "sbc r27, %[zero] \n\t"
+        "add r30, %[num] \n\t" /* move right ptr to point at the end */
+        "adc r31, %[zero] \n\t"
+        "rjmp 3f \n\t"
+        
+        "2: \n\t" /* when k <= num_words, we add k to the 'right' ptr */
+        "add r30, %[k] \n\t" /* pre-add 'right' ptr */
+        "adc r31, %[zero] \n\t"
+        
+        "3: \n\t"
+        "ld r0, x+ \n\t"
+        "cp r26, r30 \n\t" /* if left == right here, then we are done after this mult
+                              (and we don't need to double) */
+        "breq 4f \n\t"
+        "ld r1, -z \n\t"
+        "mul r0, r1 \n\t"
+        
+        /* add twice since it costs the same as doubling */
+        "add %[r0], r0 \n\t"
+        "adc %[r1], r1 \n\t"
+        "adc %[r2], %[zero] \n\t"
+        "add %[r0], r0 \n\t"
+        "adc %[r1], r1 \n\t"
+        "adc %[r2], %[zero] \n\t"
+        
+        "cpse r26, r30 \n\t" /* if left == right here, then we are done */
+        "rjmp 3b \n\t"
+        "rjmp 5f \n\t" /* skip code for non-doubled mult */
+        
+        "4: \n\t"
+        "ld r1, -z \n\t"
+        "mul r0, r1 \n\t"
+        "add %[r0], r0 \n\t"
+        "adc %[r1], r1 \n\t"
+        "adc %[r2], %[zero] \n\t"
+        
+        "5: \n\t"
+        "movw r30, %[result] \n\t" /* make z point to result */
+        "st z+, %[r0] \n\t"        /* Store the result. */
+        "movw %[result], r30 \n\t" /* update result ptr*/
+        "mov %[r0], %[r1] \n\t"
+        "mov %[r1], %[r2] \n\t"
+        "mov %[r2], %[zero] \n\t"
+        
+        "inc %[k] \n\t"
+        "cp %[k], %[max] \n\t"
+        "brlo 1b \n\t" /* loop if k < num_words * 2 */
+        
+        "movw r30, %[result] \n\t"  /* make z point to result */
+        "st z+, %[r0] \n\t"  /* Store last result byte. */
+        "eor r1, r1 \n\t" /* fix r1 to be 0 again */
+    
+        : [result] "+r" (r),
+          [r0] "+r" (r0), [r1] "+r" (r1), [r2] "+r" (r2), [zero] "+r" (zero),
+          [k] "=&a" (k)
+        : [orig] "r" (left), [max] "r" ((uint8_t)(2 * num_words)),
+          [num] "r" (num_words)
+        : "r0", "r26", "r27", "r30", "r31", "cc"
+    );
+}
+#define asm_square 1
+#endif /* uECC_SQUARE_FUNC && !asm_square */
+
+#endif /* _UECC_ASM_AVR_H_ */
--- a/stm32/mk4-bootloader/micro-ecc/asm_avr_mult_square.inc
+++ b/stm32/mk4-bootloader/micro-ecc/asm_avr_mult_square.inc
--- a/stm32/mk4-bootloader/micro-ecc/curve-specific.inc
+++ b/stm32/mk4-bootloader/micro-ecc/curve-specific.inc
--- a/stm32/mk4-bootloader/micro-ecc/emk_project.py
+++ b/stm32/mk4-bootloader/micro-ecc/emk_project.py
@ -0,0 +1,127 @@
+import os
+
+c, link, asm, utils = emk.module("c", "link", "asm", "utils")
+
+# Flag tables consumed by the configuration code at the bottom of this file.
+# Warning/visibility flags added to c.flags for every non-clean build.
+default_compile_flags = ["-fvisibility=hidden", "-Wall", "-Wextra", "-Wshadow", "-Werror", "-Wno-missing-field-initializers", "-Wno-unused-parameter", \
+    "-Wno-comment", "-Wno-unused", "-Wno-unknown-pragmas"]
+# Added to link.local_flags for every non-clean build (currently nothing).
+default_link_flags = []
+# Compile flags per optimization level; the keys are the accepted values of the "opt" option.
+opt_flags = {"dbg":["-g"], "std":["-O2"], "max":["-O3"], "small":["-Os"]}
+# Link flags per optimization level; same keys as opt_flags.
+opt_link_flags = {"dbg":[], "std":[], "max":[], "small":[]}
+# Language-specific compile flags (added to c.c.flags and c.cxx.flags respectively).
+c_flags = ["-std=c99"]
+cxx_flags = ["-std=c++11", "-Wno-reorder", "-fno-rtti", "-fno-exceptions"]
+# Language-specific link flags (added to link.c.local_flags and link.cxx.local_flags).
+c_link_flags = []
+cxx_link_flags = ["-fno-rtti", "-fno-exceptions"]
+
+def setup_build_dir():
+    build_arch = None
+    if "arch" in emk.options:
+        build_arch = emk.options["arch"]
+    elif not emk.cleaning:
+        build_arch = "osx"
+    emk.options["arch"] = build_arch
+
+    opt_level = None
+    if "opt" in emk.options:
+        level = emk.options["opt"]
+        if level in opt_flags:
+            opt_level = level
+        else:
+            emk.log.warning("Unknown optimization level '%s'" % (level))
+    elif not emk.cleaning:
+        opt_level = "dbg"
+    emk.options["opt"] = opt_level
+
+    dirs = ["__build__"]
+    if build_arch:
+        dirs.append(build_arch)
+    if opt_level:
+        dirs.append(opt_level)
+    emk.build_dir = os.path.join(*dirs)
+
+def setup_osx():
+    global c
+    global link
+
+    flags = [("-arch", "x86_64"), "-fno-common", "-Wnewline-eof"]
+    c.flags.extend(flags)
+    c.cxx.flags += ["-stdlib=libc++"]
+    link.cxx.flags += ["-stdlib=libc++"]
+
+    link_flags = [("-arch", "x86_64")]
+    link.local_flags.extend(link_flags)
+
+def setup_avr():
+    global c
+    global link
+
+    c.compiler = c.GccCompiler("/Projects/avr-tools/bin/avr-")
+    c.flags += ["-mmcu=atmega256rfr2", "-ffunction-sections", "-fdata-sections"]
+    link.linker = link.GccLinker("/Projects/avr-tools/bin/avr-")
+    link.flags += ["-mmcu=atmega256rfr2", "-mrelax", "-Wl,--gc-sections"]
+    link.strip = True
+
+def setup_arm_thumb():
+    """Configure the asm, c and link modules for the "arm_thumb" target.
+
+    Uses the arm-none-eabi- GCC toolchain with Cortex-M0/Thumb flags, defines
+    LPC11XX for C code, links against the LPC11xx core directory (flash.lds
+    linker script and board_cstartup.o), and registers a hook that produces a
+    raw ".bin" image next to each executable via objcopy.
+    """
+    global c
+    global link
+    global asm
+    global utils
+
+    # Assembler, compiler and linker all come from the same cross toolchain.
+    asm.assembler = asm.GccAssembler("/cross/arm_cortex/bin/arm-none-eabi-")
+    c.compiler = c.GccCompiler("/cross/arm_cortex/bin/arm-none-eabi-")
+    link.linker = link.GccLinker("/cross/arm_cortex/bin/arm-none-eabi-")
+
+    c.flags.extend(["-mcpu=cortex-m0", "-mthumb", "-ffunction-sections", "-fdata-sections", "-fno-builtin-fprintf", "-fno-builtin-printf"])
+    c.defines["LPC11XX"] = 1
+    
+    # Freestanding link: no default startup files or standard library; the
+    # startup object is taken from the core project's build directory.
+    link.local_flags.extend(["-mcpu=cortex-m0", "-mthumb", "-nostartfiles", "-nostdlib", "-Wl,--gc-sections"])
+    link.local_flags.extend(["-Tflash.lds", "-L/Projects/lpc11xx/core", "/Projects/lpc11xx/core/" + emk.build_dir + "/board_cstartup.o"])
+    link.local_syslibs += ["gcc"]
+    link.depdirs += ["/Projects/lpc11xx/stdlib"]
+
+    # Rule body: convert the linked executable (requires[0]) to a raw binary (produces[0]).
+    def do_objcopy(produces, requires):
+        utils.call("/cross/arm_cortex/bin/arm-none-eabi-objcopy", "-O", "binary", requires[0], produces[0])
+
+    # Per-executable hook: make the executable depend on the startup object
+    # and schedule creation of "<path>.bin". emk.build_dir is read when the
+    # hook runs, not when it is registered.
+    def handle_exe(path):
+        emk.depend(path, "/Projects/lpc11xx/core/" + emk.build_dir + "/board_cstartup.o")
+        emk.rule(do_objcopy, path + ".bin", path, cwd_safe=True, ex_safe=True)
+        emk.autobuild(path + ".bin")
+
+    # NOTE(review): presumably emk calls each entry of exe_funcs with the path
+    # of every linked executable -- confirm against the emk link module.
+    link.exe_funcs.append(handle_exe)
+    link.strip = True
+    
+    # Pull the core project into the build so board_cstartup.o gets built.
+    emk.recurse("/Projects/lpc11xx/core")
+
+def setup_linux_rpi():
+    global c
+    global link
+
+    c.compiler = c.GccCompiler("/Volumes/xtools/arm-none-linux-gnueabi/bin/arm-none-linux-gnueabi-")
+    link.linker = link.GccLinker("/Volumes/xtools/arm-none-linux-gnueabi/bin/arm-none-linux-gnueabi-")
+    
+    c.flags.extend(["-fomit-frame-pointer"])
+
+# Resolve the arch/opt options and emk.build_dir before any target setup runs.
+setup_build_dir()
+
+# Maps each supported "arch" option value to the function that configures it.
+setup_funcs = {"osx":setup_osx, "avr":setup_avr, "arm_thumb":setup_arm_thumb, "rpi": setup_linux_rpi}
+
+# Compiler/linker configuration is skipped entirely when cleaning.
+if not emk.cleaning:
+    build_arch = emk.options["arch"]
+    opt_level = emk.options["opt"]
+
+    # Common flags first, then the flags for the selected optimization level
+    # (opt_level must be one of the keys of opt_flags / opt_link_flags).
+    c.flags.extend(default_compile_flags)
+    c.flags.extend(opt_flags[opt_level])
+    c.c.flags.extend(c_flags)
+    c.cxx.flags.extend(cxx_flags)
+    link.local_flags.extend(default_link_flags)
+    link.local_flags.extend(opt_link_flags[opt_level])
+    link.c.local_flags.extend(c_link_flags)
+    link.cxx.local_flags.extend(cxx_link_flags)
+
+    # NOTE(review): "$:proj:$" looks like an emk placeholder for the project
+    # directory -- confirm against the emk documentation.
+    c.include_dirs.append("$:proj:$")
+
+    # Apply the target-specific configuration, or fail for an unsupported arch.
+    if build_arch in setup_funcs:
+        setup_funcs[build_arch]()
+    else:
+        raise emk.BuildError("Unknown target arch '%s'" % (build_arch))
+
+    # Expose the selected target to C code as TARGET_ARCH_<ARCH> = 1.
+    c.defines["TARGET_ARCH_" + build_arch.upper()] = 1
--- a/stm32/mk4-bootloader/micro-ecc/emk_rules.py
+++ b/stm32/mk4-bootloader/micro-ecc/emk_rules.py
@ -0,0 +1,3 @@
+# Fetch the c and link modules (neither is used further in this file).
+c, link = emk.module("c", "link")
+
+# NOTE(review): presumably adds the "test" subdirectory to the build -- confirm
+# against the emk documentation.
+emk.subdir("test")
--- a/stm32/mk4-bootloader/micro-ecc/platform-specific.inc
+++ b/stm32/mk4-bootloader/micro-ecc/platform-specific.inc
@ -0,0 +1,67 @@
+/* Copyright 2015, Kenneth MacKay. Licensed under the BSD 2-clause license. */
+
+#ifndef _UECC_PLATFORM_SPECIFIC_H_
+#define _UECC_PLATFORM_SPECIFIC_H_
+
+#include "types.h"
+
+#if (defined(_WIN32) || defined(_WIN64))
+/* Windows */
+
+#define WIN32_LEAN_AND_MEAN
+#include <windows.h>
+#include <wincrypt.h>
+
+/* Default RNG for Windows: fills 'dest' with 'size' random bytes obtained
+   from a CryptoAPI provider.
+   Returns 1 if the buffer was filled, 0 if the provider could not be
+   acquired or the random data could not be generated. */
+static int default_RNG(uint8_t *dest, unsigned size) {
+    HCRYPTPROV prov;
+    if (!CryptAcquireContext(&prov, NULL, NULL, PROV_RSA_FULL, CRYPT_VERIFYCONTEXT)) {
+        return 0;
+    }
+
+    /* CryptGenRandom reports failure through its return value; never claim
+       success for a buffer that was not actually filled with random data. */
+    int filled = CryptGenRandom(prov, size, (BYTE *)dest) ? 1 : 0;
+    CryptReleaseContext(prov, 0);
+    return filled;
+}
+#define default_RNG_defined 1
+
+#elif defined(unix) || defined(__linux__) || defined(__unix__) || defined(__unix) || \
+    (defined(__APPLE__) && defined(__MACH__)) || defined(uECC_POSIX)
+
+/* Some POSIX-like system with /dev/urandom or /dev/random. */
+#include <sys/types.h>
+#include <fcntl.h>
+#include <unistd.h>
+
+#ifndef O_CLOEXEC
+    #define O_CLOEXEC 0
+#endif
+
+/* Default RNG for POSIX-like systems: fills 'dest' with 'size' bytes read
+   from /dev/urandom, falling back to /dev/random if the former cannot be
+   opened. Returns 1 on success, 0 if neither device could be opened or a
+   read returned an error or end-of-file before the buffer was full. */
+static int default_RNG(uint8_t *dest, unsigned size) {
+    int fd = open("/dev/urandom", O_RDONLY | O_CLOEXEC);
+    if (fd == -1) {
+        fd = open("/dev/random", O_RDONLY | O_CLOEXEC);
+    }
+    if (fd == -1) {
+        return 0;
+    }
+
+    char *cursor = (char *)dest;
+    size_t remaining = size;
+    int ok = 1;
+    /* read() may return fewer bytes than requested, so keep going until the
+       whole buffer has been filled. */
+    while (remaining > 0) {
+        ssize_t got = read(fd, cursor, remaining);
+        if (got <= 0) {
+            ok = 0;
+            break;
+        }
+        cursor += got;
+        remaining -= (size_t)got;
+    }
+
+    close(fd);
+    return ok;
+}
+#define default_RNG_defined 1
+
+#endif /* platform */
+
+#endif /* _UECC_PLATFORM_SPECIFIC_H_ */
--- a/stm32/mk4-bootloader/micro-ecc/types.h
+++ b/stm32/mk4-bootloader/micro-ecc/types.h
@ -0,0 +1,105 @@
+/* Copyright 2015, Kenneth MacKay. Licensed under the BSD 2-clause license. */
+
+#ifndef _UECC_TYPES_H_
+#define _UECC_TYPES_H_
+
+// PDG was here
+/* Local modification: __AVR__ is forced to 0, so the `#if __AVR__` test below
+   is always false and the AVR platform is never auto-selected.
+   NOTE(review): presumably added to avoid testing an undefined macro; it
+   would clash with a compiler that predefines __AVR__ -- confirm. */
+#define __AVR__ 0
+
+/* Guess uECC_PLATFORM from compiler-predefined macros unless the build has
+   already chosen one. The uECC_* platform constants used here are not defined
+   in this header (uECC.h #defines them), so uECC.h has to be included first;
+   otherwise they all evaluate to 0 inside later #if comparisons. */
+#ifndef uECC_PLATFORM
+    #if __AVR__
+        #define uECC_PLATFORM uECC_avr
+    #elif defined(__thumb2__) || defined(_M_ARMT) /* I think MSVC only supports Thumb-2 targets */
+        #define uECC_PLATFORM uECC_arm_thumb2
+    #elif defined(__thumb__)
+        #define uECC_PLATFORM uECC_arm_thumb
+    #elif defined(__arm__) || defined(_M_ARM)
+        #define uECC_PLATFORM uECC_arm
+    #elif defined(__aarch64__)
+        #define uECC_PLATFORM uECC_arm64
+    #elif defined(__i386__) || defined(_M_IX86) || defined(_X86_) || defined(__I86__)
+        #define uECC_PLATFORM uECC_x86
+    #elif defined(__amd64__) || defined(_M_X64)
+        #define uECC_PLATFORM uECC_x86_64
+    #else
+        #define uECC_PLATFORM uECC_arch_other
+    #endif
+#endif
+
+/* Default word size in bytes when the build did not choose one:
+   1 for AVR, 8 for x86_64 and arm64, 4 for everything else. */
+#ifndef uECC_WORD_SIZE
+    #if uECC_PLATFORM == uECC_avr
+        #define uECC_WORD_SIZE 1
+    #elif (uECC_PLATFORM == uECC_x86_64 || uECC_PLATFORM == uECC_arm64)
+        #define uECC_WORD_SIZE 8
+    #else
+        #define uECC_WORD_SIZE 4
+    #endif
+#endif
+
+/* Only 1-, 4- and 8-byte words are supported. */
+#if (uECC_WORD_SIZE != 1) && (uECC_WORD_SIZE != 4) && (uECC_WORD_SIZE != 8)
+    #error "Unsupported value for uECC_WORD_SIZE"
+#endif
+
+/* AVR only works with 1-byte words: override a conflicting user setting and
+   report it with a compile-time message. */
+#if ((uECC_PLATFORM == uECC_avr) && (uECC_WORD_SIZE != 1))
+    #pragma message ("uECC_WORD_SIZE must be 1 for AVR")
+    #undef uECC_WORD_SIZE
+    #define uECC_WORD_SIZE 1
+#endif
+
+/* The 32-bit ARM platforms (arm, thumb, thumb2) only work with 4-byte words:
+   override a conflicting user setting and report it. */
+#if ((uECC_PLATFORM == uECC_arm || uECC_PLATFORM == uECC_arm_thumb || \
+        uECC_PLATFORM ==  uECC_arm_thumb2) && \
+     (uECC_WORD_SIZE != 4))
+    #pragma message ("uECC_WORD_SIZE must be 4 for ARM")
+    #undef uECC_WORD_SIZE
+    #define uECC_WORD_SIZE 4
+#endif
+
+/* SUPPORTS_INT128: 1 if a 128-bit integer type is assumed to be available.
+   As written, this is always 0 when __clang_major__ is not defined (i.e. for
+   any non-clang compiler, even one that defines __SIZEOF_INT128__). With
+   clang it is 1 if __SIZEOF_INT128__ is defined or the version is >= 3.2. */
+#ifndef __clang_major__
+    #define SUPPORTS_INT128 0
+#else
+#if defined(__SIZEOF_INT128__) || ((__clang_major__ * 100 + __clang_minor__) >= 302)
+    #define SUPPORTS_INT128 1
+#else
+    #define SUPPORTS_INT128 0
+#endif
+#endif
+
+/* Small signed helper types used by the uECC_vli_* prototypes for word
+   counts, bit counts/indices and comparison results. */
+typedef int8_t wordcount_t;
+typedef int16_t bitcount_t;
+typedef int8_t cmpresult_t;
+
+/* Per word size: the native word type, a double-width type, and constants
+   derived from the word width:
+     HIGH_BIT_SET         - word with only the most significant bit set
+     uECC_WORD_BITS       - bits per word
+     uECC_WORD_BITS_SHIFT - log2(uECC_WORD_BITS)
+     uECC_WORD_BITS_MASK  - uECC_WORD_BITS - 1 */
+#if (uECC_WORD_SIZE == 1)
+
+typedef uint8_t uECC_word_t;
+typedef uint16_t uECC_dword_t;
+
+#define HIGH_BIT_SET 0x80
+#define uECC_WORD_BITS 8
+#define uECC_WORD_BITS_SHIFT 3
+#define uECC_WORD_BITS_MASK 0x07
+
+#elif (uECC_WORD_SIZE == 4)
+
+typedef uint32_t uECC_word_t;
+typedef uint64_t uECC_dword_t;
+
+#define HIGH_BIT_SET 0x80000000
+#define uECC_WORD_BITS 32
+#define uECC_WORD_BITS_SHIFT 5
+#define uECC_WORD_BITS_MASK 0x01F
+
+#elif (uECC_WORD_SIZE == 8)
+
+typedef uint64_t uECC_word_t;
+/* uECC_dword_t is only declared for 8-byte words when 128-bit integers are
+   available; otherwise the type does not exist. */
+#if SUPPORTS_INT128
+typedef unsigned __int128 uECC_dword_t;
+#endif
+
+#define HIGH_BIT_SET 0x8000000000000000ull
+#define uECC_WORD_BITS 64
+#define uECC_WORD_BITS_SHIFT 6
+#define uECC_WORD_BITS_MASK 0x03F
+
+#endif /* uECC_WORD_SIZE */
+
+#endif /* _UECC_TYPES_H_ */
--- a/stm32/mk4-bootloader/micro-ecc/uECC.c
+++ b/stm32/mk4-bootloader/micro-ecc/uECC.c
--- a/stm32/mk4-bootloader/micro-ecc/uECC.h
+++ b/stm32/mk4-bootloader/micro-ecc/uECC.h
@ -0,0 +1,351 @@
+/* Copyright 2014, Kenneth MacKay. Licensed under the BSD 2-clause license. */
+
+#ifndef _UECC_H_
+#define _UECC_H_
+
+#include <stdint.h>
+
+// PDG: Lint issues
+/* Local additions: give these feature-test macros an explicit value of 0.
+   NOTE(review): presumably so that #if tests on them never see an undefined
+   macro. platform-specific.inc defines default_RNG_defined as 1 on
+   Windows/POSIX hosts, which would redefine the value set here -- confirm
+   this header is only used for the embedded target. */
+#define asm_rshift1		0
+#define asm_clear		0
+#define asm_set			0
+#define default_RNG_defined			0
+
+/* Fixed configuration for this build. These definitions pre-empt the #ifndef
+   defaults further down: the platform is pinned to uECC_arm_thumb (the
+   constant itself is #defined below, which works because macros are expanded
+   at the point of use), and only secp256r1 and secp256k1 are enabled. */
+#define uECC_PLATFORM uECC_arm_thumb
+#define uECC_SUPPORTS_secp160r1 0
+#define uECC_SUPPORTS_secp192r1 0
+#define uECC_SUPPORTS_secp224r1 0
+#define uECC_SUPPORTS_secp256r1 1
+#define uECC_SUPPORTS_secp256k1 1
+#define uECC_SUPPORT_COMPRESSED_POINT 1
+
+// we need to sign stuff
+/* Ensures the declarations guarded by `#ifndef NO_SIGNING` below are visible
+   even if the build system defined NO_SIGNING. */
+#undef NO_SIGNING
+
+/* Platform selection options.
+If uECC_PLATFORM is not defined, the code will try to guess it based on compiler macros.
+Possible values for uECC_PLATFORM are defined below: */
+#define uECC_arch_other 0
+#define uECC_x86        1
+#define uECC_x86_64     2
+#define uECC_arm        3
+#define uECC_arm_thumb  4
+#define uECC_arm_thumb2 5
+#define uECC_arm64      6
+#define uECC_avr        7
+
+/* If desired, you can define uECC_WORD_SIZE as appropriate for your platform (1, 4, or 8 bytes).
+If uECC_WORD_SIZE is not explicitly defined then it will be automatically set based on your
+platform. */
+
+/* Optimization level; trade speed for code size.
+   Larger values produce code that is faster but larger.
+   Currently supported values are 0 - 3; 0 is unusably slow for most applications. */
+#ifndef uECC_OPTIMIZATION_LEVEL
+    #define uECC_OPTIMIZATION_LEVEL 2
+#endif
+
+/* uECC_SQUARE_FUNC - If enabled (defined as nonzero), this will cause a specific function to be
+used for (scalar) squaring instead of the generic multiplication function. This can make things
+somewhat faster, but increases the code size. */
+#ifndef uECC_SQUARE_FUNC
+    #define uECC_SQUARE_FUNC 0
+#endif
+
+/* Curve support selection. Set to 0 to remove that curve. */
+#ifndef uECC_SUPPORTS_secp160r1
+    #define uECC_SUPPORTS_secp160r1 1
+#endif
+#ifndef uECC_SUPPORTS_secp192r1
+    #define uECC_SUPPORTS_secp192r1 1
+#endif
+#ifndef uECC_SUPPORTS_secp224r1
+    #define uECC_SUPPORTS_secp224r1 1
+#endif
+#ifndef uECC_SUPPORTS_secp256r1
+    #define uECC_SUPPORTS_secp256r1 1
+#endif
+#ifndef uECC_SUPPORTS_secp256k1
+    #define uECC_SUPPORTS_secp256k1 1
+#endif
+
+/* Specifies whether compressed point format is supported.
+   Set to 0 to disable point compression/decompression functions. */
+#ifndef uECC_SUPPORT_COMPRESSED_POINT
+    #define uECC_SUPPORT_COMPRESSED_POINT 1
+#endif
+
+struct uECC_Curve_t;
+typedef const struct uECC_Curve_t * uECC_Curve;
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+
+#if uECC_SUPPORTS_secp160r1
+uECC_Curve uECC_secp160r1(void);
+#endif
+#if uECC_SUPPORTS_secp192r1
+uECC_Curve uECC_secp192r1(void);
+#endif
+#if uECC_SUPPORTS_secp224r1
+uECC_Curve uECC_secp224r1(void);
+#endif
+#if uECC_SUPPORTS_secp256r1
+uECC_Curve uECC_secp256r1(void);
+#endif
+#if uECC_SUPPORTS_secp256k1
+uECC_Curve uECC_secp256k1(void);
+#endif
+
+/* uECC_RNG_Function type
+The RNG function should fill 'size' random bytes into 'dest'. It should return 1 if
+'dest' was filled with random data, or 0 if the random data could not be generated.
+The filled-in values should be either truly random, or from a cryptographically-secure PRNG.
+
+A correctly functioning RNG function must be set (using uECC_set_rng()) before calling
+uECC_make_key() or uECC_sign().
+
+Setting a correctly functioning RNG function improves the resistance to side-channel attacks
+for uECC_shared_secret() and uECC_sign_deterministic().
+
+A correct RNG function is set by default when building for Windows, Linux, or OS X.
+If you are building on another POSIX-compliant system that supports /dev/random or /dev/urandom,
+you can define uECC_POSIX to use the predefined RNG. For embedded platforms there is no predefined
+RNG function; you must provide your own.
+*/
+typedef int (*uECC_RNG_Function)(uint8_t *dest, unsigned size);
+
+/* uECC_set_rng() function.
+Set the function that will be used to generate random bytes. The RNG function should
+return 1 if the random data was generated, or 0 if the random data could not be generated.
+
+On platforms where there is no predefined RNG function (eg embedded platforms), this must
+be called before uECC_make_key() or uECC_sign() are used.
+
+Inputs:
+    rng_function - The function that will be used to generate random bytes.
+*/
+void uECC_set_rng(uECC_RNG_Function rng_function);
+
+/* uECC_make_key() function.
+Create a public/private key pair.
+
+Outputs:
+    public_key  - Will be filled in with the public key. Must be at least 2 * the curve size
+                  (in bytes) long. For example, if the curve is secp256r1, public_key must be 64
+                  bytes long.
+    private_key - Will be filled in with the private key. Must be as long as the curve order; this
+                  is typically the same as the curve size, except for secp160r1. For example, if the
+                  curve is secp256r1, private_key must be 32 bytes long.
+                  
+                  For secp160r1, private_key must be 21 bytes long! Note that the first byte will 
+                  almost always be 0 (there is about a 1 in 2^80 chance of it being non-zero).
+
+Returns 1 if the key pair was generated successfully, 0 if an error occurred.
+*/
+int uECC_make_key(uint8_t *public_key, uint8_t *private_key, uECC_Curve curve);
+
+/* uECC_shared_secret() function.
+Compute a shared secret given your secret key and someone else's public key.
+Note: It is recommended that you hash the result of uECC_shared_secret() before using it for
+symmetric encryption or HMAC.
+
+Inputs:
+    public_key  - The public key of the remote party.
+    private_key - Your private key.
+
+Outputs:
+    secret - Will be filled in with the shared secret value. Must be the same size as the
+             curve size; for example, if the curve is secp256r1, secret must be 32 bytes long.
+
+Returns 1 if the shared secret was generated successfully, 0 if an error occurred.
+*/
+int uECC_shared_secret(const uint8_t *public_key,
+                       const uint8_t *private_key,
+                       uint8_t *secret,
+                       uECC_Curve curve);
+
+#if uECC_SUPPORT_COMPRESSED_POINT
+/* uECC_compress() function.
+Compress a public key.
+
+Inputs:
+    public_key - The public key to compress.
+
+Outputs:
+    compressed - Will be filled in with the compressed public key. Must be at least
+                 (curve size + 1) bytes long; for example, if the curve is secp256r1,
+                 compressed must be 33 bytes long.
+*/
+void uECC_compress(const uint8_t *public_key, uint8_t *compressed, uECC_Curve curve);
+
+/* uECC_decompress() function.
+Decompress a compressed public key.
+
+Inputs:
+    compressed - The compressed public key.
+
+Outputs:
+    public_key - Will be filled in with the decompressed public key.
+*/
+void uECC_decompress(const uint8_t *compressed, uint8_t *public_key, uECC_Curve curve);
+#endif /* uECC_SUPPORT_COMPRESSED_POINT */
+
+#ifndef NO_SIGNING
+/* uECC_valid_public_key() function.
+Check to see if a public key is valid.
+
+Note that you are not required to check for a valid public key before using any other uECC
+functions. However, you may wish to avoid spending CPU time computing a shared secret or
+verifying a signature using an invalid public key.
+
+Inputs:
+    public_key - The public key to check.
+
+Returns 1 if the public key is valid, 0 if it is invalid.
+*/
+int uECC_valid_public_key(const uint8_t *public_key, uECC_Curve curve);
+
+/* uECC_compute_public_key() function.
+Compute the corresponding public key for a private key.
+
+Inputs:
+    private_key - The private key to compute the public key for
+
+Outputs:
+    public_key - Will be filled in with the corresponding public key
+
+Returns 1 if the key was computed successfully, 0 if an error occurred.
+*/
+int uECC_compute_public_key(const uint8_t *private_key, uint8_t *public_key, uECC_Curve curve);
+
+/* uECC_sign() function.
+Generate an ECDSA signature for a given hash value.
+
+Usage: Compute a hash of the data you wish to sign (SHA-2 is recommended) and pass it in to
+this function along with your private key.
+
+Inputs:
+    private_key  - Your private key.
+    message_hash - The hash of the message to sign.
+    hash_size    - The size of message_hash in bytes.
+
+Outputs:
+    signature - Will be filled in with the signature value. Must be at least 2 * curve size long.
+                For example, if the curve is secp256r1, signature must be 64 bytes long.
+
+Returns 1 if the signature generated successfully, 0 if an error occurred.
+*/
+int uECC_sign(const uint8_t *private_key,
+              const uint8_t *message_hash,
+              unsigned hash_size,
+              uint8_t *signature,
+              uECC_Curve curve);
+
+/* uECC_HashContext structure.
+This is used to pass in an arbitrary hash function to uECC_sign_deterministic().
+The structure will be used for multiple hash computations; each time a new hash
+is computed, init_hash() will be called, followed by one or more calls to
+update_hash(), and finally a call to finish_hash() to produce the resulting hash.
+
+The intention is that you will create a structure that includes uECC_HashContext
+followed by any hash-specific data. For example:
+
+typedef struct SHA256_HashContext {
+    uECC_HashContext uECC;
+    SHA256_CTX ctx;
+} SHA256_HashContext;
+
+void init_SHA256(uECC_HashContext *base) {
+    SHA256_HashContext *context = (SHA256_HashContext *)base;
+    SHA256_Init(&context->ctx);
+}
+
+void update_SHA256(uECC_HashContext *base,
+                   const uint8_t *message,
+                   unsigned message_size) {
+    SHA256_HashContext *context = (SHA256_HashContext *)base;
+    SHA256_Update(&context->ctx, message, message_size);
+}
+
+void finish_SHA256(uECC_HashContext *base, uint8_t *hash_result) {
+    SHA256_HashContext *context = (SHA256_HashContext *)base;
+    SHA256_Final(hash_result, &context->ctx);
+}
+
+... when signing ...
+{
+    uint8_t tmp[32 + 32 + 64];
+    SHA256_HashContext ctx = {{&init_SHA256, &update_SHA256, &finish_SHA256, 64, 32, tmp}};
+    uECC_sign_deterministic(key, message_hash, hash_size, &ctx.uECC, signature, curve);
+}
+*/
+typedef struct uECC_HashContext {
+    void (*init_hash)(struct uECC_HashContext *context);
+    void (*update_hash)(struct uECC_HashContext *context,
+                        const uint8_t *message,
+                        unsigned message_size);
+    void (*finish_hash)(struct uECC_HashContext *context, uint8_t *hash_result);
+    unsigned block_size; /* Hash function block size in bytes, eg 64 for SHA-256. */
+    unsigned result_size; /* Hash function result size in bytes, eg 32 for SHA-256. */
+    uint8_t *tmp; /* Must point to a buffer of at least (2 * result_size + block_size) bytes. */
+} uECC_HashContext;
+
+/* uECC_sign_deterministic() function.
+Generate an ECDSA signature for a given hash value, using a deterministic algorithm
+(see RFC 6979). You do not need to set the RNG using uECC_set_rng() before calling
+this function; however, if the RNG is defined it will improve resistance to side-channel
+attacks.
+
+Usage: Compute a hash of the data you wish to sign (SHA-2 is recommended) and pass it in to
+this function along with your private key and a hash context. Note that the message_hash
+does not need to be computed with the same hash function used by hash_context.
+
+Inputs:
+    private_key  - Your private key.
+    message_hash - The hash of the message to sign.
+    hash_size    - The size of message_hash in bytes.
+    hash_context - A hash context to use.
+
+Outputs:
+    signature - Will be filled in with the signature value.
+
+Returns 1 if the signature generated successfully, 0 if an error occurred.
+*/
+int uECC_sign_deterministic(const uint8_t *private_key,
+                            const uint8_t *message_hash,
+                            unsigned hash_size,
+                            uECC_HashContext *hash_context,
+                            uint8_t *signature,
+                            uECC_Curve curve);
+#endif
+
+/* uECC_verify() function.
+Verify an ECDSA signature.
+
+Usage: Compute the hash of the signed data using the same hash as the signer and
+pass it to this function along with the signer's public key and the signature values (r and s).
+
+Inputs:
+    public_key   - The signer's public key.
+    message_hash - The hash of the signed data.
+    hash_size    - The size of message_hash in bytes.
+    signature    - The signature value.
+
+Returns 1 if the signature is valid, 0 if it is invalid.
+*/
+int uECC_verify(const uint8_t *public_key,
+                const uint8_t *message_hash,
+                unsigned hash_size,
+                const uint8_t *signature,
+                uECC_Curve curve);
+
+extern uint8_t uECC_recid;		// PDG XXX hack
+
+#ifdef __cplusplus
+} /* end of extern "C" */
+#endif
+
+#endif /* _UECC_H_ */
--- a/stm32/mk4-bootloader/micro-ecc/uECC_vli.h
+++ b/stm32/mk4-bootloader/micro-ecc/uECC_vli.h
@ -0,0 +1,170 @@
+/* Copyright 2015, Kenneth MacKay. Licensed under the BSD 2-clause license. */
+
+#ifndef _UECC_VLI_H_
+#define _UECC_VLI_H_
+
+#include "uECC.h"
+#include "types.h"
+
+/* Functions for raw large-integer manipulation. These are only available
+   if uECC.c is compiled with uECC_ENABLE_VLI_API defined to 1. */
+#ifndef uECC_ENABLE_VLI_API
+    #define uECC_ENABLE_VLI_API 0
+#endif
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+
+#if uECC_ENABLE_VLI_API
+
+void uECC_vli_clear(uECC_word_t *vli, wordcount_t num_words);
+
+/* Constant-time comparison to zero - secure way to compare long integers */
+/* Returns 1 if vli == 0, 0 otherwise. */
+uECC_word_t uECC_vli_isZero(const uECC_word_t *vli, wordcount_t num_words);
+
+/* Returns nonzero if bit 'bit' of vli is set. */
+uECC_word_t uECC_vli_testBit(const uECC_word_t *vli, bitcount_t bit);
+
+/* Counts the number of bits required to represent vli. */
+bitcount_t uECC_vli_numBits(const uECC_word_t *vli, const wordcount_t max_words);
+
+/* Sets dest = src. */
+void uECC_vli_set(uECC_word_t *dest, const uECC_word_t *src, wordcount_t num_words);
+
+/* Constant-time comparison function - secure way to compare long integers */
+/* Returns one if left == right, zero otherwise */
+uECC_word_t uECC_vli_equal(const uECC_word_t *left,
+                           const uECC_word_t *right,
+                           wordcount_t num_words);
+
+/* Constant-time comparison function - secure way to compare long integers */
+/* Returns sign of left - right, in constant time. */
+cmpresult_t uECC_vli_cmp(const uECC_word_t *left, const uECC_word_t *right, wordcount_t num_words);
+
+/* Computes vli = vli >> 1. */
+void uECC_vli_rshift1(uECC_word_t *vli, wordcount_t num_words);
+
+/* Computes result = left + right, returning carry. Can modify in place. */
+uECC_word_t uECC_vli_add(uECC_word_t *result,
+                         const uECC_word_t *left,
+                         const uECC_word_t *right,
+                         wordcount_t num_words);
+
+/* Computes result = left - right, returning borrow. Can modify in place. */
+uECC_word_t uECC_vli_sub(uECC_word_t *result,
+                         const uECC_word_t *left,
+                         const uECC_word_t *right,
+                         wordcount_t num_words);
+
+/* Computes result = left * right. Result must be 2 * num_words long. */
+void uECC_vli_mult(uECC_word_t *result,
+                   const uECC_word_t *left,
+                   const uECC_word_t *right,
+                   wordcount_t num_words);
+
+/* Computes result = left^2. Result must be 2 * num_words long. */
+void uECC_vli_square(uECC_word_t *result, const uECC_word_t *left, wordcount_t num_words);
+
+/* Computes result = (left + right) % mod.
+   Assumes that left < mod and right < mod, and that result does not overlap mod. */
+void uECC_vli_modAdd(uECC_word_t *result,
+                     const uECC_word_t *left,
+                     const uECC_word_t *right,
+                     const uECC_word_t *mod,
+                     wordcount_t num_words);
+
+/* Computes result = (left - right) % mod.
+   Assumes that left < mod and right < mod, and that result does not overlap mod. */
+void uECC_vli_modSub(uECC_word_t *result,
+                     const uECC_word_t *left,
+                     const uECC_word_t *right,
+                     const uECC_word_t *mod,
+                     wordcount_t num_words);
+
+/* Computes result = product % mod, where product is 2N words long.
+   Currently only designed to work for mod == curve->p or curve_n. */
+void uECC_vli_mmod(uECC_word_t *result,
+                   uECC_word_t *product,
+                   const uECC_word_t *mod,
+                   wordcount_t num_words);
+
+/* Calculates result = product (mod curve->p), where product is up to
+   2 * curve->num_words long. */
+void uECC_vli_mmod_fast(uECC_word_t *result, uECC_word_t *product, uECC_Curve curve);
+
+/* Computes result = (left * right) % mod.
+   Currently only designed to work for mod == curve->p or curve_n. */
+void uECC_vli_modMult(uECC_word_t *result,
+                      const uECC_word_t *left,
+                      const uECC_word_t *right,
+                      const uECC_word_t *mod,
+                      wordcount_t num_words);
+
+/* Computes result = (left * right) % curve->p. */
+void uECC_vli_modMult_fast(uECC_word_t *result,
+                           const uECC_word_t *left,
+                           const uECC_word_t *right,
+                           uECC_Curve curve);
+
+/* Computes result = left^2 % mod.
+   Currently only designed to work for mod == curve->p or curve_n. */
+void uECC_vli_modSquare(uECC_word_t *result,
+                        const uECC_word_t *left,
+                        const uECC_word_t *mod,
+                        wordcount_t num_words);
+
+/* Computes result = left^2 % curve->p. */
+void uECC_vli_modSquare_fast(uECC_word_t *result, const uECC_word_t *left, uECC_Curve curve);
+
+/* Computes result = (1 / input) % mod.*/
+void uECC_vli_modInv(uECC_word_t *result,
+                     const uECC_word_t *input,
+                     const uECC_word_t *mod,
+                     wordcount_t num_words);
+
+#if uECC_SUPPORT_COMPRESSED_POINT
+/* Calculates a = sqrt(a) (mod curve->p) */
+void uECC_vli_mod_sqrt(uECC_word_t *a, uECC_Curve curve);
+#endif
+
+/* Converts an integer in uECC native format to big-endian bytes. */
+void uECC_vli_nativeToBytes(uint8_t *bytes, int num_bytes, const uECC_word_t *native);
+/* Converts big-endian bytes to an integer in uECC native format. */
+void uECC_vli_bytesToNative(uECC_word_t *native, const uint8_t *bytes, int num_bytes);
+
+unsigned uECC_curve_num_words(uECC_Curve curve);
+unsigned uECC_curve_num_bits(uECC_Curve curve);
+unsigned uECC_curve_num_n_words(uECC_Curve curve);
+unsigned uECC_curve_num_n_bits(uECC_Curve curve);
+
+const uECC_word_t *uECC_curve_p(uECC_Curve curve);
+const uECC_word_t *uECC_curve_n(uECC_Curve curve);
+const uECC_word_t *uECC_curve_G(uECC_Curve curve);
+const uECC_word_t *uECC_curve_b(uECC_Curve curve);
+
+int uECC_valid_point(const uECC_word_t *point, uECC_Curve curve);
+
+/* Multiplies a point by a scalar. Points are represented by the X coordinate followed by
+   the Y coordinate in the same array, both coordinates are curve->num_words long. Note
+   that scalar must be curve->num_n_words long (NOT curve->num_words). */
+void uECC_point_mult(uECC_word_t *result,
+                     const uECC_word_t *point,
+                     const uECC_word_t *scalar,
+                     uECC_Curve curve);
+
+/* Generates a random integer in the range 0 < random < top.
+   Both random and top have num_words words. */
+int uECC_generate_random_int(uECC_word_t *random,
+                             const uECC_word_t *top,
+                             wordcount_t num_words);
+
+#endif /* uECC_ENABLE_VLI_API */
+
+#ifdef __cplusplus
+} /* end of extern "C" */
+#endif
+
+#endif /* _UECC_VLI_H_ */