From e66d4d6d391f951a3de9b1b08b5f678130f68eb3 Mon Sep 17 00:00:00 2001 From: Pieter Wuille Date: Mon, 8 Dec 2014 15:18:55 +0100 Subject: [PATCH] Avoid the stack in assembly and use explicit registers --- src/field_5x52_asm_impl.h | 200 +++++++++++++++++++------------------- 1 file changed, 100 insertions(+), 100 deletions(-) diff --git a/src/field_5x52_asm_impl.h b/src/field_5x52_asm_impl.h index af12fd30d2..da289e8ff4 100644 --- a/src/field_5x52_asm_impl.h +++ b/src/field_5x52_asm_impl.h @@ -21,19 +21,16 @@ SECP256K1_INLINE static void secp256k1_fe_mul_inner(uint64_t *r, const uint64_t * r15:rcx = d * r10-r14 = a0-a4 * rbx = b - * %2 = r - * %0 = a / t? - * rbp = R (0x1000003d10) + * rdi = r + * rsi = a / t? */ + uint64_t tmp1, tmp2, tmp3; __asm__ __volatile__( - "pushq %%rbp\n" - - "movq 0(%0),%%r10\n" - "movq 8(%0),%%r11\n" - "movq 16(%0),%%r12\n" - "movq 24(%0),%%r13\n" - "movq 32(%0),%%r14\n" - "movq $0x1000003d10,%%rbp\n" + "movq 0(%%rsi),%%r10\n" + "movq 8(%%rsi),%%r11\n" + "movq 16(%%rsi),%%r12\n" + "movq 24(%%rsi),%%r13\n" + "movq 32(%%rsi),%%r14\n" /* d += a3 * b0 */ "movq 0(%%rbx),%%rax\n" @@ -63,16 +60,17 @@ __asm__ __volatile__( /* d += (c & M) * R */ "movq $0xfffffffffffff,%%rdx\n" "andq %%rdx,%%rax\n" - "mulq %%rbp\n" + "movq $0x1000003d10,%%rdx\n" + "mulq %%rdx\n" "addq %%rax,%%rcx\n" "adcq %%rdx,%%r15\n" /* c >>= 52 (%%r8 only) */ "shrdq $52,%%r9,%%r8\n" - /* t3 (stack) = d & M */ - "movq %%rcx,%0\n" + /* t3 (tmp1) = d & M */ + "movq %%rcx,%%rsi\n" "movq $0xfffffffffffff,%%rdx\n" - "andq %%rdx,%0\n" - "pushq %0\n" + "andq %%rdx,%%rsi\n" + "movq %%rsi,%1\n" /* d >>= 52 */ "shrdq $52,%%r15,%%rcx\n" "xorq %%r15,%%r15\n" @@ -103,23 +101,25 @@ __asm__ __volatile__( "adcq %%rdx,%%r15\n" /* d += c * R */ "movq %%r8,%%rax\n" - "mulq %%rbp\n" + "movq $0x1000003d10,%%rdx\n" + "mulq %%rdx\n" "addq %%rax,%%rcx\n" "adcq %%rdx,%%r15\n" - /* t4 = d & M (%0) */ - "movq %%rcx,%0\n" + /* t4 = d & M (%%rsi) */ + "movq %%rcx,%%rsi\n" "movq $0xfffffffffffff,%%rdx\n" - "andq %%rdx,%0\n" + "andq %%rdx,%%rsi\n" /* d >>= 52 */ "shrdq $52,%%r15,%%rcx\n" "xorq %%r15,%%r15\n" - /* tx = t4 >> 48 (%%rbp, overwrites R) */ - "movq %0,%%rbp\n" - "shrq $48,%%rbp\n" - /* t4 &= (M >> 4) (stack) */ + /* tx = t4 >> 48 (tmp3) */ + "movq %%rsi,%%rax\n" + "shrq $48,%%rax\n" + "movq %%rax,%3\n" + /* t4 &= (M >> 4) (tmp2) */ "movq $0xffffffffffff,%%rax\n" - "andq %%rax,%0\n" - "pushq %0\n" + "andq %%rax,%%rsi\n" + "movq %%rsi,%2\n" /* c = a0 * b0 */ "movq 0(%%rbx),%%rax\n" "mulq %%r10\n" @@ -145,26 +145,27 @@ __asm__ __volatile__( "mulq %%r11\n" "addq %%rax,%%rcx\n" "adcq %%rdx,%%r15\n" - /* u0 = d & M (%0) */ - "movq %%rcx,%0\n" + /* u0 = d & M (%%rsi) */ + "movq %%rcx,%%rsi\n" "movq $0xfffffffffffff,%%rdx\n" - "andq %%rdx,%0\n" + "andq %%rdx,%%rsi\n" /* d >>= 52 */ "shrdq $52,%%r15,%%rcx\n" "xorq %%r15,%%r15\n" - /* u0 = (u0 << 4) | tx (%0) */ - "shlq $4,%0\n" - "orq %%rbp,%0\n" + /* u0 = (u0 << 4) | tx (%%rsi) */ + "shlq $4,%%rsi\n" + "movq %3,%%rax\n" + "orq %%rax,%%rsi\n" /* c += u0 * (R >> 4) */ "movq $0x1000003d1,%%rax\n" - "mulq %0\n" + "mulq %%rsi\n" "addq %%rax,%%r8\n" "adcq %%rdx,%%r9\n" /* r[0] = c & M */ "movq %%r8,%%rax\n" "movq $0xfffffffffffff,%%rdx\n" "andq %%rdx,%%rax\n" - "movq %%rax,0(%2)\n" + "movq %%rax,0(%%rdi)\n" /* c >>= 52 */ "shrdq $52,%%r9,%%r8\n" "xorq %%r9,%%r9\n" @@ -193,13 +194,12 @@ __asm__ __volatile__( "mulq %%r12\n" "addq %%rax,%%rcx\n" "adcq %%rdx,%%r15\n" - /* restore rdp = R */ - "movq $0x1000003d10,%%rbp\n" /* c += (d & M) * R */ "movq %%rcx,%%rax\n" "movq $0xfffffffffffff,%%rdx\n" "andq %%rdx,%%rax\n" - "mulq %%rbp\n" + "movq $0x1000003d10,%%rdx\n" + "mulq %%rdx\n" "addq %%rax,%%r8\n" "adcq %%rdx,%%r9\n" /* d >>= 52 */ @@ -209,7 +209,7 @@ __asm__ __volatile__( "movq %%r8,%%rax\n" "movq $0xfffffffffffff,%%rdx\n" "andq %%rdx,%%rax\n" - "movq %%rax,8(%2)\n" + "movq %%rax,8(%%rdi)\n" /* c >>= 52 */ "shrdq $52,%%r9,%%r8\n" "xorq %%r9,%%r9\n" @@ -228,9 +228,9 @@ __asm__ __volatile__( "mulq %%r10\n" "addq %%rax,%%r8\n" "adcq %%rdx,%%r9\n" - /* fetch t3 (%%r10, overwrites a0),t4 (%0) */ - "popq %0\n" - "popq %%r10\n" + /* fetch t3 (%%r10, overwrites a0), t4 (%%rsi) */ + "movq %2,%%rsi\n" + "movq %1,%%r10\n" /* d += a4 * b3 */ "movq 24(%%rbx),%%rax\n" "mulq %%r14\n" @@ -245,7 +245,8 @@ __asm__ __volatile__( "movq %%rcx,%%rax\n" "movq $0xfffffffffffff,%%rdx\n" "andq %%rdx,%%rax\n" - "mulq %%rbp\n" + "movq $0x1000003d10,%%rdx\n" + "mulq %%rdx\n" "addq %%rax,%%r8\n" "adcq %%rdx,%%r9\n" /* d >>= 52 (%%rcx only) */ @@ -254,7 +255,7 @@ __asm__ __volatile__( "movq %%r8,%%rax\n" "movq $0xfffffffffffff,%%rdx\n" "andq %%rdx,%%rax\n" - "movq %%rax,16(%2)\n" + "movq %%rax,16(%%rdi)\n" /* c >>= 52 */ "shrdq $52,%%r9,%%r8\n" "xorq %%r9,%%r9\n" @@ -262,23 +263,22 @@ __asm__ __volatile__( "addq %%r10,%%r8\n" /* c += d * R */ "movq %%rcx,%%rax\n" - "mulq %%rbp\n" + "movq $0x1000003d10,%%rdx\n" + "mulq %%rdx\n" "addq %%rax,%%r8\n" "adcq %%rdx,%%r9\n" /* r[3] = c & M */ "movq %%r8,%%rax\n" "movq $0xfffffffffffff,%%rdx\n" "andq %%rdx,%%rax\n" - "movq %%rax,24(%2)\n" + "movq %%rax,24(%%rdi)\n" /* c >>= 52 (%%r8 only) */ "shrdq $52,%%r9,%%r8\n" /* c += t4 (%%r8 only) */ - "addq %0,%%r8\n" + "addq %%rsi,%%r8\n" /* r[4] = c */ - "movq %%r8,32(%2)\n" - - "popq %%rbp\n" -: "+S"(a) + "movq %%r8,32(%%rdi)\n" +: "+S"(a), "=m"(tmp1), "=m"(tmp2), "=m"(tmp3) : "b"(b), "D"(r) : "%rax", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15", "cc", "memory" ); @@ -291,19 +291,16 @@ SECP256K1_INLINE static void secp256k1_fe_sqr_inner(uint64_t *r, const uint64_t * rcx:rbx = d * r10-r14 = a0-a4 * r15 = M (0xfffffffffffff) - * %1 = r - * %0 = a / t? - * rbp = R (0x1000003d10) + * rdi = r + * rsi = a / t? */ + uint64_t tmp1, tmp2, tmp3; __asm__ __volatile__( - "pushq %%rbp\n" - - "movq 0(%0),%%r10\n" - "movq 8(%0),%%r11\n" - "movq 16(%0),%%r12\n" - "movq 24(%0),%%r13\n" - "movq 32(%0),%%r14\n" - "movq $0x1000003d10,%%rbp\n" + "movq 0(%%rsi),%%r10\n" + "movq 8(%%rsi),%%r11\n" + "movq 16(%%rsi),%%r12\n" + "movq 24(%%rsi),%%r13\n" + "movq 32(%%rsi),%%r14\n" "movq $0xfffffffffffff,%%r15\n" /* d = (a0*2) * a3 */ @@ -323,15 +320,16 @@ __asm__ __volatile__( "movq %%rdx,%%r9\n" /* d += (c & M) * R */ "andq %%r15,%%rax\n" - "mulq %%rbp\n" + "movq $0x1000003d10,%%rdx\n" + "mulq %%rdx\n" "addq %%rax,%%rbx\n" "adcq %%rdx,%%rcx\n" /* c >>= 52 (%%r8 only) */ "shrdq $52,%%r9,%%r8\n" - /* t3 (stack) = d & M */ - "movq %%rbx,%0\n" - "andq %%r15,%0\n" - "pushq %0\n" + /* t3 (tmp1) = d & M */ + "movq %%rbx,%%rsi\n" + "andq %%r15,%%rsi\n" + "movq %%rsi,%1\n" /* d >>= 52 */ "shrdq $52,%%rcx,%%rbx\n" "xorq %%rcx,%%rcx\n" @@ -354,22 +352,24 @@ __asm__ __volatile__( "adcq %%rdx,%%rcx\n" /* d += c * R */ "movq %%r8,%%rax\n" - "mulq %%rbp\n" + "movq $0x1000003d10,%%rdx\n" + "mulq %%rdx\n" "addq %%rax,%%rbx\n" "adcq %%rdx,%%rcx\n" - /* t4 = d & M (%0) */ - "movq %%rbx,%0\n" - "andq %%r15,%0\n" + /* t4 = d & M (%%rsi) */ + "movq %%rbx,%%rsi\n" + "andq %%r15,%%rsi\n" /* d >>= 52 */ "shrdq $52,%%rcx,%%rbx\n" "xorq %%rcx,%%rcx\n" - /* tx = t4 >> 48 (%%rbp, overwrites constant) */ - "movq %0,%%rbp\n" - "shrq $48,%%rbp\n" - /* t4 &= (M >> 4) (stack) */ + /* tx = t4 >> 48 (tmp3) */ + "movq %%rsi,%%rax\n" + "shrq $48,%%rax\n" + "movq %%rax,%3\n" + /* t4 &= (M >> 4) (tmp2) */ "movq $0xffffffffffff,%%rax\n" - "andq %%rax,%0\n" - "pushq %0\n" + "andq %%rax,%%rsi\n" + "movq %%rsi,%2\n" /* c = a0 * a0 */ "movq %%r10,%%rax\n" "mulq %%r10\n" @@ -385,24 +385,25 @@ __asm__ __volatile__( "mulq %%r13\n" "addq %%rax,%%rbx\n" "adcq %%rdx,%%rcx\n" - /* u0 = d & M (%0) */ - "movq %%rbx,%0\n" - "andq %%r15,%0\n" + /* u0 = d & M (%%rsi) */ + "movq %%rbx,%%rsi\n" + "andq %%r15,%%rsi\n" /* d >>= 52 */ "shrdq $52,%%rcx,%%rbx\n" "xorq %%rcx,%%rcx\n" - /* u0 = (u0 << 4) | tx (%0) */ - "shlq $4,%0\n" - "orq %%rbp,%0\n" + /* u0 = (u0 << 4) | tx (%%rsi) */ + "shlq $4,%%rsi\n" + "movq %3,%%rax\n" + "orq %%rax,%%rsi\n" /* c += u0 * (R >> 4) */ "movq $0x1000003d1,%%rax\n" - "mulq %0\n" + "mulq %%rsi\n" "addq %%rax,%%r8\n" "adcq %%rdx,%%r9\n" /* r[0] = c & M */ "movq %%r8,%%rax\n" "andq %%r15,%%rax\n" - "movq %%rax,0(%1)\n" + "movq %%rax,0(%%rdi)\n" /* c >>= 52 */ "shrdq $52,%%r9,%%r8\n" "xorq %%r9,%%r9\n" @@ -423,12 +424,11 @@ __asm__ __volatile__( "mulq %%r13\n" "addq %%rax,%%rbx\n" "adcq %%rdx,%%rcx\n" - /* load R in %%rbp */ - "movq $0x1000003d10,%%rbp\n" /* c += (d & M) * R */ "movq %%rbx,%%rax\n" "andq %%r15,%%rax\n" - "mulq %%rbp\n" + "movq $0x1000003d10,%%rdx\n" + "mulq %%rdx\n" "addq %%rax,%%r8\n" "adcq %%rdx,%%r9\n" /* d >>= 52 */ @@ -437,7 +437,7 @@ __asm__ __volatile__( /* r[1] = c & M */ "movq %%r8,%%rax\n" "andq %%r15,%%rax\n" - "movq %%rax,8(%1)\n" + "movq %%rax,8(%%rdi)\n" /* c >>= 52 */ "shrdq $52,%%r9,%%r8\n" "xorq %%r9,%%r9\n" @@ -446,9 +446,9 @@ __asm__ __volatile__( "mulq %%r12\n" "addq %%rax,%%r8\n" "adcq %%rdx,%%r9\n" - /* fetch t3 (%%r10, overwrites a0),t4 (%0) */ - "popq %0\n" - "popq %%r10\n" + /* fetch t3 (%%r10, overwrites a0),t4 (%%rsi) */ + "movq %2,%%rsi\n" + "movq %1,%%r10\n" /* c += a1 * a1 */ "movq %%r11,%%rax\n" "mulq %%r11\n" @@ -462,7 +462,8 @@ __asm__ __volatile__( /* c += (d & M) * R */ "movq %%rbx,%%rax\n" "andq %%r15,%%rax\n" - "mulq %%rbp\n" + "movq $0x1000003d10,%%rdx\n" + "mulq %%rdx\n" "addq %%rax,%%r8\n" "adcq %%rdx,%%r9\n" /* d >>= 52 (%%rbx only) */ @@ -470,7 +471,7 @@ __asm__ __volatile__( /* r[2] = c & M */ "movq %%r8,%%rax\n" "andq %%r15,%%rax\n" - "movq %%rax,16(%1)\n" + "movq %%rax,16(%%rdi)\n" /* c >>= 52 */ "shrdq $52,%%r9,%%r8\n" "xorq %%r9,%%r9\n" @@ -478,22 +479,21 @@ __asm__ __volatile__( "addq %%r10,%%r8\n" /* c += d * R */ "movq %%rbx,%%rax\n" - "mulq %%rbp\n" + "movq $0x1000003d10,%%rdx\n" + "mulq %%rdx\n" "addq %%rax,%%r8\n" "adcq %%rdx,%%r9\n" /* r[3] = c & M */ "movq %%r8,%%rax\n" "andq %%r15,%%rax\n" - "movq %%rax,24(%1)\n" + "movq %%rax,24(%%rdi)\n" /* c >>= 52 (%%r8 only) */ "shrdq $52,%%r9,%%r8\n" /* c += t4 (%%r8 only) */ - "addq %0,%%r8\n" + "addq %%rsi,%%r8\n" /* r[4] = c */ - "movq %%r8,32(%1)\n" - - "popq %%rbp\n" -: "+S"(a) + "movq %%r8,32(%%rdi)\n" +: "+S"(a), "=m"(tmp1), "=m"(tmp2), "=m"(tmp3) : "D"(r) : "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15", "cc", "memory" );