Merge pull request #142

f22d73e Explicitly access %0..%2 as 64-bit so we use the right registers for x32 ABI (Luke Dashjr)
e66d4d6 Avoid the stack in assembly and use explicit registers (Pieter Wuille)
This commit is contained in:
Pieter Wuille 2014-12-11 00:49:22 +01:00
commit 3907277dcf
No known key found for this signature in database
GPG key ID: 57896D2FF8F0B657

View file

@ -21,19 +21,16 @@ SECP256K1_INLINE static void secp256k1_fe_mul_inner(uint64_t *r, const uint64_t
* r15:rcx = d
* r10-r14 = a0-a4
* rbx = b
* %2 = r
* %0 = a / t?
* rbp = R (0x1000003d10)
* rdi = r
* rsi = a / t?
*/
uint64_t tmp1, tmp2, tmp3;
__asm__ __volatile__(
"pushq %%rbp\n"
"movq 0(%0),%%r10\n"
"movq 8(%0),%%r11\n"
"movq 16(%0),%%r12\n"
"movq 24(%0),%%r13\n"
"movq 32(%0),%%r14\n"
"movq $0x1000003d10,%%rbp\n"
"movq 0(%%rsi),%%r10\n"
"movq 8(%%rsi),%%r11\n"
"movq 16(%%rsi),%%r12\n"
"movq 24(%%rsi),%%r13\n"
"movq 32(%%rsi),%%r14\n"
/* d += a3 * b0 */
"movq 0(%%rbx),%%rax\n"
@ -63,16 +60,17 @@ __asm__ __volatile__(
/* d += (c & M) * R */
"movq $0xfffffffffffff,%%rdx\n"
"andq %%rdx,%%rax\n"
"mulq %%rbp\n"
"movq $0x1000003d10,%%rdx\n"
"mulq %%rdx\n"
"addq %%rax,%%rcx\n"
"adcq %%rdx,%%r15\n"
/* c >>= 52 (%%r8 only) */
"shrdq $52,%%r9,%%r8\n"
/* t3 (stack) = d & M */
"movq %%rcx,%0\n"
/* t3 (tmp1) = d & M */
"movq %%rcx,%%rsi\n"
"movq $0xfffffffffffff,%%rdx\n"
"andq %%rdx,%0\n"
"pushq %0\n"
"andq %%rdx,%%rsi\n"
"movq %%rsi,%q1\n"
/* d >>= 52 */
"shrdq $52,%%r15,%%rcx\n"
"xorq %%r15,%%r15\n"
@ -103,23 +101,25 @@ __asm__ __volatile__(
"adcq %%rdx,%%r15\n"
/* d += c * R */
"movq %%r8,%%rax\n"
"mulq %%rbp\n"
"movq $0x1000003d10,%%rdx\n"
"mulq %%rdx\n"
"addq %%rax,%%rcx\n"
"adcq %%rdx,%%r15\n"
/* t4 = d & M (%0) */
"movq %%rcx,%0\n"
/* t4 = d & M (%%rsi) */
"movq %%rcx,%%rsi\n"
"movq $0xfffffffffffff,%%rdx\n"
"andq %%rdx,%0\n"
"andq %%rdx,%%rsi\n"
/* d >>= 52 */
"shrdq $52,%%r15,%%rcx\n"
"xorq %%r15,%%r15\n"
/* tx = t4 >> 48 (%%rbp, overwrites R) */
"movq %0,%%rbp\n"
"shrq $48,%%rbp\n"
/* t4 &= (M >> 4) (stack) */
/* tx = t4 >> 48 (tmp3) */
"movq %%rsi,%%rax\n"
"shrq $48,%%rax\n"
"movq %%rax,%q3\n"
/* t4 &= (M >> 4) (tmp2) */
"movq $0xffffffffffff,%%rax\n"
"andq %%rax,%0\n"
"pushq %0\n"
"andq %%rax,%%rsi\n"
"movq %%rsi,%q2\n"
/* c = a0 * b0 */
"movq 0(%%rbx),%%rax\n"
"mulq %%r10\n"
@ -145,26 +145,27 @@ __asm__ __volatile__(
"mulq %%r11\n"
"addq %%rax,%%rcx\n"
"adcq %%rdx,%%r15\n"
/* u0 = d & M (%0) */
"movq %%rcx,%0\n"
/* u0 = d & M (%%rsi) */
"movq %%rcx,%%rsi\n"
"movq $0xfffffffffffff,%%rdx\n"
"andq %%rdx,%0\n"
"andq %%rdx,%%rsi\n"
/* d >>= 52 */
"shrdq $52,%%r15,%%rcx\n"
"xorq %%r15,%%r15\n"
/* u0 = (u0 << 4) | tx (%0) */
"shlq $4,%0\n"
"orq %%rbp,%0\n"
/* u0 = (u0 << 4) | tx (%%rsi) */
"shlq $4,%%rsi\n"
"movq %q3,%%rax\n"
"orq %%rax,%%rsi\n"
/* c += u0 * (R >> 4) */
"movq $0x1000003d1,%%rax\n"
"mulq %0\n"
"mulq %%rsi\n"
"addq %%rax,%%r8\n"
"adcq %%rdx,%%r9\n"
/* r[0] = c & M */
"movq %%r8,%%rax\n"
"movq $0xfffffffffffff,%%rdx\n"
"andq %%rdx,%%rax\n"
"movq %%rax,0(%2)\n"
"movq %%rax,0(%%rdi)\n"
/* c >>= 52 */
"shrdq $52,%%r9,%%r8\n"
"xorq %%r9,%%r9\n"
@ -193,13 +194,12 @@ __asm__ __volatile__(
"mulq %%r12\n"
"addq %%rax,%%rcx\n"
"adcq %%rdx,%%r15\n"
/* restore rdp = R */
"movq $0x1000003d10,%%rbp\n"
/* c += (d & M) * R */
"movq %%rcx,%%rax\n"
"movq $0xfffffffffffff,%%rdx\n"
"andq %%rdx,%%rax\n"
"mulq %%rbp\n"
"movq $0x1000003d10,%%rdx\n"
"mulq %%rdx\n"
"addq %%rax,%%r8\n"
"adcq %%rdx,%%r9\n"
/* d >>= 52 */
@ -209,7 +209,7 @@ __asm__ __volatile__(
"movq %%r8,%%rax\n"
"movq $0xfffffffffffff,%%rdx\n"
"andq %%rdx,%%rax\n"
"movq %%rax,8(%2)\n"
"movq %%rax,8(%%rdi)\n"
/* c >>= 52 */
"shrdq $52,%%r9,%%r8\n"
"xorq %%r9,%%r9\n"
@ -228,9 +228,9 @@ __asm__ __volatile__(
"mulq %%r10\n"
"addq %%rax,%%r8\n"
"adcq %%rdx,%%r9\n"
/* fetch t3 (%%r10, overwrites a0),t4 (%0) */
"popq %0\n"
"popq %%r10\n"
/* fetch t3 (%%r10, overwrites a0), t4 (%%rsi) */
"movq %q2,%%rsi\n"
"movq %q1,%%r10\n"
/* d += a4 * b3 */
"movq 24(%%rbx),%%rax\n"
"mulq %%r14\n"
@ -245,7 +245,8 @@ __asm__ __volatile__(
"movq %%rcx,%%rax\n"
"movq $0xfffffffffffff,%%rdx\n"
"andq %%rdx,%%rax\n"
"mulq %%rbp\n"
"movq $0x1000003d10,%%rdx\n"
"mulq %%rdx\n"
"addq %%rax,%%r8\n"
"adcq %%rdx,%%r9\n"
/* d >>= 52 (%%rcx only) */
@ -254,7 +255,7 @@ __asm__ __volatile__(
"movq %%r8,%%rax\n"
"movq $0xfffffffffffff,%%rdx\n"
"andq %%rdx,%%rax\n"
"movq %%rax,16(%2)\n"
"movq %%rax,16(%%rdi)\n"
/* c >>= 52 */
"shrdq $52,%%r9,%%r8\n"
"xorq %%r9,%%r9\n"
@ -262,23 +263,22 @@ __asm__ __volatile__(
"addq %%r10,%%r8\n"
/* c += d * R */
"movq %%rcx,%%rax\n"
"mulq %%rbp\n"
"movq $0x1000003d10,%%rdx\n"
"mulq %%rdx\n"
"addq %%rax,%%r8\n"
"adcq %%rdx,%%r9\n"
/* r[3] = c & M */
"movq %%r8,%%rax\n"
"movq $0xfffffffffffff,%%rdx\n"
"andq %%rdx,%%rax\n"
"movq %%rax,24(%2)\n"
"movq %%rax,24(%%rdi)\n"
/* c >>= 52 (%%r8 only) */
"shrdq $52,%%r9,%%r8\n"
/* c += t4 (%%r8 only) */
"addq %0,%%r8\n"
"addq %%rsi,%%r8\n"
/* r[4] = c */
"movq %%r8,32(%2)\n"
"popq %%rbp\n"
: "+S"(a)
"movq %%r8,32(%%rdi)\n"
: "+S"(a), "=m"(tmp1), "=m"(tmp2), "=m"(tmp3)
: "b"(b), "D"(r)
: "%rax", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15", "cc", "memory"
);
@ -291,19 +291,16 @@ SECP256K1_INLINE static void secp256k1_fe_sqr_inner(uint64_t *r, const uint64_t
* rcx:rbx = d
* r10-r14 = a0-a4
* r15 = M (0xfffffffffffff)
* %1 = r
* %0 = a / t?
* rbp = R (0x1000003d10)
* rdi = r
* rsi = a / t?
*/
uint64_t tmp1, tmp2, tmp3;
__asm__ __volatile__(
"pushq %%rbp\n"
"movq 0(%0),%%r10\n"
"movq 8(%0),%%r11\n"
"movq 16(%0),%%r12\n"
"movq 24(%0),%%r13\n"
"movq 32(%0),%%r14\n"
"movq $0x1000003d10,%%rbp\n"
"movq 0(%%rsi),%%r10\n"
"movq 8(%%rsi),%%r11\n"
"movq 16(%%rsi),%%r12\n"
"movq 24(%%rsi),%%r13\n"
"movq 32(%%rsi),%%r14\n"
"movq $0xfffffffffffff,%%r15\n"
/* d = (a0*2) * a3 */
@ -323,15 +320,16 @@ __asm__ __volatile__(
"movq %%rdx,%%r9\n"
/* d += (c & M) * R */
"andq %%r15,%%rax\n"
"mulq %%rbp\n"
"movq $0x1000003d10,%%rdx\n"
"mulq %%rdx\n"
"addq %%rax,%%rbx\n"
"adcq %%rdx,%%rcx\n"
/* c >>= 52 (%%r8 only) */
"shrdq $52,%%r9,%%r8\n"
/* t3 (stack) = d & M */
"movq %%rbx,%0\n"
"andq %%r15,%0\n"
"pushq %0\n"
/* t3 (tmp1) = d & M */
"movq %%rbx,%%rsi\n"
"andq %%r15,%%rsi\n"
"movq %%rsi,%q1\n"
/* d >>= 52 */
"shrdq $52,%%rcx,%%rbx\n"
"xorq %%rcx,%%rcx\n"
@ -354,22 +352,24 @@ __asm__ __volatile__(
"adcq %%rdx,%%rcx\n"
/* d += c * R */
"movq %%r8,%%rax\n"
"mulq %%rbp\n"
"movq $0x1000003d10,%%rdx\n"
"mulq %%rdx\n"
"addq %%rax,%%rbx\n"
"adcq %%rdx,%%rcx\n"
/* t4 = d & M (%0) */
"movq %%rbx,%0\n"
"andq %%r15,%0\n"
/* t4 = d & M (%%rsi) */
"movq %%rbx,%%rsi\n"
"andq %%r15,%%rsi\n"
/* d >>= 52 */
"shrdq $52,%%rcx,%%rbx\n"
"xorq %%rcx,%%rcx\n"
/* tx = t4 >> 48 (%%rbp, overwrites constant) */
"movq %0,%%rbp\n"
"shrq $48,%%rbp\n"
/* t4 &= (M >> 4) (stack) */
/* tx = t4 >> 48 (tmp3) */
"movq %%rsi,%%rax\n"
"shrq $48,%%rax\n"
"movq %%rax,%q3\n"
/* t4 &= (M >> 4) (tmp2) */
"movq $0xffffffffffff,%%rax\n"
"andq %%rax,%0\n"
"pushq %0\n"
"andq %%rax,%%rsi\n"
"movq %%rsi,%q2\n"
/* c = a0 * a0 */
"movq %%r10,%%rax\n"
"mulq %%r10\n"
@ -385,24 +385,25 @@ __asm__ __volatile__(
"mulq %%r13\n"
"addq %%rax,%%rbx\n"
"adcq %%rdx,%%rcx\n"
/* u0 = d & M (%0) */
"movq %%rbx,%0\n"
"andq %%r15,%0\n"
/* u0 = d & M (%%rsi) */
"movq %%rbx,%%rsi\n"
"andq %%r15,%%rsi\n"
/* d >>= 52 */
"shrdq $52,%%rcx,%%rbx\n"
"xorq %%rcx,%%rcx\n"
/* u0 = (u0 << 4) | tx (%0) */
"shlq $4,%0\n"
"orq %%rbp,%0\n"
/* u0 = (u0 << 4) | tx (%%rsi) */
"shlq $4,%%rsi\n"
"movq %q3,%%rax\n"
"orq %%rax,%%rsi\n"
/* c += u0 * (R >> 4) */
"movq $0x1000003d1,%%rax\n"
"mulq %0\n"
"mulq %%rsi\n"
"addq %%rax,%%r8\n"
"adcq %%rdx,%%r9\n"
/* r[0] = c & M */
"movq %%r8,%%rax\n"
"andq %%r15,%%rax\n"
"movq %%rax,0(%1)\n"
"movq %%rax,0(%%rdi)\n"
/* c >>= 52 */
"shrdq $52,%%r9,%%r8\n"
"xorq %%r9,%%r9\n"
@ -423,12 +424,11 @@ __asm__ __volatile__(
"mulq %%r13\n"
"addq %%rax,%%rbx\n"
"adcq %%rdx,%%rcx\n"
/* load R in %%rbp */
"movq $0x1000003d10,%%rbp\n"
/* c += (d & M) * R */
"movq %%rbx,%%rax\n"
"andq %%r15,%%rax\n"
"mulq %%rbp\n"
"movq $0x1000003d10,%%rdx\n"
"mulq %%rdx\n"
"addq %%rax,%%r8\n"
"adcq %%rdx,%%r9\n"
/* d >>= 52 */
@ -437,7 +437,7 @@ __asm__ __volatile__(
/* r[1] = c & M */
"movq %%r8,%%rax\n"
"andq %%r15,%%rax\n"
"movq %%rax,8(%1)\n"
"movq %%rax,8(%%rdi)\n"
/* c >>= 52 */
"shrdq $52,%%r9,%%r8\n"
"xorq %%r9,%%r9\n"
@ -446,9 +446,9 @@ __asm__ __volatile__(
"mulq %%r12\n"
"addq %%rax,%%r8\n"
"adcq %%rdx,%%r9\n"
/* fetch t3 (%%r10, overwrites a0),t4 (%0) */
"popq %0\n"
"popq %%r10\n"
/* fetch t3 (%%r10, overwrites a0),t4 (%%rsi) */
"movq %q2,%%rsi\n"
"movq %q1,%%r10\n"
/* c += a1 * a1 */
"movq %%r11,%%rax\n"
"mulq %%r11\n"
@ -462,7 +462,8 @@ __asm__ __volatile__(
/* c += (d & M) * R */
"movq %%rbx,%%rax\n"
"andq %%r15,%%rax\n"
"mulq %%rbp\n"
"movq $0x1000003d10,%%rdx\n"
"mulq %%rdx\n"
"addq %%rax,%%r8\n"
"adcq %%rdx,%%r9\n"
/* d >>= 52 (%%rbx only) */
@ -470,7 +471,7 @@ __asm__ __volatile__(
/* r[2] = c & M */
"movq %%r8,%%rax\n"
"andq %%r15,%%rax\n"
"movq %%rax,16(%1)\n"
"movq %%rax,16(%%rdi)\n"
/* c >>= 52 */
"shrdq $52,%%r9,%%r8\n"
"xorq %%r9,%%r9\n"
@ -478,22 +479,21 @@ __asm__ __volatile__(
"addq %%r10,%%r8\n"
/* c += d * R */
"movq %%rbx,%%rax\n"
"mulq %%rbp\n"
"movq $0x1000003d10,%%rdx\n"
"mulq %%rdx\n"
"addq %%rax,%%r8\n"
"adcq %%rdx,%%r9\n"
/* r[3] = c & M */
"movq %%r8,%%rax\n"
"andq %%r15,%%rax\n"
"movq %%rax,24(%1)\n"
"movq %%rax,24(%%rdi)\n"
/* c >>= 52 (%%r8 only) */
"shrdq $52,%%r9,%%r8\n"
/* c += t4 (%%r8 only) */
"addq %0,%%r8\n"
"addq %%rsi,%%r8\n"
/* r[4] = c */
"movq %%r8,32(%1)\n"
"popq %%rbp\n"
: "+S"(a)
"movq %%r8,32(%%rdi)\n"
: "+S"(a), "=m"(tmp1), "=m"(tmp2), "=m"(tmp3)
: "D"(r)
: "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15", "cc", "memory"
);