Merge pull request #142

f22d73e Explicitly access %0..%2 as 64-bit so we use the right registers for x32 ABI (Luke Dashjr)
e66d4d6 Avoid the stack in assembly and use explicit registers (Pieter Wuille)
This commit is contained in:
Pieter Wuille 2014-12-11 00:49:22 +01:00
commit 3907277dcf
No known key found for this signature in database
GPG key ID: 57896D2FF8F0B657

View file

@ -21,19 +21,16 @@ SECP256K1_INLINE static void secp256k1_fe_mul_inner(uint64_t *r, const uint64_t
* r15:rcx = d * r15:rcx = d
* r10-r14 = a0-a4 * r10-r14 = a0-a4
* rbx = b * rbx = b
* %2 = r * rdi = r
* %0 = a / t? * rsi = a / t?
* rbp = R (0x1000003d10)
*/ */
uint64_t tmp1, tmp2, tmp3;
__asm__ __volatile__( __asm__ __volatile__(
"pushq %%rbp\n" "movq 0(%%rsi),%%r10\n"
"movq 8(%%rsi),%%r11\n"
"movq 0(%0),%%r10\n" "movq 16(%%rsi),%%r12\n"
"movq 8(%0),%%r11\n" "movq 24(%%rsi),%%r13\n"
"movq 16(%0),%%r12\n" "movq 32(%%rsi),%%r14\n"
"movq 24(%0),%%r13\n"
"movq 32(%0),%%r14\n"
"movq $0x1000003d10,%%rbp\n"
/* d += a3 * b0 */ /* d += a3 * b0 */
"movq 0(%%rbx),%%rax\n" "movq 0(%%rbx),%%rax\n"
@ -63,16 +60,17 @@ __asm__ __volatile__(
/* d += (c & M) * R */ /* d += (c & M) * R */
"movq $0xfffffffffffff,%%rdx\n" "movq $0xfffffffffffff,%%rdx\n"
"andq %%rdx,%%rax\n" "andq %%rdx,%%rax\n"
"mulq %%rbp\n" "movq $0x1000003d10,%%rdx\n"
"mulq %%rdx\n"
"addq %%rax,%%rcx\n" "addq %%rax,%%rcx\n"
"adcq %%rdx,%%r15\n" "adcq %%rdx,%%r15\n"
/* c >>= 52 (%%r8 only) */ /* c >>= 52 (%%r8 only) */
"shrdq $52,%%r9,%%r8\n" "shrdq $52,%%r9,%%r8\n"
/* t3 (stack) = d & M */ /* t3 (tmp1) = d & M */
"movq %%rcx,%0\n" "movq %%rcx,%%rsi\n"
"movq $0xfffffffffffff,%%rdx\n" "movq $0xfffffffffffff,%%rdx\n"
"andq %%rdx,%0\n" "andq %%rdx,%%rsi\n"
"pushq %0\n" "movq %%rsi,%q1\n"
/* d >>= 52 */ /* d >>= 52 */
"shrdq $52,%%r15,%%rcx\n" "shrdq $52,%%r15,%%rcx\n"
"xorq %%r15,%%r15\n" "xorq %%r15,%%r15\n"
@ -103,23 +101,25 @@ __asm__ __volatile__(
"adcq %%rdx,%%r15\n" "adcq %%rdx,%%r15\n"
/* d += c * R */ /* d += c * R */
"movq %%r8,%%rax\n" "movq %%r8,%%rax\n"
"mulq %%rbp\n" "movq $0x1000003d10,%%rdx\n"
"mulq %%rdx\n"
"addq %%rax,%%rcx\n" "addq %%rax,%%rcx\n"
"adcq %%rdx,%%r15\n" "adcq %%rdx,%%r15\n"
/* t4 = d & M (%0) */ /* t4 = d & M (%%rsi) */
"movq %%rcx,%0\n" "movq %%rcx,%%rsi\n"
"movq $0xfffffffffffff,%%rdx\n" "movq $0xfffffffffffff,%%rdx\n"
"andq %%rdx,%0\n" "andq %%rdx,%%rsi\n"
/* d >>= 52 */ /* d >>= 52 */
"shrdq $52,%%r15,%%rcx\n" "shrdq $52,%%r15,%%rcx\n"
"xorq %%r15,%%r15\n" "xorq %%r15,%%r15\n"
/* tx = t4 >> 48 (%%rbp, overwrites R) */ /* tx = t4 >> 48 (tmp3) */
"movq %0,%%rbp\n" "movq %%rsi,%%rax\n"
"shrq $48,%%rbp\n" "shrq $48,%%rax\n"
/* t4 &= (M >> 4) (stack) */ "movq %%rax,%q3\n"
/* t4 &= (M >> 4) (tmp2) */
"movq $0xffffffffffff,%%rax\n" "movq $0xffffffffffff,%%rax\n"
"andq %%rax,%0\n" "andq %%rax,%%rsi\n"
"pushq %0\n" "movq %%rsi,%q2\n"
/* c = a0 * b0 */ /* c = a0 * b0 */
"movq 0(%%rbx),%%rax\n" "movq 0(%%rbx),%%rax\n"
"mulq %%r10\n" "mulq %%r10\n"
@ -145,26 +145,27 @@ __asm__ __volatile__(
"mulq %%r11\n" "mulq %%r11\n"
"addq %%rax,%%rcx\n" "addq %%rax,%%rcx\n"
"adcq %%rdx,%%r15\n" "adcq %%rdx,%%r15\n"
/* u0 = d & M (%0) */ /* u0 = d & M (%%rsi) */
"movq %%rcx,%0\n" "movq %%rcx,%%rsi\n"
"movq $0xfffffffffffff,%%rdx\n" "movq $0xfffffffffffff,%%rdx\n"
"andq %%rdx,%0\n" "andq %%rdx,%%rsi\n"
/* d >>= 52 */ /* d >>= 52 */
"shrdq $52,%%r15,%%rcx\n" "shrdq $52,%%r15,%%rcx\n"
"xorq %%r15,%%r15\n" "xorq %%r15,%%r15\n"
/* u0 = (u0 << 4) | tx (%0) */ /* u0 = (u0 << 4) | tx (%%rsi) */
"shlq $4,%0\n" "shlq $4,%%rsi\n"
"orq %%rbp,%0\n" "movq %q3,%%rax\n"
"orq %%rax,%%rsi\n"
/* c += u0 * (R >> 4) */ /* c += u0 * (R >> 4) */
"movq $0x1000003d1,%%rax\n" "movq $0x1000003d1,%%rax\n"
"mulq %0\n" "mulq %%rsi\n"
"addq %%rax,%%r8\n" "addq %%rax,%%r8\n"
"adcq %%rdx,%%r9\n" "adcq %%rdx,%%r9\n"
/* r[0] = c & M */ /* r[0] = c & M */
"movq %%r8,%%rax\n" "movq %%r8,%%rax\n"
"movq $0xfffffffffffff,%%rdx\n" "movq $0xfffffffffffff,%%rdx\n"
"andq %%rdx,%%rax\n" "andq %%rdx,%%rax\n"
"movq %%rax,0(%2)\n" "movq %%rax,0(%%rdi)\n"
/* c >>= 52 */ /* c >>= 52 */
"shrdq $52,%%r9,%%r8\n" "shrdq $52,%%r9,%%r8\n"
"xorq %%r9,%%r9\n" "xorq %%r9,%%r9\n"
@ -193,13 +194,12 @@ __asm__ __volatile__(
"mulq %%r12\n" "mulq %%r12\n"
"addq %%rax,%%rcx\n" "addq %%rax,%%rcx\n"
"adcq %%rdx,%%r15\n" "adcq %%rdx,%%r15\n"
/* restore rdp = R */
"movq $0x1000003d10,%%rbp\n"
/* c += (d & M) * R */ /* c += (d & M) * R */
"movq %%rcx,%%rax\n" "movq %%rcx,%%rax\n"
"movq $0xfffffffffffff,%%rdx\n" "movq $0xfffffffffffff,%%rdx\n"
"andq %%rdx,%%rax\n" "andq %%rdx,%%rax\n"
"mulq %%rbp\n" "movq $0x1000003d10,%%rdx\n"
"mulq %%rdx\n"
"addq %%rax,%%r8\n" "addq %%rax,%%r8\n"
"adcq %%rdx,%%r9\n" "adcq %%rdx,%%r9\n"
/* d >>= 52 */ /* d >>= 52 */
@ -209,7 +209,7 @@ __asm__ __volatile__(
"movq %%r8,%%rax\n" "movq %%r8,%%rax\n"
"movq $0xfffffffffffff,%%rdx\n" "movq $0xfffffffffffff,%%rdx\n"
"andq %%rdx,%%rax\n" "andq %%rdx,%%rax\n"
"movq %%rax,8(%2)\n" "movq %%rax,8(%%rdi)\n"
/* c >>= 52 */ /* c >>= 52 */
"shrdq $52,%%r9,%%r8\n" "shrdq $52,%%r9,%%r8\n"
"xorq %%r9,%%r9\n" "xorq %%r9,%%r9\n"
@ -228,9 +228,9 @@ __asm__ __volatile__(
"mulq %%r10\n" "mulq %%r10\n"
"addq %%rax,%%r8\n" "addq %%rax,%%r8\n"
"adcq %%rdx,%%r9\n" "adcq %%rdx,%%r9\n"
/* fetch t3 (%%r10, overwrites a0),t4 (%0) */ /* fetch t3 (%%r10, overwrites a0), t4 (%%rsi) */
"popq %0\n" "movq %q2,%%rsi\n"
"popq %%r10\n" "movq %q1,%%r10\n"
/* d += a4 * b3 */ /* d += a4 * b3 */
"movq 24(%%rbx),%%rax\n" "movq 24(%%rbx),%%rax\n"
"mulq %%r14\n" "mulq %%r14\n"
@ -245,7 +245,8 @@ __asm__ __volatile__(
"movq %%rcx,%%rax\n" "movq %%rcx,%%rax\n"
"movq $0xfffffffffffff,%%rdx\n" "movq $0xfffffffffffff,%%rdx\n"
"andq %%rdx,%%rax\n" "andq %%rdx,%%rax\n"
"mulq %%rbp\n" "movq $0x1000003d10,%%rdx\n"
"mulq %%rdx\n"
"addq %%rax,%%r8\n" "addq %%rax,%%r8\n"
"adcq %%rdx,%%r9\n" "adcq %%rdx,%%r9\n"
/* d >>= 52 (%%rcx only) */ /* d >>= 52 (%%rcx only) */
@ -254,7 +255,7 @@ __asm__ __volatile__(
"movq %%r8,%%rax\n" "movq %%r8,%%rax\n"
"movq $0xfffffffffffff,%%rdx\n" "movq $0xfffffffffffff,%%rdx\n"
"andq %%rdx,%%rax\n" "andq %%rdx,%%rax\n"
"movq %%rax,16(%2)\n" "movq %%rax,16(%%rdi)\n"
/* c >>= 52 */ /* c >>= 52 */
"shrdq $52,%%r9,%%r8\n" "shrdq $52,%%r9,%%r8\n"
"xorq %%r9,%%r9\n" "xorq %%r9,%%r9\n"
@ -262,23 +263,22 @@ __asm__ __volatile__(
"addq %%r10,%%r8\n" "addq %%r10,%%r8\n"
/* c += d * R */ /* c += d * R */
"movq %%rcx,%%rax\n" "movq %%rcx,%%rax\n"
"mulq %%rbp\n" "movq $0x1000003d10,%%rdx\n"
"mulq %%rdx\n"
"addq %%rax,%%r8\n" "addq %%rax,%%r8\n"
"adcq %%rdx,%%r9\n" "adcq %%rdx,%%r9\n"
/* r[3] = c & M */ /* r[3] = c & M */
"movq %%r8,%%rax\n" "movq %%r8,%%rax\n"
"movq $0xfffffffffffff,%%rdx\n" "movq $0xfffffffffffff,%%rdx\n"
"andq %%rdx,%%rax\n" "andq %%rdx,%%rax\n"
"movq %%rax,24(%2)\n" "movq %%rax,24(%%rdi)\n"
/* c >>= 52 (%%r8 only) */ /* c >>= 52 (%%r8 only) */
"shrdq $52,%%r9,%%r8\n" "shrdq $52,%%r9,%%r8\n"
/* c += t4 (%%r8 only) */ /* c += t4 (%%r8 only) */
"addq %0,%%r8\n" "addq %%rsi,%%r8\n"
/* r[4] = c */ /* r[4] = c */
"movq %%r8,32(%2)\n" "movq %%r8,32(%%rdi)\n"
: "+S"(a), "=m"(tmp1), "=m"(tmp2), "=m"(tmp3)
"popq %%rbp\n"
: "+S"(a)
: "b"(b), "D"(r) : "b"(b), "D"(r)
: "%rax", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15", "cc", "memory" : "%rax", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15", "cc", "memory"
); );
@ -291,19 +291,16 @@ SECP256K1_INLINE static void secp256k1_fe_sqr_inner(uint64_t *r, const uint64_t
* rcx:rbx = d * rcx:rbx = d
* r10-r14 = a0-a4 * r10-r14 = a0-a4
* r15 = M (0xfffffffffffff) * r15 = M (0xfffffffffffff)
* %1 = r * rdi = r
* %0 = a / t? * rsi = a / t?
* rbp = R (0x1000003d10)
*/ */
uint64_t tmp1, tmp2, tmp3;
__asm__ __volatile__( __asm__ __volatile__(
"pushq %%rbp\n" "movq 0(%%rsi),%%r10\n"
"movq 8(%%rsi),%%r11\n"
"movq 0(%0),%%r10\n" "movq 16(%%rsi),%%r12\n"
"movq 8(%0),%%r11\n" "movq 24(%%rsi),%%r13\n"
"movq 16(%0),%%r12\n" "movq 32(%%rsi),%%r14\n"
"movq 24(%0),%%r13\n"
"movq 32(%0),%%r14\n"
"movq $0x1000003d10,%%rbp\n"
"movq $0xfffffffffffff,%%r15\n" "movq $0xfffffffffffff,%%r15\n"
/* d = (a0*2) * a3 */ /* d = (a0*2) * a3 */
@ -323,15 +320,16 @@ __asm__ __volatile__(
"movq %%rdx,%%r9\n" "movq %%rdx,%%r9\n"
/* d += (c & M) * R */ /* d += (c & M) * R */
"andq %%r15,%%rax\n" "andq %%r15,%%rax\n"
"mulq %%rbp\n" "movq $0x1000003d10,%%rdx\n"
"mulq %%rdx\n"
"addq %%rax,%%rbx\n" "addq %%rax,%%rbx\n"
"adcq %%rdx,%%rcx\n" "adcq %%rdx,%%rcx\n"
/* c >>= 52 (%%r8 only) */ /* c >>= 52 (%%r8 only) */
"shrdq $52,%%r9,%%r8\n" "shrdq $52,%%r9,%%r8\n"
/* t3 (stack) = d & M */ /* t3 (tmp1) = d & M */
"movq %%rbx,%0\n" "movq %%rbx,%%rsi\n"
"andq %%r15,%0\n" "andq %%r15,%%rsi\n"
"pushq %0\n" "movq %%rsi,%q1\n"
/* d >>= 52 */ /* d >>= 52 */
"shrdq $52,%%rcx,%%rbx\n" "shrdq $52,%%rcx,%%rbx\n"
"xorq %%rcx,%%rcx\n" "xorq %%rcx,%%rcx\n"
@ -354,22 +352,24 @@ __asm__ __volatile__(
"adcq %%rdx,%%rcx\n" "adcq %%rdx,%%rcx\n"
/* d += c * R */ /* d += c * R */
"movq %%r8,%%rax\n" "movq %%r8,%%rax\n"
"mulq %%rbp\n" "movq $0x1000003d10,%%rdx\n"
"mulq %%rdx\n"
"addq %%rax,%%rbx\n" "addq %%rax,%%rbx\n"
"adcq %%rdx,%%rcx\n" "adcq %%rdx,%%rcx\n"
/* t4 = d & M (%0) */ /* t4 = d & M (%%rsi) */
"movq %%rbx,%0\n" "movq %%rbx,%%rsi\n"
"andq %%r15,%0\n" "andq %%r15,%%rsi\n"
/* d >>= 52 */ /* d >>= 52 */
"shrdq $52,%%rcx,%%rbx\n" "shrdq $52,%%rcx,%%rbx\n"
"xorq %%rcx,%%rcx\n" "xorq %%rcx,%%rcx\n"
/* tx = t4 >> 48 (%%rbp, overwrites constant) */ /* tx = t4 >> 48 (tmp3) */
"movq %0,%%rbp\n" "movq %%rsi,%%rax\n"
"shrq $48,%%rbp\n" "shrq $48,%%rax\n"
/* t4 &= (M >> 4) (stack) */ "movq %%rax,%q3\n"
/* t4 &= (M >> 4) (tmp2) */
"movq $0xffffffffffff,%%rax\n" "movq $0xffffffffffff,%%rax\n"
"andq %%rax,%0\n" "andq %%rax,%%rsi\n"
"pushq %0\n" "movq %%rsi,%q2\n"
/* c = a0 * a0 */ /* c = a0 * a0 */
"movq %%r10,%%rax\n" "movq %%r10,%%rax\n"
"mulq %%r10\n" "mulq %%r10\n"
@ -385,24 +385,25 @@ __asm__ __volatile__(
"mulq %%r13\n" "mulq %%r13\n"
"addq %%rax,%%rbx\n" "addq %%rax,%%rbx\n"
"adcq %%rdx,%%rcx\n" "adcq %%rdx,%%rcx\n"
/* u0 = d & M (%0) */ /* u0 = d & M (%%rsi) */
"movq %%rbx,%0\n" "movq %%rbx,%%rsi\n"
"andq %%r15,%0\n" "andq %%r15,%%rsi\n"
/* d >>= 52 */ /* d >>= 52 */
"shrdq $52,%%rcx,%%rbx\n" "shrdq $52,%%rcx,%%rbx\n"
"xorq %%rcx,%%rcx\n" "xorq %%rcx,%%rcx\n"
/* u0 = (u0 << 4) | tx (%0) */ /* u0 = (u0 << 4) | tx (%%rsi) */
"shlq $4,%0\n" "shlq $4,%%rsi\n"
"orq %%rbp,%0\n" "movq %q3,%%rax\n"
"orq %%rax,%%rsi\n"
/* c += u0 * (R >> 4) */ /* c += u0 * (R >> 4) */
"movq $0x1000003d1,%%rax\n" "movq $0x1000003d1,%%rax\n"
"mulq %0\n" "mulq %%rsi\n"
"addq %%rax,%%r8\n" "addq %%rax,%%r8\n"
"adcq %%rdx,%%r9\n" "adcq %%rdx,%%r9\n"
/* r[0] = c & M */ /* r[0] = c & M */
"movq %%r8,%%rax\n" "movq %%r8,%%rax\n"
"andq %%r15,%%rax\n" "andq %%r15,%%rax\n"
"movq %%rax,0(%1)\n" "movq %%rax,0(%%rdi)\n"
/* c >>= 52 */ /* c >>= 52 */
"shrdq $52,%%r9,%%r8\n" "shrdq $52,%%r9,%%r8\n"
"xorq %%r9,%%r9\n" "xorq %%r9,%%r9\n"
@ -423,12 +424,11 @@ __asm__ __volatile__(
"mulq %%r13\n" "mulq %%r13\n"
"addq %%rax,%%rbx\n" "addq %%rax,%%rbx\n"
"adcq %%rdx,%%rcx\n" "adcq %%rdx,%%rcx\n"
/* load R in %%rbp */
"movq $0x1000003d10,%%rbp\n"
/* c += (d & M) * R */ /* c += (d & M) * R */
"movq %%rbx,%%rax\n" "movq %%rbx,%%rax\n"
"andq %%r15,%%rax\n" "andq %%r15,%%rax\n"
"mulq %%rbp\n" "movq $0x1000003d10,%%rdx\n"
"mulq %%rdx\n"
"addq %%rax,%%r8\n" "addq %%rax,%%r8\n"
"adcq %%rdx,%%r9\n" "adcq %%rdx,%%r9\n"
/* d >>= 52 */ /* d >>= 52 */
@ -437,7 +437,7 @@ __asm__ __volatile__(
/* r[1] = c & M */ /* r[1] = c & M */
"movq %%r8,%%rax\n" "movq %%r8,%%rax\n"
"andq %%r15,%%rax\n" "andq %%r15,%%rax\n"
"movq %%rax,8(%1)\n" "movq %%rax,8(%%rdi)\n"
/* c >>= 52 */ /* c >>= 52 */
"shrdq $52,%%r9,%%r8\n" "shrdq $52,%%r9,%%r8\n"
"xorq %%r9,%%r9\n" "xorq %%r9,%%r9\n"
@ -446,9 +446,9 @@ __asm__ __volatile__(
"mulq %%r12\n" "mulq %%r12\n"
"addq %%rax,%%r8\n" "addq %%rax,%%r8\n"
"adcq %%rdx,%%r9\n" "adcq %%rdx,%%r9\n"
/* fetch t3 (%%r10, overwrites a0),t4 (%0) */ /* fetch t3 (%%r10, overwrites a0),t4 (%%rsi) */
"popq %0\n" "movq %q2,%%rsi\n"
"popq %%r10\n" "movq %q1,%%r10\n"
/* c += a1 * a1 */ /* c += a1 * a1 */
"movq %%r11,%%rax\n" "movq %%r11,%%rax\n"
"mulq %%r11\n" "mulq %%r11\n"
@ -462,7 +462,8 @@ __asm__ __volatile__(
/* c += (d & M) * R */ /* c += (d & M) * R */
"movq %%rbx,%%rax\n" "movq %%rbx,%%rax\n"
"andq %%r15,%%rax\n" "andq %%r15,%%rax\n"
"mulq %%rbp\n" "movq $0x1000003d10,%%rdx\n"
"mulq %%rdx\n"
"addq %%rax,%%r8\n" "addq %%rax,%%r8\n"
"adcq %%rdx,%%r9\n" "adcq %%rdx,%%r9\n"
/* d >>= 52 (%%rbx only) */ /* d >>= 52 (%%rbx only) */
@ -470,7 +471,7 @@ __asm__ __volatile__(
/* r[2] = c & M */ /* r[2] = c & M */
"movq %%r8,%%rax\n" "movq %%r8,%%rax\n"
"andq %%r15,%%rax\n" "andq %%r15,%%rax\n"
"movq %%rax,16(%1)\n" "movq %%rax,16(%%rdi)\n"
/* c >>= 52 */ /* c >>= 52 */
"shrdq $52,%%r9,%%r8\n" "shrdq $52,%%r9,%%r8\n"
"xorq %%r9,%%r9\n" "xorq %%r9,%%r9\n"
@ -478,22 +479,21 @@ __asm__ __volatile__(
"addq %%r10,%%r8\n" "addq %%r10,%%r8\n"
/* c += d * R */ /* c += d * R */
"movq %%rbx,%%rax\n" "movq %%rbx,%%rax\n"
"mulq %%rbp\n" "movq $0x1000003d10,%%rdx\n"
"mulq %%rdx\n"
"addq %%rax,%%r8\n" "addq %%rax,%%r8\n"
"adcq %%rdx,%%r9\n" "adcq %%rdx,%%r9\n"
/* r[3] = c & M */ /* r[3] = c & M */
"movq %%r8,%%rax\n" "movq %%r8,%%rax\n"
"andq %%r15,%%rax\n" "andq %%r15,%%rax\n"
"movq %%rax,24(%1)\n" "movq %%rax,24(%%rdi)\n"
/* c >>= 52 (%%r8 only) */ /* c >>= 52 (%%r8 only) */
"shrdq $52,%%r9,%%r8\n" "shrdq $52,%%r9,%%r8\n"
/* c += t4 (%%r8 only) */ /* c += t4 (%%r8 only) */
"addq %0,%%r8\n" "addq %%rsi,%%r8\n"
/* r[4] = c */ /* r[4] = c */
"movq %%r8,32(%1)\n" "movq %%r8,32(%%rdi)\n"
: "+S"(a), "=m"(tmp1), "=m"(tmp2), "=m"(tmp3)
"popq %%rbp\n"
: "+S"(a)
: "D"(r) : "D"(r)
: "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15", "cc", "memory" : "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15", "cc", "memory"
); );