diff --git a/Makefile b/Makefile
index 05a9715507..0acf12c366 100644
--- a/Makefile
+++ b/Makefile
@@ -1,10 +1,11 @@
 FLAGS_COMMON:=-Wall
-FLAGS_PROD:=-DNDEBUG -g0 -O3 -march=native
+FLAGS_PROD:=-DNDEBUG -O2 -march=native
 FLAGS_DEBUG:=-DVERIFY_MAGNITUDE -ggdb3 -O1
-FLAGS_TEST:=-DVERIFY_MAGNITUDE -ggdb3 -O3 -march=native
+FLAGS_TEST:=-DVERIFY_MAGNITUDE -ggdb3 -O2 -march=native
 
 SECP256K1_FILES := num.h field.h group.h ecmult.h ecdsa.h \
                    num.cpp field.cpp group.cpp ecmult.cpp ecdsa.cpp \
+                   lin64.asm
 
 ifndef CONF
 CONF := gmp
@@ -22,7 +23,7 @@ SECP256K1_FILES := $(SECP256K1_FILES) num_gmp.h num_gmp.cpp
 endif
 endif
 
-all: *.cpp *.h
+all: *.cpp *.asm *.h
 	+make CONF=openssl all-openssl
 	+make CONF=gmp all-gmp
 
@@ -41,8 +42,11 @@ clean-$(CONF):
 obj/secp256k1-$(CONF).o: $(SECP256K1_FILES)
 	$(CXX) $(FLAGS_COMMON) $(FLAGS_PROD) $(FLAGS_CONF) secp256k1.cpp -c -o obj/secp256k1-$(CONF).o
 
-bench-$(CONF): obj/secp256k1-$(CONF).o bench.cpp
-	$(CXX) $(FLAGS_COMMON) $(FLAGS_PROD) $(FLAGS_CONF) obj/secp256k1-$(CONF).o bench.cpp $(LIBS) -o bench-$(CONF)
+obj/lin64.o: lin64.asm
+	~/tmp/jwasm -Fo obj/lin64.o -elf64 lin64.asm
 
-tests-$(CONF): $(SECP256K1_FILES) tests.cpp
-	$(CXX) $(FLAGS_COMMON) $(FLAGS_TEST) $(FLAGS_CONF) tests.cpp $(LIBS) -o tests-$(CONF)
+bench-$(CONF): obj/secp256k1-$(CONF).o bench.cpp obj/lin64.o
+	$(CXX) $(FLAGS_COMMON) $(FLAGS_PROD) $(FLAGS_CONF) obj/secp256k1-$(CONF).o obj/lin64.o bench.cpp $(LIBS) -o bench-$(CONF)
+
+tests-$(CONF): $(SECP256K1_FILES) tests.cpp obj/lin64.o
+	$(CXX) $(FLAGS_COMMON) $(FLAGS_TEST) $(FLAGS_CONF) obj/lin64.o tests.cpp $(LIBS) -o tests-$(CONF)
diff --git a/field.cpp b/field.cpp
index b57ffc755a..37d71795f1 100644
--- a/field.cpp
+++ b/field.cpp
@@ -3,11 +3,13 @@ using namespace std;
 #include 
 #include 
 #include 
-
+#define INLINE_ASM
+#include "lin64.h"
 #include "num.h"
 #include "field.h"
-
+#include 
 // #define VERIFY_MAGNITUDE 1
+using namespace std;
 
 namespace secp256k1 {
 
@@ -164,7 +166,11 @@ void FieldElem::SetMult(const FieldElem &a, const FieldElem &b) {
     assert(a.magnitude <= 8);
     assert(b.magnitude <= 8);
 #endif
-    __int128 c = (__int128)a.n[0] * b.n[0];
+
+#ifdef INLINE_ASM
+    _ExSetMult((uint64_t *) a.n,(uint64_t *) b.n, (uint64_t *) n);
+#else
+    unsigned __int128 c = (__int128)a.n[0] * b.n[0];
     uint64_t t0 = c & 0xFFFFFFFFFFFFFULL; c = c >> 52; // c max 0FFFFFFFFFFFFFE0
     c = c + (__int128)a.n[0] * b.n[1] + (__int128)a.n[1] * b.n[0];
@@ -199,8 +205,8 @@ void FieldElem::SetMult(const FieldElem &a, const FieldElem &b) {
     c = c + (__int128)a.n[4] * b.n[4];
     uint64_t t8 = c & 0xFFFFFFFFFFFFFULL; c = c >> 52; // c max 001000000000001E
     uint64_t t9 = c;
-    c = t0 + (__int128)t5 * 0x1000003D10ULL;
+    c = t0 + (__int128)t5 * 0x1000003D10ULL;
     t0 = c & 0xFFFFFFFFFFFFFULL; c = c >> 52; // c max 0000001000003D10
     c = c + t1 + (__int128)t6 * 0x1000003D10ULL;
     t1 = c & 0xFFFFFFFFFFFFFULL; c = c >> 52; // c max 0000001000003D10
@@ -213,6 +219,7 @@ void FieldElem::SetMult(const FieldElem &a, const FieldElem &b) {
     c = t0 + (__int128)c * 0x1000003D1ULL;
     n[0] = c & 0xFFFFFFFFFFFFFULL; c = c >> 52; // c max 1000008
     n[1] = t1 + c;
+#endif
 #ifdef VERIFY_MAGNITUDE
     magnitude = 1;
     normalized = false;
@@ -223,6 +230,10 @@ void FieldElem::SetSquare(const FieldElem &a) {
 #ifdef VERIFY_MAGNITUDE
     assert(a.magnitude <= 8);
 #endif
+
+#ifdef INLINE_ASM
+    _ExSetSquare((uint64_t *)a.n,(uint64_t *)n);
+#else
     __int128 c = (__int128)a.n[0] * a.n[0];
     uint64_t t0 = c & 0xFFFFFFFFFFFFFULL; c = c >> 52; // c max 0FFFFFFFFFFFFFE0
     c = c + (__int128)(a.n[0]*2) * a.n[1];
@@ -261,6 +272,8 @@ void FieldElem::SetSquare(const FieldElem &a) {
     c = t0 + (__int128)c * 0x1000003D1ULL;
     n[0] = c & 0xFFFFFFFFFFFFFULL; c = c >> 52; // c max 1000008
     n[1] = t1 + c;
+#endif
+
 #ifdef VERIFY_MAGNITUDE
     assert(a.magnitude <= 8);
     normalized = false;
diff --git a/field.h b/field.h
index 933a2e8449..5191900b18 100644
--- a/field.h
+++ b/field.h
@@ -13,7 +13,7 @@ using namespace std;
 namespace secp256k1 {
 
 /** Implements arithmetic modulo FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFE FFFFFC2F,
- *  represented as 5 uint64_t's in base 2^52. The values are allowed to contain >52 each. In particular,
+ *  represented as 5 uint64_t's in base 2^52. he values are allowed to contain >52 each. In particular,
 *  each FieldElem has a 'magnitude' associated with it. Internally, a magnitude M means each element
 *  is at most M*(2^53-1), except the most significant one, which is limited to M*(2^49-1). All operations
 *  accept any input with magnitude at most M, and have different rules for propagating magnitude to their
diff --git a/lin64.asm b/lin64.asm
new file mode 100644
index 0000000000..79083bd7d2
--- /dev/null
+++ b/lin64.asm
@@ -0,0 +1,432 @@
+        .x64
+QTEST   EQU 1
+        .code
+
+        ;; Register Layout:
+        ;; INPUT:    rdi = a.n
+        ;;           rsi = b.n
+        ;;           rdx = this.a
+        ;; OUTPUT:   [rbx]
+        ;; INTERNAL: rdx:rax = multiplication accumulator
+        ;;           rsi     = b.n / t9
+        ;;           r8:r9   = c
+        ;;           r10-r15 = t0-t5
+        ;;           rbx     = t6
+        ;;           rcx     = t7
+        ;;           rbp     = t8
+ExSetMult PROC C PUBLIC USES rbx rbp r12 r13 r14 r15
+        push rdx
+        mov r14,[rsi+8*0]
+
+        ;; c=a.n[0] * b.n[0]
+        mov rax,[rdi+0*8]
+        mov rbp,0FFFFFFFFFFFFFh
+        mul r14                 ; rsi=b.n[0]
+        mov r15,[rsi+1*8]
+        mov r10,rbp
+        mov r8,rax
+        and r10,rax             ; only need lower qword
+        shrd r8,rdx,52
+        xor r9,r9
+
+        ;; c+=a.n[0] * b.n[1] + a.n[1] * b.n[0]
+        mov rax,[rdi+0*8]
+        mul r15                 ; b.n[1]
+        add r8,rax
+        adc r9,rdx
+
+        mov rax,[rdi+1*8]
+        mul r14                 ; b.n[0]
+        mov r11,rbp
+        mov rbx,[rsi+2*8]
+        add r8,rax
+        adc r9,rdx
+        and r11,r8
+        shrd r8,r9,52
+        xor r9,r9
+
+        ;; c+=a.n[0 1 2] * b.n[2 1 0]
+        mov rax,[rdi+0*8]
+        mul rbx                 ; b.n[2]
+        add r8,rax
+        adc r9,rdx
+
+        mov rax,[rdi+1*8]
+        mul r15                 ; b.n[1]
+        add r8,rax
+        adc r9,rdx
+
+        mov rax,[rdi+2*8]
+        mul r14
+        mov r12,rbp             ; modulus
+        mov rcx,[rsi+3*8]
+        add r8,rax
+        adc r9,rdx
+        and r12,r8              ; only need lower dword
+        shrd r8,r9,52
+        xor r9,r9
+
+        ;; c+=a.n[0 1 2 3] * b.n[3 2 1 0]
+        mov rax,[rdi+0*8]
+        mul rcx                 ; b.n[3]
+        add r8,rax
+        adc r9,rdx
+
+        mov rax,[rdi+1*8]
+        mul rbx                 ; b.n[2]
+        add r8,rax
+        adc r9,rdx
+
+        mov rax,[rdi+2*8]
+        mul r15                 ; b.n[1]
+        add r8,rax
+        adc r9,rdx
+
+        mov rax,[rdi+3*8]
+        mul r14                 ; b.n[0]
+        mov r13,rbp             ; modulus
+        mov rsi,[rsi+4*8]       ; load b.n[4] and destroy pointer
+        add r8,rax
+        adc r9,rdx
+        and r13,r8
+
+        shrd r8,r9,52
+        xor r9,r9
+
+
+        ;; c+=a.n[0 1 2 3 4] * b.n[4 3 2 1 0]
+        mov rax,[rdi+0*8]
+        mul rsi
+        add r8,rax
+        adc r9,rdx
+
+        mov rax,[rdi+1*8]
+        mul rcx
+        add r8,rax
+        adc r9,rdx
+
+        mov rax,[rdi+2*8]
+        mul rbx                 ; b.n[2]
+        add r8,rax
+        adc r9,rdx
+
+        mov rax,[rdi+3*8]
+        mul r15                 ; b.n[1]
+        add r8,rax
+        adc r9,rdx
+
+        mov rax,[rdi+4*8]
+        mul r14                 ; b.n[0]
+        mov r14,rbp             ; modulus
+        add r8,rax
+        adc r9,rdx
+        and r14,r8
+        shrd r8,r9,52
+        xor r9,r9
+
+        ;; c+=a.n[1 2 3 4] * b.n[4 3 2 1]
+        mov rax,[rdi+1*8]
+        mul rsi
+        add r8,rax
+        adc r9,rdx
+
+        mov rax,[rdi+2*8]
+        mul rcx
+        add r8,rax
+        adc r9,rdx
+
+        mov rax,[rdi+3*8]
+        mul rbx
+        add r8,rax
+        adc r9,rdx
+
+        mov rax,[rdi+4*8]
+        mul r15
+        mov r15,rbp             ; modulus
+        add r8,rax
+        adc r9,rdx
+
+        and r15,r8
+        shrd r8,r9,52
+        xor r9,r9
+
+        ;; c+=a.n[2 3 4] * b.n[4 3 2]
+        mov rax,[rdi+2*8]
+        mul rsi
+        add r8,rax
+        adc r9,rdx
+
+        mov rax,[rdi+3*8]
+        mul rcx
+        add r8,rax
+        adc r9,rdx
+
+        mov rax,[rdi+4*8]
+        mul rbx
+        mov rbx,rbp             ; modulus
+        add r8,rax
+        adc r9,rdx
+
+        and rbx,r8              ; only need lower dword
+        shrd r8,r9,52
+        xor r9,r9
+
+        ;; c+=a.n[3 4] * b.n[4 3]
+        mov rax,[rdi+3*8]
+        mul rsi
+        add r8,rax
+        adc r9,rdx
+
+        mov rax,[rdi+4*8]
+        mul rcx
+        mov rcx,rbp             ; modulus
+        add r8,rax
+        adc r9,rdx
+        and rcx,r8              ; only need lower dword
+        shrd r8,r9,52
+        xor r9,r9
+
+        ;; c+=a.n[4] * b.n[4]
+        mov rax,[rdi+4*8]
+        mul rsi
+        ;; mov rbp,rbp          ; modulus already there!
+        add r8,rax
+        adc r9,rdx
+        and rbp,r8
+        shrd r8,r9,52
+        xor r9,r9
+
+        mov rsi,r8
+
+        ;; *******************************************************
+common_exit_norm::
+        mov rdi,01000003D10h
+
+        mov rax,r15             ; get t5
+        mul rdi
+        add rax,r10             ; +t0
+        adc rdx,0
+        mov r10,0FFFFFFFFFFFFFh ; modulus
+        mov r8,rax              ; +c
+        and r10,rax
+        shrd r8,rdx,52
+        xor r9,r9
+
+        mov rax,rbx             ; get t6
+        mul rdi
+        add rax,r11             ; +t1
+        adc rdx,0
+        mov r11,0FFFFFFFFFFFFFh ; modulus
+        add r8,rax              ; +c
+        adc r9,rdx
+        and r11,r8
+        shrd r8,r9,52
+        xor r9,r9
+
+        mov rax,rcx             ; get t7
+        mul rdi
+        add rax,r12             ; +t2
+        adc rdx,0
+        pop rbx                 ; retrieve pointer to this.a.n
+        mov r12,0FFFFFFFFFFFFFh ; modulus
+        add r8,rax              ; +c
+        adc r9,rdx
+        and r12,r8
+        mov [rbx+2*8],r12
+        shrd r8,r9,52
+        xor r9,r9
+
+        mov rax,rbp             ; get t8
+        mul rdi
+        add rax,r13             ; +t3
+        adc rdx,0
+        mov r13,0FFFFFFFFFFFFFh ; modulus
+        add r8,rax              ; +c
+        adc r9,rdx
+        and r13,r8
+        mov [rbx+3*8],r13
+        shrd r8,r9,52
+        xor r9,r9
+
+        mov rax,rsi             ; get t9
+        mul rdi
+        add rax,r14             ; +t4
+        adc rdx,0
+        mov r14,0FFFFFFFFFFFFh  ; !!!
+        add r8,rax              ; +c
+        adc r9,rdx
+        and r14,r8
+        mov [rbx+4*8],r14
+        shrd r8,r9,48
+        xor r9,r9
+
+        mov rax,01000003D1h
+        mul r8
+        add rax,r10
+        adc rdx,0
+        mov r10,0FFFFFFFFFFFFFh ; modulus
+        mov r8,rax
+        and rax,r10
+        shrd r8,rdx,52
+        mov [rbx+0*8],rax
+        add r8,r11
+        mov [rbx+1*8],r8
+        ret
+ExSetMult ENDP
+
+
+
+
+
+        ;; Register Layout:
+        ;; INPUT:    rdi = a.n
+        ;;           rsi = this.a
+        ;; OUTPUT:   [rsi]
+        ;; INTERNAL: rdx:rax = multiplication accumulator
+        ;;           r8:r9   = c
+        ;;           r10-r14 = t0-t4
+        ;;           r15     = a.n[0]*2 / t5
+        ;;           rbx     = a.n[1]*2 / t6
+        ;;           rcx     = a.n[2]*2 / t7
+        ;;           rbp     = a.n[3]*2 / t8
+        ;;           rsi     = a.n[4] / t9
+ExSetSquare PROC C PUBLIC USES rbx rbp r12 r13 r14 r15
+        push rsi
+        mov rsi,0FFFFFFFFFFFFFh
+
+        ;; c=a.n[0] * a.n[0]
+        mov r15,[rdi+0*8]
+        mov r10,rsi             ; modulus
+        mov rax,r15
+        mul rax                 ; rsi=b.n[0]
+        mov rbx,[rdi+1*8]       ; a.n[1]
+        add r15,r15             ; r15=2*a.n[0]
+        mov r8,rax
+        and r10,rax             ; only need lower qword
+        shrd r8,rdx,52
+        xor r9,r9
+
+        ;; c+=2*a.n[0] * a.n[1]
+        mov rax,r15
+        mul rbx
+        mov rcx,[rdi+2*8]       ; rcx=a.n[2]
+        mov r11,rsi             ; modulus
+        add r8,rax
+        adc r9,rdx
+        and r11,r8
+        shrd r8,r9,52
+        xor r9,r9
+
+        ;; c+=2*a.n[0]*a.n[2]+a.n[1]*a.n[1]
+        mov rax,r15
+        mul rcx
+        add r8,rax
+        adc r9,rdx
+
+        mov rax,rbx
+        mov r12,rsi             ; modulus
+        mul rax
+        mov rbp,[rdi+3*8]       ; rbp=a.n[3]
+        add rbx,rbx             ; rbx=a.n[1]*2
+        add r8,rax
+        adc r9,rdx
+
+        and r12,r8              ; only need lower dword
+        shrd r8,r9,52
+        xor r9,r9
+
+        ;; c+=2*a.n[0]*a.n[3]+2*a.n[1]*a.n[2]
+        mov rax,r15
+        mul rbp
+        add r8,rax
+        adc r9,rdx
+
+        mov rax,rbx             ; rax=2*a.n[1]
+        mov r13,rsi             ; modulus
+        mul rcx
+        mov rsi,[rdi+4*8]       ; rsi=a.n[4] / destroy constant
+        add r8,rax
+        adc r9,rdx
+        and r13,r8
+        shrd r8,r9,52
+        xor r9,r9
+
+        ;; c+=2*a.n[0]*a.n[4]+2*a.n[1]*a.n[3]+a.n[2]*a.n[2]
+        mov rax,r15             ; last time we need 2*a.n[0]
+        mul rsi
+        add r8,rax
+        adc r9,rdx
+
+        mov rax,rbx
+        mul rbp
+        mov r14,0FFFFFFFFFFFFFh ; modulus
+        add r8,rax
+        adc r9,rdx
+
+        mov rax,rcx
+        mul rax
+        add rcx,rcx             ; rcx=2*a.n[2]
+        add r8,rax
+        adc r9,rdx
+        and r14,r8
+        shrd r8,r9,52
+        xor r9,r9
+
+        ;; c+=2*a.n[1]*a.n[4]+2*a.n[2]*a.n[3]
+        mov rax,rbx
+        mul rsi
+        add r8,rax
+        adc r9,rdx
+
+        mov rax,rcx
+        mul rbp
+        mov r15,0FFFFFFFFFFFFFh ; modulus
+        add r8,rax
+        adc r9,rdx
+        and r15,r8
+        shrd r8,r9,52
+        xor r9,r9
+
+        ;; c+=2*a.n[2]*a.n[4]+a.n[3]*a.n[3]
+        mov rax,rcx             ; 2*a.n[2]
+        mul rsi
+        add r8,rax
+        adc r9,rdx
+
+        mov rax,rbp             ; a.n[3]
+        mul rax
+        mov rbx,0FFFFFFFFFFFFFh ; modulus
+        add r8,rax
+        adc r9,rdx
+        and rbx,r8              ; only need lower dword
+        lea rax,[2*rbp]
+        shrd r8,r9,52
+        xor r9,r9
+
+        ;; c+=2*a.n[3]*a.n[4]
+        mul rsi
+        mov rcx,0FFFFFFFFFFFFFh ; modulus
+        add r8,rax
+        adc r9,rdx
+        and rcx,r8              ; only need lower dword
+        shrd r8,r9,52
+        xor r9,r9
+
+        ;; c+=a.n[4]*a.n[4]
+        mov rax,rsi
+        mul rax
+        mov rbp,0FFFFFFFFFFFFFh ; modulus
+        add r8,rax
+        adc r9,rdx
+        and rbp,r8
+        shrd r8,r9,52
+        xor r9,r9
+
+        mov rsi,r8
+
+        ;; *******************************************************
+        jmp common_exit_norm
+ExSetSquare ENDP
+        end
+
+
diff --git a/lin64.h b/lin64.h
new file mode 100644
index 0000000000..45fadda7ab
--- /dev/null
+++ b/lin64.h
@@ -0,0 +1,10 @@
+#ifndef _SECP256K1_LIN64
+#define _SECP256K1_LIN64
+
+using namespace std;
+
+#ifdef INLINE_ASM
+extern "C" void _ExSetMult(uint64_t *, uint64_t *, uint64_t *);
+extern "C" void _ExSetSquare(uint64_t *, uint64_t *);
+#endif
+#endif
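Note (background, not part of the patch): both the assembly routines and the portable unsigned __int128 path they replace work on the 5x52-bit limb representation described in field.h, a = sum_{i=0..4} a_i * 2^(52*i). A full product therefore expands into ten partial sums t0..t9 at weights 2^(52*k), and common_exit_norm folds the upper five back into the lower five using the shape of the secp256k1 prime. The identities it relies on, written out:

    p = 2^{256} - (2^{32} + 977) = 2^{256} - \mathtt{0x1000003D1}

    2^{260} = 2^{52 \cdot 5} \equiv 2^{4} (2^{32} + 977) = \mathtt{0x1000003D10} \pmod{p}

so t_{5+i} \cdot 2^{52(5+i)} \equiv t_{5+i} \cdot \mathtt{0x1000003D10} \cdot 2^{52 i} \pmod{p}, which is what the "get t5" through "get t9" multiplications by rdi implement. The 48-bit mask flagged with "!!!" exists because the top limb carries weight 2^{208}: whatever lands at or above its bit 48 has weight at least 2^{256}, so that overflow is folded back one more time with 0x1000003D1 before n[0] and n[1] are stored.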
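A cheap way to exercise the routines outside the FieldElem wrapper is to check them against each other: multiplication is symmetric in its arguments, and squaring must agree limb-for-limb with multiplying a value by itself, since both compute the same t0..t9 and run the same reduction. Below is a minimal, hypothetical self-check. The file name check_lin64.cpp, the RNG, and the iteration count are illustrative only, and it assumes the extern "C" declarations from lin64.h resolve to the assembled ExSetMult/ExSetSquare procedures; inputs are kept fully reduced (limbs below 2^52, top limb below 2^48), well within the magnitude <= 8 bound asserted in field.cpp.

// check_lin64.cpp -- hypothetical self-check sketch, not part of the patch.
// Assumes the declarations from lin64.h link against the routines in lin64.asm.
#include <cstdint>
#include <cstdio>
#include <cstring>
#include <random>

extern "C" void _ExSetMult(uint64_t *, uint64_t *, uint64_t *);
extern "C" void _ExSetSquare(uint64_t *, uint64_t *);

int main() {
    std::mt19937_64 rng(42);
    uint64_t a[5], b[5], r1[5], r2[5];
    for (int iter = 0; iter < 100000; iter++) {
        for (int i = 0; i < 5; i++) {
            a[i] = rng() & 0xFFFFFFFFFFFFFULL;   // limbs below 2^52
            b[i] = rng() & 0xFFFFFFFFFFFFFULL;
        }
        a[4] &= 0xFFFFFFFFFFFFULL;               // top limb below 2^48
        b[4] &= 0xFFFFFFFFFFFFULL;
        _ExSetMult(a, b, r1);                    // a*b
        _ExSetMult(b, a, r2);                    // b*a: same partial sums, same limbs
        if (memcmp(r1, r2, sizeof(r1))) { std::printf("mult not symmetric\n"); return 1; }
        _ExSetSquare(a, r1);                     // a^2 must match a*a limb for limb
        _ExSetMult(a, a, r2);
        if (memcmp(r1, r2, sizeof(r1))) { std::printf("square != mult\n"); return 1; }
    }
    std::printf("ok\n");
    return 0;
}

Something along the lines of g++ check_lin64.cpp obj/lin64.o -o check_lin64 (after the jwasm rule above has produced obj/lin64.o) would link it; the exact command depends on the local setup.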