x86_64 specific assembly version of field code

2025-01-12 04:42:36 -03:00 · 2013-03-26 00:39:22 +01:00 · 2013-03-26 00:39:22 +01:00 · 37c4ab156d
commit 37c4ab156d
parent 0cd6372fe8
5 changed files with 471 additions and 12 deletions
--- a/18
+++ b/18
@ -1,10 +1,11 @@
 FLAGS_COMMON:=-Wall
-FLAGS_PROD:=-DNDEBUG -g0 -O3 -march=native
+FLAGS_PROD:=-DNDEBUG -O2 -march=native
 FLAGS_DEBUG:=-DVERIFY_MAGNITUDE -ggdb3 -O1
-FLAGS_TEST:=-DVERIFY_MAGNITUDE -ggdb3 -O3 -march=native
+FLAGS_TEST:=-DVERIFY_MAGNITUDE -ggdb3 -O2 -march=native

 SECP256K1_FILES := num.h   field.h   group.h   ecmult.h   ecdsa.h \
                   num.cpp field.cpp group.cpp ecmult.cpp ecdsa.cpp \
+		   lin64.asm

 ifndef CONF
 CONF := gmp
@ -22,7 +23,7 @@ SECP256K1_FILES := $(SECP256K1_FILES) num_gmp.h num_gmp.cpp
 endif
 endif

-all: *.cpp *.h
+all: *.cpp *.asm *.h
 	+make CONF=openssl all-openssl
 	+make CONF=gmp     all-gmp

@ -41,8 +42,11 @@ clean-$(CONF):
 obj/secp256k1-$(CONF).o: $(SECP256K1_FILES)
 	$(CXX) $(FLAGS_COMMON) $(FLAGS_PROD) $(FLAGS_CONF) secp256k1.cpp -c -o obj/secp256k1-$(CONF).o

-bench-$(CONF): obj/secp256k1-$(CONF).o bench.cpp
-	$(CXX) $(FLAGS_COMMON) $(FLAGS_PROD) $(FLAGS_CONF) obj/secp256k1-$(CONF).o bench.cpp $(LIBS) -o bench-$(CONF)
+obj/lin64.o: lin64.asm
+	~/tmp/jwasm -Fo obj/lin64.o -elf64 lin64.asm 

-tests-$(CONF): $(SECP256K1_FILES) tests.cpp
-	$(CXX) $(FLAGS_COMMON) $(FLAGS_TEST) $(FLAGS_CONF) tests.cpp $(LIBS) -o tests-$(CONF)
+bench-$(CONF): obj/secp256k1-$(CONF).o bench.cpp obj/lin64.o
+	$(CXX) $(FLAGS_COMMON) $(FLAGS_PROD) $(FLAGS_CONF) obj/secp256k1-$(CONF).o obj/lin64.o bench.cpp $(LIBS) -o bench-$(CONF)
+
+tests-$(CONF): $(SECP256K1_FILES) tests.cpp obj/lin64.o
+	$(CXX) $(FLAGS_COMMON) $(FLAGS_TEST) $(FLAGS_CONF) obj/lin64.o tests.cpp $(LIBS) -o tests-$(CONF)
--- a/field.cpp
+++ b/field.cpp
@ -3,11 +3,13 @@ using namespace std;
 #include <assert.h>
 #include <stdint.h>
 #include <string>
-
+#define INLINE_ASM
+#include "lin64.h"
 #include "num.h"
 #include "field.h"
-
+#include <iostream>
 // #define VERIFY_MAGNITUDE 1
+using namespace std;

 namespace secp256k1 {

@ -164,7 +166,11 @@ void FieldElem::SetMult(const FieldElem &a, const FieldElem &b) {
    assert(a.magnitude <= 8);
    assert(b.magnitude <= 8);
 #endif
-    __int128 c = (__int128)a.n[0] * b.n[0];
+
+#ifdef INLINE_ASM
+    _ExSetMult((uint64_t *) a.n,(uint64_t *) b.n, (uint64_t *) n);
+#else
+    unsigned __int128 c = (__int128)a.n[0] * b.n[0];
    uint64_t t0 = c & 0xFFFFFFFFFFFFFULL; c = c >> 52; // c max 0FFFFFFFFFFFFFE0
    c = c + (__int128)a.n[0] * b.n[1] +
            (__int128)a.n[1] * b.n[0];
@ -199,8 +205,8 @@ void FieldElem::SetMult(const FieldElem &a, const FieldElem &b) {
    c = c + (__int128)a.n[4] * b.n[4];
    uint64_t t8 = c & 0xFFFFFFFFFFFFFULL; c = c >> 52; // c max 001000000000001E
    uint64_t t9 = c;
-    c = t0 + (__int128)t5 * 0x1000003D10ULL;

+    c = t0 + (__int128)t5 * 0x1000003D10ULL;
    t0 = c & 0xFFFFFFFFFFFFFULL; c = c >> 52; // c max 0000001000003D10
    c = c + t1 + (__int128)t6 * 0x1000003D10ULL;
    t1 = c & 0xFFFFFFFFFFFFFULL; c = c >> 52; // c max 0000001000003D10
@ -213,6 +219,7 @@ void FieldElem::SetMult(const FieldElem &a, const FieldElem &b) {
    c = t0 + (__int128)c * 0x1000003D1ULL;
    n[0] = c & 0xFFFFFFFFFFFFFULL; c = c >> 52; // c max 1000008
    n[1] = t1 + c;
+#endif
 #ifdef VERIFY_MAGNITUDE
    magnitude = 1;
    normalized = false;
@ -223,6 +230,10 @@ void FieldElem::SetSquare(const FieldElem &a) {
 #ifdef VERIFY_MAGNITUDE
    assert(a.magnitude <= 8);
 #endif
+
+#ifdef INLINE_ASM
+    _ExSetSquare((uint64_t *)a.n,(uint64_t *)n);
+#else
    __int128 c = (__int128)a.n[0] * a.n[0];
    uint64_t t0 = c & 0xFFFFFFFFFFFFFULL; c = c >> 52; // c max 0FFFFFFFFFFFFFE0
    c = c + (__int128)(a.n[0]*2) * a.n[1];
@ -261,6 +272,8 @@ void FieldElem::SetSquare(const FieldElem &a) {
    c = t0 + (__int128)c * 0x1000003D1ULL;
    n[0] = c & 0xFFFFFFFFFFFFFULL; c = c >> 52; // c max 1000008
    n[1] = t1 + c;
+#endif
+
 #ifdef VERIFY_MAGNITUDE
    assert(a.magnitude <= 8);
    normalized = false;
--- a/field.h
+++ b/field.h
@ -13,7 +13,7 @@ using namespace std;
 namespace secp256k1 {

 /** Implements arithmetic modulo FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFE FFFFFC2F,
- *  represented as 5 uint64_t's in base 2^52. The values are allowed to contain >52 each. In particular,
+ *  represented as 5 uint64_t's in base 2^52. he values are allowed to contain >52 each. In particular,
 *  each FieldElem has a 'magnitude' associated with it. Internally, a magnitude M means each element
 *  is at most M*(2^53-1), except the most significant one, which is limited to M*(2^49-1). All operations
 *  accept any input with magnitude at most M, and have different rules for propagating magnitude to their
--- a/lin64.asm
+++ b/lin64.asm
@ -0,0 +1,432 @@
+	.x64
+QTEST	EQU 1
+	.code
+	;;  Register Layout:
+	;;  INPUT: 	rdi	= a.n
+	;; 	   	rsi  	= b.n
+	;; 	   	rdx  	= this.a
+	;;  OUTPUT:	[rbx]
+	;;  INTERNAL:	rdx:rax  = multiplication accumulator
+	;; 		rsi	 = b.n / t9
+	;; 		r8:r9    = c
+	;; 		r10-r15  = t0-t5
+	;; 		rbx	 = t6
+	;; 		rcx	 = t7
+	;; 		rbp	 = t8
+ExSetMult	PROC C PUBLIC USES rbx rbp r12 r13 r14 r15
+	push rdx
+	mov r14,[rsi+8*0]
+	
+	;; c=a.n[0] * b.n[0]
+   	mov rax,[rdi+0*8]
+	mov rbp,0FFFFFFFFFFFFFh
+	mul r14			; rsi=b.n[0]
+	mov r15,[rsi+1*8]
+	mov r10,rbp
+	mov r8,rax
+	and r10,rax		; only need lower qword
+	shrd r8,rdx,52
+	xor r9,r9
+
+	;; c+=a.n[0] * b.n[1] + a.n[1] * b.n[0]
+	mov rax,[rdi+0*8]
+	mul r15			; b.n[1]
+	add r8,rax
+	adc r9,rdx
+
+	mov rax,[rdi+1*8]
+	mul r14			; b.n[0]
+	mov r11,rbp
+	mov rbx,[rsi+2*8]
+	add r8,rax
+	adc r9,rdx
+	and r11,r8
+	shrd r8,r9,52
+	xor r9,r9
+	
+	;; c+=a.n[0 1 2] * b.n[2 1 0]
+	mov rax,[rdi+0*8]
+	mul rbx			; b.n[2]
+	add r8,rax
+	adc r9,rdx
+
+	mov rax,[rdi+1*8]
+	mul r15			; b.n[1]	
+	add r8,rax
+	adc r9,rdx
+
+	mov rax,[rdi+2*8]
+	mul r14
+	mov r12,rbp		; modulus
+	mov rcx,[rsi+3*8]
+	add r8,rax
+	adc r9,rdx
+	and r12,r8		; only need lower dword
+	shrd r8,r9,52
+	xor r9,r9		
+
+	;; c+=a.n[0 1 2 3] * b.n[3 2 1 0]
+	mov rax,[rdi+0*8]
+	mul rcx			; b.n[3]
+	add r8,rax
+	adc r9,rdx
+
+	mov rax,[rdi+1*8]
+	mul rbx			; b.n[2]
+	add r8,rax
+	adc r9,rdx
+
+	mov rax,[rdi+2*8]
+	mul r15			; b.n[1]
+	add r8,rax
+	adc r9,rdx
+	
+	mov rax,[rdi+3*8]
+	mul r14			; b.n[0]
+	mov r13,rbp             ; modulus
+	mov rsi,[rsi+4*8]	; load b.n[4] and destroy pointer
+	add r8,rax
+	adc r9,rdx
+	and r13,r8
+
+	shrd r8,r9,52
+	xor r9,r9		
+
+
+	;; c+=a.n[0 1 2 3 4] * b.n[4 3 2 1 0]
+	mov rax,[rdi+0*8]
+	mul rsi
+	add r8,rax
+	adc r9,rdx
+
+	mov rax,[rdi+1*8]
+	mul rcx
+	add r8,rax
+	adc r9,rdx
+
+	mov rax,[rdi+2*8]
+	mul rbx			; b.n[2] 
+	add r8,rax
+	adc r9,rdx
+
+	mov rax,[rdi+3*8]
+	mul r15			; b.n[1]
+	add r8,rax
+	adc r9,rdx
+
+	mov rax,[rdi+4*8]
+	mul r14			; b.n[0]
+	mov r14,rbp             ; modulus
+	add r8,rax
+	adc r9,rdx
+	and r14,r8
+	shrd r8,r9,52
+	xor r9,r9		
+
+	;; c+=a.n[1 2 3 4] * b.n[4 3 2 1]
+	mov rax,[rdi+1*8]
+	mul rsi
+	add r8,rax
+	adc r9,rdx
+
+	mov rax,[rdi+2*8]
+	mul rcx
+	add r8,rax
+	adc r9,rdx
+
+	mov rax,[rdi+3*8]
+	mul rbx
+	add r8,rax
+	adc r9,rdx
+
+	mov rax,[rdi+4*8]
+	mul r15
+	mov r15,rbp		; modulus
+	add r8,rax
+	adc r9,rdx
+
+	and r15,r8
+	shrd r8,r9,52
+	xor r9,r9		
+
+	;; c+=a.n[2 3 4] * b.n[4 3 2]
+	mov rax,[rdi+2*8]
+	mul rsi
+	add r8,rax
+	adc r9,rdx
+
+	mov rax,[rdi+3*8]
+	mul rcx
+	add r8,rax
+	adc r9,rdx
+
+	mov rax,[rdi+4*8]
+	mul rbx
+	mov rbx,rbp		; modulus
+	add r8,rax
+	adc r9,rdx
+
+	and rbx,r8		; only need lower dword
+	shrd r8,r9,52
+	xor r9,r9		
+
+	;; c+=a.n[3 4] * b.n[4 3]
+	mov rax,[rdi+3*8]
+	mul rsi
+	add r8,rax
+	adc r9,rdx
+
+	mov rax,[rdi+4*8]
+	mul rcx
+	mov rcx,rbp		; modulus
+	add r8,rax
+	adc r9,rdx
+	and rcx,r8		; only need lower dword
+	shrd r8,r9,52
+	xor r9,r9		
+
+	;; c+=a.n[4] * b.n[4]
+	mov rax,[rdi+4*8]
+	mul rsi
+	;; mov rbp,rbp		; modulus already there!
+	add r8,rax
+	adc r9,rdx
+	and rbp,r8 
+	shrd r8,r9,52
+	xor r9,r9		
+
+	mov rsi,r8
+
+	;; *******************************************************
+common_exit_norm::
+	mov rdi,01000003D10h
+
+	mov rax,r15		; get t5
+	mul rdi
+	add rax,r10    		; +t0
+	adc rdx,0
+	mov r10,0FFFFFFFFFFFFFh ; modulus
+	mov r8,rax		; +c
+	and r10,rax
+	shrd r8,rdx,52
+	xor r9,r9
+
+	mov rax,rbx		; get t6
+	mul rdi
+	add rax,r11		; +t1
+	adc rdx,0
+	mov r11,0FFFFFFFFFFFFFh ; modulus
+	add r8,rax		; +c
+	adc r9,rdx
+	and r11,r8
+	shrd r8,r9,52
+	xor r9,r9
+
+	mov rax,rcx    		; get t7
+	mul rdi
+	add rax,r12		; +t2
+	adc rdx,0
+	pop rbx			; retrieve pointer to this.a.n	
+	mov r12,0FFFFFFFFFFFFFh ; modulus
+	add r8,rax		; +c
+	adc r9,rdx
+	and r12,r8
+	mov [rbx+2*8],r12
+	shrd r8,r9,52
+	xor r9,r9
+	
+	mov rax,rbp    		; get t8
+	mul rdi
+	add rax,r13    		; +t3
+	adc rdx,0
+	mov r13,0FFFFFFFFFFFFFh ; modulus
+	add r8,rax		; +c
+	adc r9,rdx
+	and r13,r8
+	mov [rbx+3*8],r13
+	shrd r8,r9,52
+	xor r9,r9
+	
+	mov rax,rsi    		; get t9
+	mul rdi
+	add rax,r14    		; +t4
+	adc rdx,0
+	mov r14,0FFFFFFFFFFFFh	; !!!
+	add r8,rax		; +c
+	adc r9,rdx
+	and r14,r8
+	mov [rbx+4*8],r14
+	shrd r8,r9,48
+	xor r9,r9
+	
+	mov rax,01000003D1h	
+	mul r8		
+	add rax,r10
+	adc rdx,0
+	mov r10,0FFFFFFFFFFFFFh ; modulus
+	mov r8,rax
+	and rax,r10
+	shrd r8,rdx,52
+	mov [rbx+0*8],rax
+	add r8,r11
+	mov [rbx+1*8],r8
+	ret
+ExSetMult	ENDP
+
+
+
+
+
+
+	;;  Register Layout:
+	;;  INPUT: 	rdi	= a.n
+	;; 	   	rsi  	= this.a
+	;;  OUTPUT:	[rsi]
+	;;  INTERNAL:	rdx:rax  = multiplication accumulator
+	;; 		r8:r9    = c
+	;; 		r10-r14  = t0-t4
+	;; 		r15	 = a.n[0]*2 / t5
+	;; 		rbx	 = a.n[1]*2 / t6
+	;; 		rcx	 = a.n[2]*2 / t7
+	;; 		rbp	 = a.n[3]*2 / t8
+	;; 		rsi	 = a.n[4] / t9
+ExSetSquare	PROC C PUBLIC USES rbx rbp r12 r13 r14 r15
+	push rsi
+	mov rsi,0FFFFFFFFFFFFFh
+	
+	;; c=a.n[0] * a.n[0]
+   	mov r15,[rdi+0*8]
+	mov r10,rsi		; modulus 
+	mov rax,r15
+	mul rax  		; rsi=b.n[0]
+	mov rbx,[rdi+1*8]	; a.n[1]
+	add r15,r15		; r15=2*a.n[0]
+	mov r8,rax
+	and r10,rax		; only need lower qword
+	shrd r8,rdx,52
+	xor r9,r9
+
+	;; c+=2*a.n[0] * a.n[1]
+	mov rax,r15
+	mul rbx 	
+	mov rcx,[rdi+2*8]	; rcx=a.n[2]
+	mov r11,rsi 		; modulus
+	add r8,rax
+	adc r9,rdx
+	and r11,r8
+	shrd r8,r9,52
+	xor r9,r9
+	
+	;; c+=2*a.n[0]*a.n[2]+a.n[1]*a.n[1]
+	mov rax,r15
+	mul rcx
+	add r8,rax
+	adc r9,rdx
+
+	mov rax,rbx
+	mov r12,rsi		; modulus
+	mul rax
+	mov rbp,[rdi+3*8]	; rbp=a.n[3]
+	add rbx,rbx		; rbx=a.n[1]*2
+	add r8,rax
+	adc r9,rdx
+
+	and r12,r8		; only need lower dword
+	shrd r8,r9,52
+	xor r9,r9		
+
+	;; c+=2*a.n[0]*a.n[3]+2*a.n[1]*a.n[2]
+	mov rax,r15
+	mul rbp
+	add r8,rax
+	adc r9,rdx
+
+	mov rax,rbx		; rax=2*a.n[1]
+	mov r13,rsi		; modulus
+	mul rcx
+	mov rsi,[rdi+4*8]	; rsi=a.n[4] / destroy constant
+	add r8,rax
+	adc r9,rdx
+	and r13,r8
+	shrd r8,r9,52
+	xor r9,r9		
+
+	;; c+=2*a.n[0]*a.n[4]+2*a.n[1]*a.n[3]+a.n[2]*a.n[2]
+	mov rax,r15		; last time we need 2*a.n[0]
+	mul rsi
+	add r8,rax
+	adc r9,rdx
+
+	mov rax,rbx
+	mul rbp
+	mov r14,0FFFFFFFFFFFFFh ; modulus
+	add r8,rax
+	adc r9,rdx
+
+	mov rax,rcx
+	mul rax
+	add rcx,rcx		; rcx=2*a.n[2]
+	add r8,rax
+	adc r9,rdx
+	and r14,r8
+	shrd r8,r9,52
+	xor r9,r9		
+
+	;; c+=2*a.n[1]*a.n[4]+2*a.n[2]*a.n[3]
+	mov rax,rbx
+	mul rsi
+	add r8,rax
+	adc r9,rdx
+
+	mov rax,rcx
+	mul rbp
+	mov r15,0FFFFFFFFFFFFFh ; modulus
+	add r8,rax
+	adc r9,rdx
+	and r15,r8
+	shrd r8,r9,52
+	xor r9,r9		
+
+	;; c+=2*a.n[2]*a.n[4]+a.n[3]*a.n[3]
+	mov rax,rcx		; 2*a.n[2]
+	mul rsi
+	add r8,rax
+	adc r9,rdx
+
+	mov rax,rbp		; a.n[3]
+	mul rax
+	mov rbx,0FFFFFFFFFFFFFh ; modulus
+	add r8,rax
+	adc r9,rdx
+	and rbx,r8		; only need lower dword
+	lea rax,[2*rbp]
+	shrd r8,r9,52
+	xor r9,r9		
+
+	;; c+=2*a.n[3]*a.n[4]
+	mul rsi
+	mov rcx,0FFFFFFFFFFFFFh ; modulus
+	add r8,rax
+	adc r9,rdx
+	and rcx,r8		; only need lower dword
+	shrd r8,r9,52
+	xor r9,r9		
+
+	;; c+=a.n[4]*a.n[4]
+	mov rax,rsi
+	mul rax
+	mov rbp,0FFFFFFFFFFFFFh ; modulus
+	add r8,rax
+	adc r9,rdx
+	and rbp,r8 
+	shrd r8,r9,52
+	xor r9,r9		
+
+	mov rsi,r8
+
+	;; *******************************************************
+	jmp common_exit_norm
+ExSetSquare	ENDP
+	end
+
+	
--- a/lin64.h
+++ b/lin64.h
@ -0,0 +1,10 @@
+#ifndef _SECP256K1_LIN64
+#define _SECP256K1_LIN64
+
+using namespace std;
+
+#ifdef INLINE_ASM
+extern "C" void _ExSetMult(uint64_t *, uint64_t *, uint64_t *);
+extern "C" void _ExSetSquare(uint64_t *, uint64_t *);
+#endif
+#endif