x86_64 specific assembly version of field code

This commit is contained in:
Diederik Huys 2013-03-26 00:39:22 +01:00
parent 0cd6372fe8
commit 37c4ab156d
5 changed files with 471 additions and 12 deletions

View file

@ -1,10 +1,11 @@
FLAGS_COMMON:=-Wall
FLAGS_PROD:=-DNDEBUG -g0 -O3 -march=native
FLAGS_PROD:=-DNDEBUG -O2 -march=native
FLAGS_DEBUG:=-DVERIFY_MAGNITUDE -ggdb3 -O1
FLAGS_TEST:=-DVERIFY_MAGNITUDE -ggdb3 -O3 -march=native
FLAGS_TEST:=-DVERIFY_MAGNITUDE -ggdb3 -O2 -march=native
SECP256K1_FILES := num.h field.h group.h ecmult.h ecdsa.h \
num.cpp field.cpp group.cpp ecmult.cpp ecdsa.cpp \
lin64.asm
ifndef CONF
CONF := gmp
@ -22,7 +23,7 @@ SECP256K1_FILES := $(SECP256K1_FILES) num_gmp.h num_gmp.cpp
endif
endif
all: *.cpp *.h
all: *.cpp *.asm *.h
+make CONF=openssl all-openssl
+make CONF=gmp all-gmp
@ -41,8 +42,11 @@ clean-$(CONF):
obj/secp256k1-$(CONF).o: $(SECP256K1_FILES)
$(CXX) $(FLAGS_COMMON) $(FLAGS_PROD) $(FLAGS_CONF) secp256k1.cpp -c -o obj/secp256k1-$(CONF).o
bench-$(CONF): obj/secp256k1-$(CONF).o bench.cpp
$(CXX) $(FLAGS_COMMON) $(FLAGS_PROD) $(FLAGS_CONF) obj/secp256k1-$(CONF).o bench.cpp $(LIBS) -o bench-$(CONF)
obj/lin64.o: lin64.asm
~/tmp/jwasm -Fo obj/lin64.o -elf64 lin64.asm
tests-$(CONF): $(SECP256K1_FILES) tests.cpp
$(CXX) $(FLAGS_COMMON) $(FLAGS_TEST) $(FLAGS_CONF) tests.cpp $(LIBS) -o tests-$(CONF)
bench-$(CONF): obj/secp256k1-$(CONF).o bench.cpp obj/lin64.o
$(CXX) $(FLAGS_COMMON) $(FLAGS_PROD) $(FLAGS_CONF) obj/secp256k1-$(CONF).o obj/lin64.o bench.cpp $(LIBS) -o bench-$(CONF)
tests-$(CONF): $(SECP256K1_FILES) tests.cpp obj/lin64.o
$(CXX) $(FLAGS_COMMON) $(FLAGS_TEST) $(FLAGS_CONF) obj/lin64.o tests.cpp $(LIBS) -o tests-$(CONF)

View file

@ -3,11 +3,13 @@ using namespace std;
#include <assert.h>
#include <stdint.h>
#include <string>
#define INLINE_ASM
#include "lin64.h"
#include "num.h"
#include "field.h"
#include <iostream>
// #define VERIFY_MAGNITUDE 1
using namespace std;
namespace secp256k1 {
@ -164,7 +166,11 @@ void FieldElem::SetMult(const FieldElem &a, const FieldElem &b) {
assert(a.magnitude <= 8);
assert(b.magnitude <= 8);
#endif
__int128 c = (__int128)a.n[0] * b.n[0];
#ifdef INLINE_ASM
_ExSetMult((uint64_t *) a.n,(uint64_t *) b.n, (uint64_t *) n);
#else
unsigned __int128 c = (__int128)a.n[0] * b.n[0];
uint64_t t0 = c & 0xFFFFFFFFFFFFFULL; c = c >> 52; // c max 0FFFFFFFFFFFFFE0
c = c + (__int128)a.n[0] * b.n[1] +
(__int128)a.n[1] * b.n[0];
@ -199,8 +205,8 @@ void FieldElem::SetMult(const FieldElem &a, const FieldElem &b) {
c = c + (__int128)a.n[4] * b.n[4];
uint64_t t8 = c & 0xFFFFFFFFFFFFFULL; c = c >> 52; // c max 001000000000001E
uint64_t t9 = c;
c = t0 + (__int128)t5 * 0x1000003D10ULL;
c = t0 + (__int128)t5 * 0x1000003D10ULL;
t0 = c & 0xFFFFFFFFFFFFFULL; c = c >> 52; // c max 0000001000003D10
c = c + t1 + (__int128)t6 * 0x1000003D10ULL;
t1 = c & 0xFFFFFFFFFFFFFULL; c = c >> 52; // c max 0000001000003D10
@ -213,6 +219,7 @@ void FieldElem::SetMult(const FieldElem &a, const FieldElem &b) {
c = t0 + (__int128)c * 0x1000003D1ULL;
n[0] = c & 0xFFFFFFFFFFFFFULL; c = c >> 52; // c max 1000008
n[1] = t1 + c;
#endif
#ifdef VERIFY_MAGNITUDE
magnitude = 1;
normalized = false;
@ -223,6 +230,10 @@ void FieldElem::SetSquare(const FieldElem &a) {
#ifdef VERIFY_MAGNITUDE
assert(a.magnitude <= 8);
#endif
#ifdef INLINE_ASM
_ExSetSquare((uint64_t *)a.n,(uint64_t *)n);
#else
__int128 c = (__int128)a.n[0] * a.n[0];
uint64_t t0 = c & 0xFFFFFFFFFFFFFULL; c = c >> 52; // c max 0FFFFFFFFFFFFFE0
c = c + (__int128)(a.n[0]*2) * a.n[1];
@ -261,6 +272,8 @@ void FieldElem::SetSquare(const FieldElem &a) {
c = t0 + (__int128)c * 0x1000003D1ULL;
n[0] = c & 0xFFFFFFFFFFFFFULL; c = c >> 52; // c max 1000008
n[1] = t1 + c;
#endif
#ifdef VERIFY_MAGNITUDE
assert(a.magnitude <= 8);
normalized = false;

View file

@ -13,7 +13,7 @@ using namespace std;
namespace secp256k1 {
/** Implements arithmetic modulo FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFE FFFFFC2F,
* represented as 5 uint64_t's in base 2^52. The values are allowed to contain >52 each. In particular,
* represented as 5 uint64_t's in base 2^52. he values are allowed to contain >52 each. In particular,
* each FieldElem has a 'magnitude' associated with it. Internally, a magnitude M means each element
* is at most M*(2^53-1), except the most significant one, which is limited to M*(2^49-1). All operations
* accept any input with magnitude at most M, and have different rules for propagating magnitude to their

432
lin64.asm Normal file
View file

@ -0,0 +1,432 @@
.x64
QTEST EQU 1
.code
;; Register Layout:
;; INPUT: rdi = a.n
;; rsi = b.n
;; rdx = this.a
;; OUTPUT: [rbx]
;; INTERNAL: rdx:rax = multiplication accumulator
;; rsi = b.n / t9
;; r8:r9 = c
;; r10-r15 = t0-t5
;; rbx = t6
;; rcx = t7
;; rbp = t8
ExSetMult PROC C PUBLIC USES rbx rbp r12 r13 r14 r15
push rdx
mov r14,[rsi+8*0]
;; c=a.n[0] * b.n[0]
mov rax,[rdi+0*8]
mov rbp,0FFFFFFFFFFFFFh
mul r14 ; rsi=b.n[0]
mov r15,[rsi+1*8]
mov r10,rbp
mov r8,rax
and r10,rax ; only need lower qword
shrd r8,rdx,52
xor r9,r9
;; c+=a.n[0] * b.n[1] + a.n[1] * b.n[0]
mov rax,[rdi+0*8]
mul r15 ; b.n[1]
add r8,rax
adc r9,rdx
mov rax,[rdi+1*8]
mul r14 ; b.n[0]
mov r11,rbp
mov rbx,[rsi+2*8]
add r8,rax
adc r9,rdx
and r11,r8
shrd r8,r9,52
xor r9,r9
;; c+=a.n[0 1 2] * b.n[2 1 0]
mov rax,[rdi+0*8]
mul rbx ; b.n[2]
add r8,rax
adc r9,rdx
mov rax,[rdi+1*8]
mul r15 ; b.n[1]
add r8,rax
adc r9,rdx
mov rax,[rdi+2*8]
mul r14
mov r12,rbp ; modulus
mov rcx,[rsi+3*8]
add r8,rax
adc r9,rdx
and r12,r8 ; only need lower dword
shrd r8,r9,52
xor r9,r9
;; c+=a.n[0 1 2 3] * b.n[3 2 1 0]
mov rax,[rdi+0*8]
mul rcx ; b.n[3]
add r8,rax
adc r9,rdx
mov rax,[rdi+1*8]
mul rbx ; b.n[2]
add r8,rax
adc r9,rdx
mov rax,[rdi+2*8]
mul r15 ; b.n[1]
add r8,rax
adc r9,rdx
mov rax,[rdi+3*8]
mul r14 ; b.n[0]
mov r13,rbp ; modulus
mov rsi,[rsi+4*8] ; load b.n[4] and destroy pointer
add r8,rax
adc r9,rdx
and r13,r8
shrd r8,r9,52
xor r9,r9
;; c+=a.n[0 1 2 3 4] * b.n[4 3 2 1 0]
mov rax,[rdi+0*8]
mul rsi
add r8,rax
adc r9,rdx
mov rax,[rdi+1*8]
mul rcx
add r8,rax
adc r9,rdx
mov rax,[rdi+2*8]
mul rbx ; b.n[2]
add r8,rax
adc r9,rdx
mov rax,[rdi+3*8]
mul r15 ; b.n[1]
add r8,rax
adc r9,rdx
mov rax,[rdi+4*8]
mul r14 ; b.n[0]
mov r14,rbp ; modulus
add r8,rax
adc r9,rdx
and r14,r8
shrd r8,r9,52
xor r9,r9
;; c+=a.n[1 2 3 4] * b.n[4 3 2 1]
mov rax,[rdi+1*8]
mul rsi
add r8,rax
adc r9,rdx
mov rax,[rdi+2*8]
mul rcx
add r8,rax
adc r9,rdx
mov rax,[rdi+3*8]
mul rbx
add r8,rax
adc r9,rdx
mov rax,[rdi+4*8]
mul r15
mov r15,rbp ; modulus
add r8,rax
adc r9,rdx
and r15,r8
shrd r8,r9,52
xor r9,r9
;; c+=a.n[2 3 4] * b.n[4 3 2]
mov rax,[rdi+2*8]
mul rsi
add r8,rax
adc r9,rdx
mov rax,[rdi+3*8]
mul rcx
add r8,rax
adc r9,rdx
mov rax,[rdi+4*8]
mul rbx
mov rbx,rbp ; modulus
add r8,rax
adc r9,rdx
and rbx,r8 ; only need lower dword
shrd r8,r9,52
xor r9,r9
;; c+=a.n[3 4] * b.n[4 3]
mov rax,[rdi+3*8]
mul rsi
add r8,rax
adc r9,rdx
mov rax,[rdi+4*8]
mul rcx
mov rcx,rbp ; modulus
add r8,rax
adc r9,rdx
and rcx,r8 ; only need lower dword
shrd r8,r9,52
xor r9,r9
;; c+=a.n[4] * b.n[4]
mov rax,[rdi+4*8]
mul rsi
;; mov rbp,rbp ; modulus already there!
add r8,rax
adc r9,rdx
and rbp,r8
shrd r8,r9,52
xor r9,r9
mov rsi,r8
;; *******************************************************
common_exit_norm::
mov rdi,01000003D10h
mov rax,r15 ; get t5
mul rdi
add rax,r10 ; +t0
adc rdx,0
mov r10,0FFFFFFFFFFFFFh ; modulus
mov r8,rax ; +c
and r10,rax
shrd r8,rdx,52
xor r9,r9
mov rax,rbx ; get t6
mul rdi
add rax,r11 ; +t1
adc rdx,0
mov r11,0FFFFFFFFFFFFFh ; modulus
add r8,rax ; +c
adc r9,rdx
and r11,r8
shrd r8,r9,52
xor r9,r9
mov rax,rcx ; get t7
mul rdi
add rax,r12 ; +t2
adc rdx,0
pop rbx ; retrieve pointer to this.a.n
mov r12,0FFFFFFFFFFFFFh ; modulus
add r8,rax ; +c
adc r9,rdx
and r12,r8
mov [rbx+2*8],r12
shrd r8,r9,52
xor r9,r9
mov rax,rbp ; get t8
mul rdi
add rax,r13 ; +t3
adc rdx,0
mov r13,0FFFFFFFFFFFFFh ; modulus
add r8,rax ; +c
adc r9,rdx
and r13,r8
mov [rbx+3*8],r13
shrd r8,r9,52
xor r9,r9
mov rax,rsi ; get t9
mul rdi
add rax,r14 ; +t4
adc rdx,0
mov r14,0FFFFFFFFFFFFh ; !!!
add r8,rax ; +c
adc r9,rdx
and r14,r8
mov [rbx+4*8],r14
shrd r8,r9,48
xor r9,r9
mov rax,01000003D1h
mul r8
add rax,r10
adc rdx,0
mov r10,0FFFFFFFFFFFFFh ; modulus
mov r8,rax
and rax,r10
shrd r8,rdx,52
mov [rbx+0*8],rax
add r8,r11
mov [rbx+1*8],r8
ret
ExSetMult ENDP
;; Register Layout:
;; INPUT: rdi = a.n
;; rsi = this.a
;; OUTPUT: [rsi]
;; INTERNAL: rdx:rax = multiplication accumulator
;; r8:r9 = c
;; r10-r14 = t0-t4
;; r15 = a.n[0]*2 / t5
;; rbx = a.n[1]*2 / t6
;; rcx = a.n[2]*2 / t7
;; rbp = a.n[3]*2 / t8
;; rsi = a.n[4] / t9
ExSetSquare PROC C PUBLIC USES rbx rbp r12 r13 r14 r15
push rsi
mov rsi,0FFFFFFFFFFFFFh
;; c=a.n[0] * a.n[0]
mov r15,[rdi+0*8]
mov r10,rsi ; modulus
mov rax,r15
mul rax ; rsi=b.n[0]
mov rbx,[rdi+1*8] ; a.n[1]
add r15,r15 ; r15=2*a.n[0]
mov r8,rax
and r10,rax ; only need lower qword
shrd r8,rdx,52
xor r9,r9
;; c+=2*a.n[0] * a.n[1]
mov rax,r15
mul rbx
mov rcx,[rdi+2*8] ; rcx=a.n[2]
mov r11,rsi ; modulus
add r8,rax
adc r9,rdx
and r11,r8
shrd r8,r9,52
xor r9,r9
;; c+=2*a.n[0]*a.n[2]+a.n[1]*a.n[1]
mov rax,r15
mul rcx
add r8,rax
adc r9,rdx
mov rax,rbx
mov r12,rsi ; modulus
mul rax
mov rbp,[rdi+3*8] ; rbp=a.n[3]
add rbx,rbx ; rbx=a.n[1]*2
add r8,rax
adc r9,rdx
and r12,r8 ; only need lower dword
shrd r8,r9,52
xor r9,r9
;; c+=2*a.n[0]*a.n[3]+2*a.n[1]*a.n[2]
mov rax,r15
mul rbp
add r8,rax
adc r9,rdx
mov rax,rbx ; rax=2*a.n[1]
mov r13,rsi ; modulus
mul rcx
mov rsi,[rdi+4*8] ; rsi=a.n[4] / destroy constant
add r8,rax
adc r9,rdx
and r13,r8
shrd r8,r9,52
xor r9,r9
;; c+=2*a.n[0]*a.n[4]+2*a.n[1]*a.n[3]+a.n[2]*a.n[2]
mov rax,r15 ; last time we need 2*a.n[0]
mul rsi
add r8,rax
adc r9,rdx
mov rax,rbx
mul rbp
mov r14,0FFFFFFFFFFFFFh ; modulus
add r8,rax
adc r9,rdx
mov rax,rcx
mul rax
add rcx,rcx ; rcx=2*a.n[2]
add r8,rax
adc r9,rdx
and r14,r8
shrd r8,r9,52
xor r9,r9
;; c+=2*a.n[1]*a.n[4]+2*a.n[2]*a.n[3]
mov rax,rbx
mul rsi
add r8,rax
adc r9,rdx
mov rax,rcx
mul rbp
mov r15,0FFFFFFFFFFFFFh ; modulus
add r8,rax
adc r9,rdx
and r15,r8
shrd r8,r9,52
xor r9,r9
;; c+=2*a.n[2]*a.n[4]+a.n[3]*a.n[3]
mov rax,rcx ; 2*a.n[2]
mul rsi
add r8,rax
adc r9,rdx
mov rax,rbp ; a.n[3]
mul rax
mov rbx,0FFFFFFFFFFFFFh ; modulus
add r8,rax
adc r9,rdx
and rbx,r8 ; only need lower dword
lea rax,[2*rbp]
shrd r8,r9,52
xor r9,r9
;; c+=2*a.n[3]*a.n[4]
mul rsi
mov rcx,0FFFFFFFFFFFFFh ; modulus
add r8,rax
adc r9,rdx
and rcx,r8 ; only need lower dword
shrd r8,r9,52
xor r9,r9
;; c+=a.n[4]*a.n[4]
mov rax,rsi
mul rax
mov rbp,0FFFFFFFFFFFFFh ; modulus
add r8,rax
adc r9,rdx
and rbp,r8
shrd r8,r9,52
xor r9,r9
mov rsi,r8
;; *******************************************************
jmp common_exit_norm
ExSetSquare ENDP
end

10
lin64.h Normal file
View file

@ -0,0 +1,10 @@
#ifndef _SECP256K1_LIN64
#define _SECP256K1_LIN64
using namespace std;
#ifdef INLINE_ASM
extern "C" void _ExSetMult(uint64_t *, uint64_t *, uint64_t *);
extern "C" void _ExSetSquare(uint64_t *, uint64_t *);
#endif
#endif