From dd804adeff7f2297e0ca68f242e5412aaccb752e Mon Sep 17 00:00:00 2001
From: Diederik Huys
Date: Sun, 16 Jun 2013 22:46:43 +0200
Subject: [PATCH] 1st assembly version of field 5x64 code

---
Review notes (ignored by git am):
 * Makefile: OPTLEVEL defaults to O2, because a plain ./configure never
   sets it and "-$(OPTLEVEL)" would expand to a bare "-"; STD is assigned
   without literal quotes; the coverage build stays at -O0.
 * bench_all: passes --use-5x64 so the "5x64" figures measure that field.
 * src/field_5x64_asm.asm: secp256k1_fe_sqr_inner saves rsi (its output
   pointer) instead of rdx and clears r9 instead of the retired r5 limb;
   common_exit_norm keeps every carry of the reduction.

 Makefile                  |  20 ++-
 bench_all                 |  14 ++
 configure                 |  11 +-
 src/field_5x52_asm.asm    |   2 +-
 src/field_5x64_asm.asm    | 346 ++++++++++++++++++++++++++++++++++++++
 src/impl/field_5x64.h     |   7 +
 src/impl/field_5x64_asm.h |  11 ++
 7 files changed, 400 insertions(+), 11 deletions(-)
 create mode 100644 bench_all
 create mode 100644 src/field_5x64_asm.asm
 create mode 100644 src/impl/field_5x64_asm.h

diff --git a/Makefile b/Makefile
index e9a16093fd..2dc9184c7d 100644
--- a/Makefile
+++ b/Makefile
@@ -8,11 +8,12 @@ JAVA_FILES := src/java/org_bitcoin_NativeSecp256k1.h src/java/org_bitcoin_Native
 OBJS :=
 
 ifeq ($(USE_ASM), 1)
-    OBJS := $(OBJS) obj/field_5x52_asm.o
+    OBJS := $(OBJS) obj/field_5x$(HAVE_LIMB)_asm.o
 endif
+STD := gnu99
+OPTLEVEL ?= O2
 
 default: tests libsecp256k1.a libsecp256k1.so
-	./tests
 
 clean:
 	rm -rf obj/*.o bench tests *.a *.so config.mk
@@ -20,18 +21,21 @@ clean:
 obj/field_5x52_asm.o: src/field_5x52_asm.asm
 	$(YASM) -f elf64 -o obj/field_5x52_asm.o src/field_5x52_asm.asm
 
+obj/field_5x64_asm.o: src/field_5x64_asm.asm
+	$(YASM) -f elf64 -o obj/field_5x64_asm.o src/field_5x64_asm.asm
+
 obj/secp256k1.o: $(FILES) src/secp256k1.c include/secp256k1.h
-	$(CC) -fPIC -std=c99 $(CFLAGS) $(CFLAGS_EXTRA) -DNDEBUG -O2 src/secp256k1.c -c -o obj/secp256k1.o
+	$(CC) -fPIC -std=$(STD) $(CFLAGS) $(CFLAGS_EXTRA) -DNDEBUG -$(OPTLEVEL) src/secp256k1.c -c -o obj/secp256k1.o
 
 bench: $(FILES) src/bench.c $(OBJS)
-	$(CC) -fPIC -std=c99 $(CFLAGS) $(CFLAGS_EXTRA) $(CFLAGS_TEST_EXTRA) -DNDEBUG -O2 src/bench.c $(OBJS) $(LDFLAGS_EXTRA) $(LDFLAGS_TEST_EXTRA) -o bench
+	$(CC) -fPIC -std=$(STD) $(CFLAGS) $(CFLAGS_EXTRA) $(CFLAGS_TEST_EXTRA) -DNDEBUG -$(OPTLEVEL) src/bench.c $(OBJS) $(LDFLAGS_EXTRA) $(LDFLAGS_TEST_EXTRA) -o bench
 
 tests: $(FILES) src/tests.c $(OBJS)
-	$(CC) -std=c99 $(CFLAGS) $(CFLAGS_EXTRA) $(CFLAGS_TEST_EXTRA) -DVERIFY -fstack-protector-all -O2 -ggdb3 src/tests.c $(OBJS) $(LDFLAGS_EXTRA) $(LDFLAGS_TEST_EXTRA) -o tests
+	$(CC) -std=$(STD) $(CFLAGS) $(CFLAGS_EXTRA) $(CFLAGS_TEST_EXTRA) -DVERIFY -fstack-protector-all -$(OPTLEVEL) -ggdb3 src/tests.c $(OBJS) $(LDFLAGS_EXTRA) $(LDFLAGS_TEST_EXTRA) -o tests
 
 coverage: $(FILES) src/tests.c $(OBJS)
 	rm -rf tests.gcno tests.gcda tests_cov
-	$(CC) -std=c99 $(CFLAGS) $(CFLAGS_EXTRA) $(CFLAGS_TEST_EXTRA) -DVERIFY --coverage -O0 -g src/tests.c $(OBJS) $(LDFLAGS_EXTRA) $(LDFLAGS_TEST_EXTRA) -o tests_cov
+	$(CC) -std=$(STD) $(CFLAGS) $(CFLAGS_EXTRA) $(CFLAGS_TEST_EXTRA) -DVERIFY --coverage -O0 -g src/tests.c $(OBJS) $(LDFLAGS_EXTRA) $(LDFLAGS_TEST_EXTRA) -o tests_cov
 	rm -rf lcov
 	mkdir -p lcov
 	cd lcov; lcov --directory ../ --zerocounters
@@ -43,7 +47,7 @@ libsecp256k1.a: obj/secp256k1.o $(OBJS)
 	$(AR) -rs $@ $(OBJS) obj/secp256k1.o
 
 libsecp256k1.so: obj/secp256k1.o $(OBJS)
-	$(CC) -std=c99 $(LDFLAGS_EXTRA) $(OBJS) obj/secp256k1.o -shared -o libsecp256k1.so
+	$(CC) -std=$(STD) $(LDFLAGS_EXTRA) $(OBJS) obj/secp256k1.o -shared -o libsecp256k1.so
 
 libjavasecp256k1.so: $(OBJS) obj/secp256k1.o $(JAVA_FILES)
-	$(CC) -fPIC -std=c99 $(CFLAGS) $(CFLAGS_EXTRA) -DNDEBUG -O2 -I. src/java/org_bitcoin_NativeSecp256k1.c $(LDFLAGS_EXTRA) $(OBJS) obj/secp256k1.o -shared -o libjavasecp256k1.so
+	$(CC) -fPIC -std=$(STD) $(CFLAGS) $(CFLAGS_EXTRA) -DNDEBUG -$(OPTLEVEL) -I. src/java/org_bitcoin_NativeSecp256k1.c $(LDFLAGS_EXTRA) $(OBJS) obj/secp256k1.o -shared -o libjavasecp256k1.so
diff --git a/bench_all b/bench_all
new file mode 100644
index 0000000000..89c747d5b7
--- /dev/null
+++ b/bench_all
@@ -0,0 +1,14 @@
+#!/bin/bash
+echo "Benchmark Results" >output.txt
+for j in no-yasm yasm; do
+    echo "5x64 $j:" >>output.txt
+    for i in O0 O1 O2 O3; do
+        make clean
+        ./configure --$j --use-5x64   # select the 5x64 field that the label above announces
+        echo "OPTLEVEL=$i" >>config.mk
+        make bench
+        echo "OPTLEVEL=$i" >>output.txt
+        (time ./bench) |& grep real >>output.txt
+    done
+done
+
diff --git a/configure b/configure
index b7ac608e6c..d68ae5bd0d 100755
--- a/configure
+++ b/configure
@@ -97,6 +97,9 @@ if [ "$?" = 0 ]; then
     HAVE_INT128=1
 fi
 
+#default limb size
+HAVE_LIMB=52
+
 for arg in "$@"; do
     case "$arg" in
         --no-yasm)
@@ -107,6 +110,9 @@ for arg in "$@"; do
             ;;
         --no-openssl)
             HAVE_OPENSSL=0
+            ;;
+        --use-5x64)
+            HAVE_LIMB=64
             ;;
     esac
 done
@@ -117,10 +123,10 @@ USE_ASM=0
 
 # select field implementation
 if [ "$HAVE_YASM" = "1" ]; then
-    CFLAGS_FIELD="-DUSE_FIELD_5X52 -DUSE_FIELD_5X52_ASM"
+    CFLAGS_FIELD="-DUSE_FIELD_5X$HAVE_LIMB -DUSE_FIELD_5X${HAVE_LIMB}_ASM"
     USE_ASM=1
 elif [ "$HAVE_INT128" = "1" ]; then
-    CFLAGS_FIELD="-DUSE_FIELD_5X52 -DUSE_FIELD_5X52_INT128"
+    CFLAGS_FIELD="-DUSE_FIELD_5X$HAVE_LIMB -DUSE_FIELD_5X${HAVE_LIMB}_INT128"
 elif [ "$HAVE_GMP" = "1" ]; then
     CFLAGS_FIELD="-DUSE_FIELD_GMP"
     LINK_GMP=1
@@ -165,3 +171,4 @@ echo "CFLAGS_TEST_EXTRA=$CFLAGS_TEST_EXTRA" >> config.mk
 echo "LDFLAGS_EXTRA=$LDFLAGS_EXTRA" >> config.mk
 echo "LDFLAGS_TEST_EXTRA=$LDFLAGS_TEST_EXTRA" >> config.mk
 echo "USE_ASM=$USE_ASM" >>config.mk
+echo "HAVE_LIMB=$HAVE_LIMB" >>config.mk
diff --git a/src/field_5x52_asm.asm b/src/field_5x52_asm.asm
index ef1c1c9b52..9237b3687d 100644
--- a/src/field_5x52_asm.asm
+++ b/src/field_5x52_asm.asm
@@ -314,7 +314,7 @@ common_exit_norm:
 	;;  rbx      = a.n[2] / t6
 	;;  rcx      = a.n[3] / t7
 	;;  rbp      = 0FFFFFFFFFFFFFh / t8
-	;;  rsi      = a.n[4] / a.n[4] /t9
+	;;  rsi      = a.n[4] / t9
 	GLOBAL secp256k1_fe_sqr_inner
 	ALIGN 32
 secp256k1_fe_sqr_inner:
diff --git a/src/field_5x64_asm.asm b/src/field_5x64_asm.asm
new file mode 100644
index 0000000000..14b0a520c6
--- /dev/null
+++ b/src/field_5x64_asm.asm
@@ -0,0 +1,346 @@
+	;; Added by Diederik Huys, March 2013
+	;;
+	;; Provided public procedures:
+	;; 	secp256k1_fe_mul_inner
+	;; 	secp256k1_fe_sqr_inner
+	;;
+	;; Needed tools: YASM (http://yasm.tortall.net)
+	;;
+	;;
+
+	BITS 64
+
+COMP_LIMB	EQU 000000001000003D1h
+
+	;;  Procedure ExSetMult
+	;;  Register Layout:
+	;;  INPUT: 	rdi	= a->n
+	;; 	   	rsi  	= b->n
+	;; 	   	rdx  	= r->n
+	;;
+	;;  INTERNAL:	rdx:rax  = multiplication accumulator
+	;; 		r8-r10   = c0-c2
+	;; 		r11-r14  = b.n[0]-b.n[3], then r11-r15 = r3-r7
+	;; 		rbx	 = r0
+	;; 		rcx	 = r1
+	;; 		rbp	 = r2
+	;;
+	GLOBAL secp256k1_fe_mul_inner
+	ALIGN 32
+secp256k1_fe_mul_inner:
+	push rbp
+	push rbx
+	push r12
+	push r13
+	push r14
+	push r15
+	push rdx		; r->n, popped into rbx by common_exit_norm
+
+	mov r11,[rsi+8*0]	; preload b.n[0]
+
+	;; step 1: mul_c2
+	mov rax,[rdi+0*8]	; load a.n[0]
+	mul r11			; rdx:rax=a.n[0]*b.n[0]
+	mov r12,[rsi+1*8]	; preload b.n[1]
+	mov rbx,rax		; retire LO qword (r[0])
+	mov r8,rdx		; save overflow
+	xor r9,r9		; overflow HO qwords
+	xor r10,r10
+
+	;; c+=a.n[0] * b.n[1] + a.n[1] * b.n[0]
+	mov rax,[rdi+0*8]
+	mul r12
+	mov r13,[rsi+2*8]	; preload b.n[2]
+	add r8,rax		; c0 += low qword
+	adc r9,rdx		; c1 += high qword + carry
+	adc r10,0		; c2 += carry
+
+	mov rax,[rdi+1*8]
+	mul r11
+	add r8,rax
+	adc r9,rdx
+	adc r10,0
+	mov rcx,r8		; retire r[1]
+	xor r8,r8
+
+	;; c+=a.n[0 1 2] * b.n[2 1 0]
+	mov rax,[rdi+0*8]
+	mul r13
+	mov r14,[rsi+3*8]	; preload b.n[3]
+	add r9,rax
+	adc r10,rdx
+	adc r8,0
+
+	mov rax,[rdi+1*8]
+	mul r12
+	add r9,rax
+	adc r10,rdx
+	adc r8,0
+
+	mov rax,[rdi+2*8]
+	mul r11
+	add r9,rax
+	adc r10,rdx
+	adc r8,0
+	mov rbp,r9		; retire r[2]
+	xor r9,r9
+
+	;; c+=a.n[0 1 2 3] * b.n[3 2 1 0]
+	mov rax,[rdi+0*8]
+	mul r14
+	add r10,rax
+	adc r8,rdx
+	adc r9,0
+
+	mov rax,[rdi+1*8]
+	mul r13
+	add r10,rax
+	adc r8,rdx
+	adc r9,0
+
+	mov rax,[rdi+2*8]
+	mul r12
+	add r10,rax
+	adc r8,rdx
+	adc r9,0
+
+	mov rax,[rdi+3*8]
+	mul r11
+	add r10,rax
+	adc r8,rdx
+	adc r9,0
+	mov r11,r10		; retire r[3] (b.n[0] is no longer needed)
+	xor r10,r10
+
+	;; c+=a.n[1 2 3] * b.n[3 2 1]
+	mov rax,[rdi+1*8]
+	mul r14
+	add r8,rax
+	adc r9,rdx
+	adc r10,0
+
+	mov rax,[rdi+2*8]
+	mul r13
+	add r8,rax
+	adc r9,rdx
+	adc r10,0
+
+	mov rax,[rdi+3*8]
+	mul r12
+	add r8,rax
+	adc r9,rdx
+	adc r10,0
+	mov r12,r8		; retire r[4] (b.n[1] is no longer needed)
+	xor r8,r8
+
+	;; c+=a.n[2 3] * b.n[3 2]
+	mov rax,[rdi+2*8]
+	mul r14
+	add r9,rax
+	adc r10,rdx
+	adc r8,0
+
+	mov rax,[rdi+3*8]
+	mul r13
+	add r9,rax
+	adc r10,rdx
+	adc r8,0
+	mov r13,r9		; retire r[5] (b.n[2] is no longer needed)
+	xor r9,r9
+
+	;; c+=a.n[3] * b.n[3]
+	mov rax,[rdi+3*8]
+	mul r14
+	add r10,rax
+	adc r8,rdx
+	adc r9,0
+	mov r14,r10		; retire r[6]
+	mov r15,r8		; retire r[7]
+
+
+	;; *******************************************************
+common_exit_norm:
+	;; Fold the high half r4..r7 into the low half r0..r3 using
+	;; 2^256 = COMP_LIMB (mod p).  Each carry is kept in the high word
+	;; of the product (always below 2^34, so it cannot overflow) rather
+	;; than added into the next low limb, where it could wrap unnoticed.
+	mov rdi,COMP_LIMB
+	mov rax,r12		; get r4
+	mul rdi
+	add rax,rbx		; +r0
+	adc rdx,0
+	pop rbx			; rbx = output pointer pushed last on entry
+	mov [rbx],rax
+	mov r8,rdx		; carry into limb 1
+
+	mov rax,r13		; get r5
+	mul rdi
+	add rax,rcx		; +r1
+	adc rdx,0
+	add rax,r8		; +carry
+	adc rdx,0
+	mov [rbx+1*8],rax
+	mov r8,rdx		; carry into limb 2
+
+	mov rax,r14		; get r6
+	mul rdi
+	add rax,rbp		; +r2
+	adc rdx,0
+	add rax,r8		; +carry
+	adc rdx,0
+	mov [rbx+2*8],rax
+	mov r8,rdx		; carry into limb 3
+
+	mov rax,r15		; get r7
+	mul rdi
+	add rax,r11		; +r3
+	adc rdx,0
+	add rax,r8		; +carry
+	adc rdx,0
+	mov [rbx+3*8],rax
+	mov [rbx+4*8],rdx	; overflow limb n[4]
+
+	pop r15
+	pop r14
+	pop r13
+	pop r12
+	pop rbx
+	pop rbp
+	ret
+
+
+	;;  PROC ExSetSquare
+	;;  Register Layout:
+	;;  INPUT: 	rdi	 = a.n
+	;; 	   	rsi  	 = r->n
+	;;  INTERNAL:	rdx:rax  = multiplication accumulator
+	;; 		r8-r10   = c
+	;; 		r11-r14  = a.n[0]-a.n[3], then r11-r15 = r3-r7
+	;; 		rbx	 = r0
+	;; 		rcx	 = r1
+	;; 		rbp	 = r2
+	GLOBAL secp256k1_fe_sqr_inner
+
+	ALIGN 32
+secp256k1_fe_sqr_inner:
+	push rbp
+	push rbx
+	push r12
+	push r13
+	push r14
+	push r15
+	push rsi		; output pointer (2nd argument), popped by common_exit_norm
+
+	mov r11,[rdi+8*0]	; preload a.n[0]
+
+	;; step 1: mul_c2
+	mov rax,r11		; load a.n[0]
+	mul rax			; rdx:rax=a.n[0]²
+	mov r12,[rdi+1*8]	; preload a.n[1]
+	mov rbx,rax		; retire LO qword (r[0])
+	mov r8,rdx		; save overflow
+	xor r9,r9		; overflow HO qwords
+	xor r10,r10
+
+	;; c+=2*a.n[0] * a.n[1]
+	mov rax,r11		; load a.n[0]
+	mul r12			; rdx:rax=a.n[0] * a.n[1]
+	mov r13,[rdi+2*8]	; preload a.n[2]
+	add rax,rax		; rdx:rax*=2
+	adc rdx,rdx
+	adc r10,0		; bit shifted out of rdx goes to c2
+	add r8,rax
+	adc r9,rdx
+	adc r10,0
+
+	mov rcx,r8		; retire r[1]
+	xor r8,r8
+
+	;; c+=2*a.n[0]*a.n[2]+a.n[1]*a.n[1]
+	mov rax,r11		; load a.n[0]
+	mul r13			; * a.n[2]
+	mov r14,[rdi+3*8]	; preload a.n[3]
+	add rax,rax		; rdx:rax*=2
+	adc rdx,rdx
+	adc r8,0
+	add r9,rax
+	adc r10,rdx
+	adc r8,0
+
+	mov rax,r12
+	mul rax
+	add r9,rax
+	adc r10,rdx
+	adc r8,0
+
+
+	mov rbp,r9		; retire r[2]
+	xor r9,r9
+
+	;; c+=2*a.n[0]*a.n[3]+2*a.n[1]*a.n[2]
+	mov rax,r11		; load a.n[0]
+	mul r14			; * a.n[3]
+	add rax,rax		; rdx:rax*=2
+	adc rdx,rdx
+	adc r9,0
+	add r10,rax
+	adc r8,rdx
+	adc r9,0
+
+	mov rax,r12		; load a.n[1]
+	mul r13			; * a.n[2]
+	add rax,rax
+	adc rdx,rdx
+	adc r9,0
+	add r10,rax
+	adc r8,rdx
+	adc r9,0
+
+	mov r11,r10		; retire r[3] (a.n[0] is no longer needed)
+	xor r10,r10
+
+	;; c+=2*a.n[1]*a.n[3]+a.n[2]*a.n[2]
+	mov rax,r12		; load a.n[1]
+	mul r14			; * a.n[3]
+	add rax,rax		; rdx:rax*=2
+	adc rdx,rdx
+	adc r10,0
+	add r8,rax
+	adc r9,rdx
+	adc r10,0
+
+	mov rax,r13
+	mul rax
+	add r8,rax
+	adc r9,rdx
+	adc r10,0
+
+	mov r12,r8		; retire r[4] (a.n[1] is no longer needed)
+	xor r8,r8
+	;; c+=2*a.n[2]*a.n[3]
+	mov rax,r13		; load a.n[2]
+	mul r14			; * a.n[3]
+	add rax,rax		; rdx:rax*=2
+	adc rdx,rdx
+	adc r8,0
+	add r9,rax
+	adc r10,rdx
+	adc r8,0
+
+	mov r13,r9		; retire r[5] (a.n[2] is no longer needed)
+	xor r9,r9		; clear the accumulator word, not the limb just retired
+
+	;; c+=a.n[3]²
+	mov rax,r14
+	mul rax
+	add r10,rax
+	adc r8,rdx
+	adc r9,0
+
+	mov r14,r10		; retire r[6]
+	mov r15,r8		; retire r[7]
+
+	jmp common_exit_norm
+	end
+
+
diff --git a/src/impl/field_5x64.h b/src/impl/field_5x64.h
index ef82a9eb47..3c927fc90f 100644
--- a/src/impl/field_5x64.h
+++ b/src/impl/field_5x64.h
@@ -11,6 +11,7 @@
 #include "../field.h"
 
 #include
+#include "field_5x64_asm.h"
 
 /** Implements arithmetic modulo FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFE FFFFFC2F,
  *  represented as 4 uint64_t's in base 2^64, and one overflow uint64_t.
@@ -264,9 +265,14 @@ void static inline secp256k1_fe_add(secp256k1_fe_t *r, const secp256k1_fe_t *a)
 }
 
 void static secp256k1_fe_mul(secp256k1_fe_t *r, const secp256k1_fe_t *ac, const secp256k1_fe_t *bc) {
+
     secp256k1_fe_t a = *ac, b = *bc;
     secp256k1_fe_reduce(&a);
     secp256k1_fe_reduce(&b);
+    /* The assembly kernel reads n[0..3] of the reduced copies and writes r->n[0..4]. */
+#ifdef USE_FIELD_5X64_ASM
+    secp256k1_fe_mul_inner(a.n, b.n, r->n);
+#else
     uint64_t c1,c2,c3;
     c3=0;
     mul_c2(a.n[0], b.n[0], c1, c2);
@@ -303,6 +309,7 @@ void static secp256k1_fe_mul(secp256k1_fe_t *r, const secp256k1_fe_t *ac, const
     c = (unsigned __int128)r7 * COMP_LIMB + r3 + (c >> 64);
     r->n[3] = c;
     r->n[4] = c >> 64;
+#endif
 
 #ifdef VERIFY
     r->normalized = 0;
diff --git a/src/impl/field_5x64_asm.h b/src/impl/field_5x64_asm.h
new file mode 100644
index 0000000000..93c6ab6b58
--- /dev/null
+++ b/src/impl/field_5x64_asm.h
@@ -0,0 +1,11 @@
+// Copyright (c) 2013 Pieter Wuille
+// Distributed under the MIT/X11 software license, see the accompanying
+// file COPYING or http://www.opensource.org/licenses/mit-license.php.
+
+#ifndef _SECP256K1_FIELD_INNER5X64_IMPL_H_
+#define _SECP256K1_FIELD_INNER5X64_IMPL_H_
+// Kernels from src/field_5x64_asm.asm; sysv_abi pins the argument registers the assembly reads.
+void __attribute__ ((sysv_abi)) secp256k1_fe_mul_inner(const uint64_t *a, const uint64_t *b, uint64_t *r);
+void __attribute__ ((sysv_abi)) secp256k1_fe_sqr_inner(const uint64_t *a, uint64_t *r);
+
+#endif