Small fix for ExSetSquare

This commit is contained in:
Diederik Huys 2013-03-26 23:38:18 +01:00
parent af073e29e4
commit 1d8e4308dc

View file

@ -292,25 +292,25 @@ ExSetMult ENDP
;; rsi = a.n[4] / t9 ;; rsi = a.n[4] / t9
ExSetSquare PROC C PUBLIC USES rbx rbp r12 r13 r14 r15 ExSetSquare PROC C PUBLIC USES rbx rbp r12 r13 r14 r15
push rsi push rsi
mov rsi,0FFFFFFFFFFFFFh mov rbp,0FFFFFFFFFFFFFh
;; c=a.n[0] * a.n[0] ;; c=a.n[0] * a.n[0]
mov r15,[rdi+0*8] mov r14,[rdi+0*8] ; r14=a.n[0]
mov r10,rsi ; modulus mov r10,rbp ; modulus
mov rax,r15 mov rax,r14
mul rax ; rsi=b.n[0] mul rax
mov rbx,[rdi+1*8] ; a.n[1] mov r15,[rdi+1*8] ; a.n[1]
add r15,r15 ; r15=2*a.n[0] add r14,r14 ; r14=2*a.n[0]
mov r8,rax mov r8,rax
and r10,rax ; only need lower qword and r10,rax ; only need lower qword
shrd r8,rdx,52 shrd r8,rdx,52
xor r9,r9 xor r9,r9
;; c+=2*a.n[0] * a.n[1] ;; c+=2*a.n[0] * a.n[1]
mov rax,r15 mov rax,r14 ; r14=2*a.n[0]
mul rbx mul r15
mov rcx,[rdi+2*8] ; rcx=a.n[2] mov rbx,[rdi+2*8] ; rbx=a.n[2]
mov r11,rsi ; modulus mov r11,rbp ; modulus
add r8,rax add r8,rax
adc r9,rdx adc r9,rdx
and r11,r8 and r11,r8
@ -318,33 +318,32 @@ ExSetSquare PROC C PUBLIC USES rbx rbp r12 r13 r14 r15
xor r9,r9 xor r9,r9
;; c+=2*a.n[0]*a.n[2]+a.n[1]*a.n[1] ;; c+=2*a.n[0]*a.n[2]+a.n[1]*a.n[1]
mov rax,r14
mul rbx
add r8,rax
adc r9,rdx
mov rax,r15 mov rax,r15
mul rcx mov r12,rbp ; modulus
add r8,rax
adc r9,rdx
mov rax,rbx
mov r12,rsi ; modulus
mul rax mul rax
mov rbp,[rdi+3*8] ; rbp=a.n[3] mov rcx,[rdi+3*8] ; rcx=a.n[3]
add rbx,rbx ; rbx=a.n[1]*2 add r15,r15 ; r15=a.n[1]*2
add r8,rax add r8,rax
adc r9,rdx adc r9,rdx
and r12,r8 ; only need lower qword and r12,r8 ; only need lower qword
shrd r8,r9,52 shrd r8,r9,52
xor r9,r9 xor r9,r9
;; c+=2*a.n[0]*a.n[3]+2*a.n[1]*a.n[2] ;; c+=2*a.n[0]*a.n[3]+2*a.n[1]*a.n[2]
mov rax,r15 mov rax,r14
mul rbp mul rcx
add r8,rax add r8,rax
adc r9,rdx adc r9,rdx
mov rax,rbx ; rax=2*a.n[1] mov rax,r15 ; rax=2*a.n[1]
mov r13,rsi ; modulus mov r13,rbp ; modulus
mul rcx mul rbx
mov rsi,[rdi+4*8] ; rsi=a.n[4] / destroy constant mov rsi,[rdi+4*8] ; rsi=a.n[4]
add r8,rax add r8,rax
adc r9,rdx adc r9,rdx
and r13,r8 and r13,r8
@ -352,20 +351,20 @@ ExSetSquare PROC C PUBLIC USES rbx rbp r12 r13 r14 r15
xor r9,r9 xor r9,r9
;; c+=2*a.n[0]*a.n[4]+2*a.n[1]*a.n[3]+a.n[2]*a.n[2] ;; c+=2*a.n[0]*a.n[4]+2*a.n[1]*a.n[3]+a.n[2]*a.n[2]
mov rax,r15 ; last time we need 2*a.n[0] mov rax,r14 ; last time we need 2*a.n[0]
mul rsi mul rsi
add r8,rax add r8,rax
adc r9,rdx adc r9,rdx
mov rax,rbx mov rax,r15
mul rbp mul rcx
mov r14,0FFFFFFFFFFFFFh ; modulus mov r14,rbp ; modulus
add r8,rax add r8,rax
adc r9,rdx adc r9,rdx
mov rax,rcx mov rax,rbx
mul rax mul rax
add rcx,rcx ; rcx=2*a.n[2] add rbx,rbx ; rbx=2*a.n[2]
add r8,rax add r8,rax
adc r9,rdx adc r9,rdx
and r14,r8 and r14,r8
@ -373,14 +372,14 @@ ExSetSquare PROC C PUBLIC USES rbx rbp r12 r13 r14 r15
xor r9,r9 xor r9,r9
;; c+=2*a.n[1]*a.n[4]+2*a.n[2]*a.n[3] ;; c+=2*a.n[1]*a.n[4]+2*a.n[2]*a.n[3]
mov rax,rbx mov rax,r15 ; last time we need 2*a.n[1]
mul rsi mul rsi
add r8,rax add r8,rax
adc r9,rdx adc r9,rdx
mov rax,rcx mov rax,rbx
mul rbp mul rcx
mov r15,0FFFFFFFFFFFFFh ; modulus mov r15,rbp ; modulus
add r8,rax add r8,rax
adc r9,rdx adc r9,rdx
and r15,r8 and r15,r8
@ -388,24 +387,24 @@ ExSetSquare PROC C PUBLIC USES rbx rbp r12 r13 r14 r15
xor r9,r9 xor r9,r9
;; c+=2*a.n[2]*a.n[4]+a.n[3]*a.n[3] ;; c+=2*a.n[2]*a.n[4]+a.n[3]*a.n[3]
mov rax,rcx ; 2*a.n[2] mov rax,rbx ; last time we need 2*a.n[2]
mul rsi mul rsi
add r8,rax add r8,rax
adc r9,rdx adc r9,rdx
mov rax,rbp ; a.n[3] mov rax,rcx ; a.n[3]
mul rax mul rax
mov rbx,0FFFFFFFFFFFFFh ; modulus mov rbx,rbp ; modulus
add r8,rax add r8,rax
adc r9,rdx adc r9,rdx
and rbx,r8 ; only need lower qword and rbx,r8 ; only need lower qword
lea rax,[2*rbp] lea rax,[2*rcx]
shrd r8,r9,52 shrd r8,r9,52
xor r9,r9 xor r9,r9
;; c+=2*a.n[3]*a.n[4] ;; c+=2*a.n[3]*a.n[4]
mul rsi mul rsi
mov rcx,0FFFFFFFFFFFFFh ; modulus mov rcx,rbp ; modulus
add r8,rax add r8,rax
adc r9,rdx adc r9,rdx
and rcx,r8 ; only need lower qword and rcx,r8 ; only need lower qword
@ -415,7 +414,7 @@ ExSetSquare PROC C PUBLIC USES rbx rbp r12 r13 r14 r15
;; c+=a.n[4]*a.n[4] ;; c+=a.n[4]*a.n[4]
mov rax,rsi mov rax,rsi
mul rax mul rax
mov rbp,0FFFFFFFFFFFFFh ; modulus ;; mov rbp,rbp ; modulus is already there!
add r8,rax add r8,rax
adc r9,rdx adc r9,rdx
and rbp,r8 and rbp,r8