diff --git a/lin64.asm b/lin64.asm index 79083bd7d2..7d320dc130 100644 --- a/lin64.asm +++ b/lin64.asm @@ -1,41 +1,60 @@ - .x64 + ;; Added by Diederik Huys, March 2013 + ;; + ;; Provided public procedures: + ;; ExSetMult + ;; ExSetSquare + ;; + ;; Needed tools: JWASM (http://www.japheth.de/JWasm.html) + ;; + ;; !!! WARNING !!! !!! WARNING !!! !!! WARNING !!! + ;; + ;; Please note that recompiling this binary (jwasm) under a 64-bit OS + ;; may yield unexpected results and create a corrupted ELF64 header. + ;; + ;; + + .x64 QTEST EQU 1 .code + ;; Register Layout: ;; INPUT: rdi = a.n ;; rsi = b.n ;; rdx = this.a - ;; OUTPUT: [rbx] + ;; ;; INTERNAL: rdx:rax = multiplication accumulator - ;; rsi = b.n / t9 - ;; r8:r9 = c - ;; r10-r15 = t0-t5 - ;; rbx = t6 - ;; rcx = t7 - ;; rbp = t8 + ;; r9:r8 = c + ;; r10-r13 = t0-t3 + ;; r14 = b.n[0] / t4 + ;; r15 = b.n[1] / t5 + ;; rbx = b.n[2] / t6 + ;; rcx = b.n[3] / t7 + ;; rbp = Constant 0FFFFFFFFFFFFFh / t8 + ;; rsi = b.n / b.n[4] / t9 ExSetMult PROC C PUBLIC USES rbx rbp r12 r13 r14 r15 push rdx - mov r14,[rsi+8*0] - + mov r14,[rsi+8*0] ; preload b.n[0]. This will be the case until + ; b.n[0] is no longer needed, then we reassign + ; r14 to t4 ;; c=a.n[0] * b.n[0] - mov rax,[rdi+0*8] + mov rax,[rdi+0*8] ; load a.n[0] mov rbp,0FFFFFFFFFFFFFh - mul r14 ; rsi=b.n[0] + mul r14 ; rdx:rax=a.n[0]*b.n[0] mov r15,[rsi+1*8] - mov r10,rbp + mov r10,rbp ; load modulus into target register for t0 mov r8,rax - and r10,rax ; only need lower qword + and r10,rax ; only need lower qword of c shrd r8,rdx,52 - xor r9,r9 + xor r9,r9 ; c < 2^64, so we ditch the HO part ;; c+=a.n[0] * b.n[1] + a.n[1] * b.n[0] mov rax,[rdi+0*8] - mul r15 ; b.n[1] + mul r15 add r8,rax adc r9,rdx mov rax,[rdi+1*8] - mul r14 ; b.n[0] + mul r14 mov r11,rbp mov rbx,[rsi+2*8] add r8,rax @@ -46,44 +65,44 @@ ExSetMult PROC C PUBLIC USES rbx rbp r12 r13 r14 r15 ;; c+=a.n[0 1 2] * b.n[2 1 0] mov rax,[rdi+0*8] - mul rbx ; b.n[2] + mul rbx add r8,rax adc r9,rdx mov rax,[rdi+1*8] - mul r15 ; b.n[1] + mul r15 add r8,rax adc r9,rdx mov rax,[rdi+2*8] mul r14 - mov r12,rbp ; modulus + mov r12,rbp mov rcx,[rsi+3*8] add r8,rax adc r9,rdx - and r12,r8 ; only need lower dword + and r12,r8 shrd r8,r9,52 xor r9,r9 ;; c+=a.n[0 1 2 3] * b.n[3 2 1 0] mov rax,[rdi+0*8] - mul rcx ; b.n[3] + mul rcx add r8,rax adc r9,rdx mov rax,[rdi+1*8] - mul rbx ; b.n[2] + mul rbx add r8,rax adc r9,rdx mov rax,[rdi+2*8] - mul r15 ; b.n[1] + mul r15 add r8,rax adc r9,rdx mov rax,[rdi+3*8] - mul r14 ; b.n[0] - mov r13,rbp ; modulus + mul r14 + mov r13,rbp mov rsi,[rsi+4*8] ; load b.n[4] and destroy pointer add r8,rax adc r9,rdx @@ -105,18 +124,18 @@ ExSetMult PROC C PUBLIC USES rbx rbp r12 r13 r14 r15 adc r9,rdx mov rax,[rdi+2*8] - mul rbx ; b.n[2] + mul rbx add r8,rax adc r9,rdx mov rax,[rdi+3*8] - mul r15 ; b.n[1] + mul r15 add r8,rax adc r9,rdx mov rax,[rdi+4*8] - mul r14 ; b.n[0] - mov r14,rbp ; modulus + mul r14 + mov r14,rbp ; load modulus into t4 and destroy a.n[0] add r8,rax adc r9,rdx and r14,r8 @@ -141,7 +160,7 @@ ExSetMult PROC C PUBLIC USES rbx rbp r12 r13 r14 r15 mov rax,[rdi+4*8] mul r15 - mov r15,rbp ; modulus + mov r15,rbp add r8,rax adc r9,rdx @@ -162,11 +181,11 @@ ExSetMult PROC C PUBLIC USES rbx rbp r12 r13 r14 r15 mov rax,[rdi+4*8] mul rbx - mov rbx,rbp ; modulus + mov rbx,rbp add r8,rax adc r9,rdx - and rbx,r8 ; only need lower dword + and rbx,r8 shrd r8,r9,52 xor r9,r9 @@ -178,10 +197,10 @@ ExSetMult PROC C PUBLIC USES rbx rbp r12 r13 r14 r15 mov rax,[rdi+4*8] mul rcx - mov rcx,rbp ; modulus + mov rcx,rbp add r8,rax adc r9,rdx - and rcx,r8 ; only need lower dword + and rcx,r8 shrd r8,r9,52 xor r9,r9 @@ -195,17 +214,17 @@ ExSetMult PROC C PUBLIC USES rbx rbp r12 r13 r14 r15 shrd r8,r9,52 xor r9,r9 - mov rsi,r8 + mov rsi,r8 ; load c into t9 and destroy b.n[4] ;; ******************************************************* common_exit_norm:: - mov rdi,01000003D10h + mov rdi,01000003D10h ; load constant mov rax,r15 ; get t5 mul rdi add rax,r10 ; +t0 adc rdx,0 - mov r10,0FFFFFFFFFFFFFh ; modulus + mov r10,0FFFFFFFFFFFFFh ; modulus. Sadly, we ran out of registers! mov r8,rax ; +c and r10,rax shrd r8,rdx,52 @@ -226,12 +245,12 @@ common_exit_norm:: mul rdi add rax,r12 ; +t2 adc rdx,0 - pop rbx ; retrieve pointer to this.a.n + pop rbx ; retrieve pointer to this.n mov r12,0FFFFFFFFFFFFFh ; modulus add r8,rax ; +c adc r9,rdx and r12,r8 - mov [rbx+2*8],r12 + mov [rbx+2*8],r12 ; mov into this.n[2] shrd r8,r9,52 xor r9,r9 @@ -243,7 +262,7 @@ common_exit_norm:: add r8,rax ; +c adc r9,rdx and r13,r8 - mov [rbx+3*8],r13 + mov [rbx+3*8],r13 ; -> this.n[3] shrd r8,r9,52 xor r9,r9 @@ -255,11 +274,11 @@ common_exit_norm:: add r8,rax ; +c adc r9,rdx and r14,r8 - mov [rbx+4*8],r14 - shrd r8,r9,48 + mov [rbx+4*8],r14 ; -> this.n[4] + shrd r8,r9,48 ; !!! xor r9,r9 - mov rax,01000003D1h + mov rax,01000003D1h mul r8 add rax,r10 adc rdx,0 @@ -267,50 +286,46 @@ common_exit_norm:: mov r8,rax and rax,r10 shrd r8,rdx,52 - mov [rbx+0*8],rax + mov [rbx+0*8],rax ; -> this.n[0] add r8,r11 - mov [rbx+1*8],r8 + mov [rbx+1*8],r8 ; -> this.n[1] ret ExSetMult ENDP - - - - ;; Register Layout: - ;; INPUT: rdi = a.n - ;; rsi = this.a - ;; OUTPUT: [rsi] + ;; INPUT: rdi = a.n + ;; rsi = this.a ;; INTERNAL: rdx:rax = multiplication accumulator - ;; r8:r9 = c - ;; r10-r14 = t0-t4 - ;; r15 = a.n[0]*2 / t5 - ;; rbx = a.n[1]*2 / t6 - ;; rcx = a.n[2]*2 / t7 - ;; rbp = a.n[3]*2 / t8 - ;; rsi = a.n[4] / t9 + ;; r9:r8 = c + ;; r10-r13 = t0-t3 + ;; r14 = a.n[0] / t4 + ;; r15 = a.n[1] / t5 + ;; rbx = a.n[2] / t6 + ;; rcx = a.n[3] / t7 + ;; rbp = 0FFFFFFFFFFFFFh / t8 + ;; rsi = a.n[4] / a.n[4] /t9 ExSetSquare PROC C PUBLIC USES rbx rbp r12 r13 r14 r15 push rsi - mov rsi,0FFFFFFFFFFFFFh + mov rbp,0FFFFFFFFFFFFFh ;; c=a.n[0] * a.n[0] - mov r15,[rdi+0*8] - mov r10,rsi ; modulus - mov rax,r15 - mul rax ; rsi=b.n[0] - mov rbx,[rdi+1*8] ; a.n[1] - add r15,r15 ; r15=2*a.n[0] + mov r14,[rdi+0*8] ; r14=a.n[0] + mov r10,rbp ; modulus + mov rax,r14 + mul rax + mov r15,[rdi+1*8] ; a.n[1] + add r14,r14 ; r14=2*a.n[0] mov r8,rax and r10,rax ; only need lower qword shrd r8,rdx,52 xor r9,r9 ;; c+=2*a.n[0] * a.n[1] - mov rax,r15 - mul rbx - mov rcx,[rdi+2*8] ; rcx=a.n[2] - mov r11,rsi ; modulus + mov rax,r14 ; r14=2*a.n[0] + mul r15 + mov rbx,[rdi+2*8] ; rbx=a.n[2] + mov r11,rbp ; modulus add r8,rax adc r9,rdx and r11,r8 @@ -318,33 +333,32 @@ ExSetSquare PROC C PUBLIC USES rbx rbp r12 r13 r14 r15 xor r9,r9 ;; c+=2*a.n[0]*a.n[2]+a.n[1]*a.n[1] + mov rax,r14 + mul rbx + add r8,rax + adc r9,rdx + mov rax,r15 - mul rcx - add r8,rax - adc r9,rdx - - mov rax,rbx - mov r12,rsi ; modulus + mov r12,rbp ; modulus mul rax - mov rbp,[rdi+3*8] ; rbp=a.n[3] - add rbx,rbx ; rbx=a.n[1]*2 + mov rcx,[rdi+3*8] ; rcx=a.n[3] + add r15,r15 ; r15=a.n[1]*2 add r8,rax adc r9,rdx - and r12,r8 ; only need lower dword shrd r8,r9,52 xor r9,r9 ;; c+=2*a.n[0]*a.n[3]+2*a.n[1]*a.n[2] - mov rax,r15 - mul rbp + mov rax,r14 + mul rcx add r8,rax adc r9,rdx - mov rax,rbx ; rax=2*a.n[1] - mov r13,rsi ; modulus - mul rcx - mov rsi,[rdi+4*8] ; rsi=a.n[4] / destroy constant + mov rax,r15 ; rax=2*a.n[1] + mov r13,rbp ; modulus + mul rbx + mov rsi,[rdi+4*8] ; rsi=a.n[4] add r8,rax adc r9,rdx and r13,r8 @@ -352,20 +366,20 @@ ExSetSquare PROC C PUBLIC USES rbx rbp r12 r13 r14 r15 xor r9,r9 ;; c+=2*a.n[0]*a.n[4]+2*a.n[1]*a.n[3]+a.n[2]*a.n[2] - mov rax,r15 ; last time we need 2*a.n[0] + mov rax,r14 ; last time we need 2*a.n[0] mul rsi add r8,rax adc r9,rdx - mov rax,rbx - mul rbp - mov r14,0FFFFFFFFFFFFFh ; modulus + mov rax,r15 + mul rcx + mov r14,rbp ; modulus add r8,rax adc r9,rdx - mov rax,rcx + mov rax,rbx mul rax - add rcx,rcx ; rcx=2*a.n[2] + add rbx,rbx ; rcx=2*a.n[2] add r8,rax adc r9,rdx and r14,r8 @@ -373,14 +387,14 @@ ExSetSquare PROC C PUBLIC USES rbx rbp r12 r13 r14 r15 xor r9,r9 ;; c+=2*a.n[1]*a.n[4]+2*a.n[2]*a.n[3] - mov rax,rbx + mov rax,r15 ; last time we need 2*a.n[1] mul rsi add r8,rax adc r9,rdx - mov rax,rcx - mul rbp - mov r15,0FFFFFFFFFFFFFh ; modulus + mov rax,rbx + mul rcx + mov r15,rbp ; modulus add r8,rax adc r9,rdx and r15,r8 @@ -388,24 +402,24 @@ ExSetSquare PROC C PUBLIC USES rbx rbp r12 r13 r14 r15 xor r9,r9 ;; c+=2*a.n[2]*a.n[4]+a.n[3]*a.n[3] - mov rax,rcx ; 2*a.n[2] + mov rax,rbx ; last time we need 2*a.n[2] mul rsi add r8,rax adc r9,rdx - mov rax,rbp ; a.n[3] + mov rax,rcx ; a.n[3] mul rax - mov rbx,0FFFFFFFFFFFFFh ; modulus + mov rbx,rbp ; modulus add r8,rax adc r9,rdx and rbx,r8 ; only need lower dword - lea rax,[2*rbp] + lea rax,[2*rcx] shrd r8,r9,52 xor r9,r9 ;; c+=2*a.n[3]*a.n[4] mul rsi - mov rcx,0FFFFFFFFFFFFFh ; modulus + mov rcx,rbp ; modulus add r8,rax adc r9,rdx and rcx,r8 ; only need lower dword @@ -415,7 +429,7 @@ ExSetSquare PROC C PUBLIC USES rbx rbp r12 r13 r14 r15 ;; c+=a.n[4]*a.n[4] mov rax,rsi mul rax - mov rbp,0FFFFFFFFFFFFFh ; modulus + ;; mov rbp,rbp ; modulus is already there! add r8,rax adc r9,rdx and rbp,r8