TTMath  0.9.4
 C++ bignum library
ttmathuint_x86_64_msvc.asm
;
; This file is a part of TTMath Bignum Library
; and is distributed under the 3-Clause BSD Licence.
; Author: Christian Kaiser <chk@online.de>, Tomasz Sowa <t.sowa@ttmath.org>
;

;
; Copyright (c) 2009-2017, Christian Kaiser, Tomasz Sowa
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are met:
;
; * Redistributions of source code must retain the above copyright notice,
; this list of conditions and the following disclaimer.
;
; * Redistributions in binary form must reproduce the above copyright
; notice, this list of conditions and the following disclaimer in the
; documentation and/or other materials provided with the distribution.
;
; * Neither the name Christian Kaiser nor the names of contributors to this
; project may be used to endorse or promote products derived
; from this software without specific prior written permission.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
; ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
; LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
; CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
; SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
; INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
; CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
; ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
; THE POSSIBILITY OF SUCH DAMAGE.
;

;
; compile with debug info: ml64.exe /c /Zd /Zi ttmathuint_x86_64_msvc.asm
; compile without debug info: ml64.exe /c ttmathuint_x86_64_msvc.asm
; this creates the ttmathuint_x86_64_msvc.obj file, which can be linked with your program
;

; the doxygen documentation for these routines is in the ttmathuint_x86_64.h file


PUBLIC ttmath_adc_x64
PUBLIC ttmath_addindexed_x64
PUBLIC ttmath_addindexed2_x64
PUBLIC ttmath_addvector_x64

PUBLIC ttmath_sbb_x64
PUBLIC ttmath_subindexed_x64
PUBLIC ttmath_subvector_x64

PUBLIC ttmath_rcl_x64
PUBLIC ttmath_rcr_x64

PUBLIC ttmath_rcl2_x64
PUBLIC ttmath_rcr2_x64

PUBLIC ttmath_div_x64

;
; Microsoft x86_64 convention: http://msdn.microsoft.com/en-us/library/9b372w95.aspx
;
; "rax, rcx, rdx, r8-r11 are volatile."
; "rbx, rbp, rdi, rsi, r12-r15 are nonvolatile."
;

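; under this convention the first four integer arguments arrive in rcx, rdx, r8, r9,
; and a fifth argument is passed on the stack; on entry it sits at [rsp+028h]
; (8 bytes of return address + 32 bytes of register shadow space).
;
; for illustration only (the real declarations live in ttmathuint_x86_64.h),
; the routines below correspond to C++ prototypes of roughly this shape:
;
;     extern "C" {
;         // rcx = p1, rdx = p2, r8 = nSize, r9 = nCarry
;         unsigned __int64 ttmath_adc_x64(unsigned __int64* p1, const unsigned __int64* p2,
;                                         unsigned __int64 nSize, unsigned __int64 nCarry);
;
;         // rcx = ss1, rdx = ss2, r8 = ss1_size, r9 = ss2_size, [rsp+028h] = result
;         unsigned __int64 ttmath_addvector_x64(const unsigned __int64* ss1, const unsigned __int64* ss2,
;                                               unsigned __int64 ss1_size, unsigned __int64 ss2_size,
;                                               unsigned __int64* result);
;     }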

.CODE


    ALIGN 8

;----------------------------------------

ttmath_adc_x64 PROC
    ; rcx = p1
    ; rdx = p2
    ; r8 = nSize
    ; r9 = nCarry

    xor rax, rax
    xor r11, r11
    sub rax, r9 ; sets CARRY if r9 != 0

    ALIGN 16
    loop1:
    mov rax, qword ptr [rdx + r11 * 8]
    adc qword ptr [rcx + r11 * 8], rax
    lea r11, [r11+1]
    dec r8
    jnz loop1

    setc al
    movzx rax, al

    ret

ttmath_adc_x64 ENDP
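
; what the loop above computes, as a C++ sketch (for illustration only, not the
; library's actual code): p1[i] += p2[i] with a carry chain, returning the final
; carry; note that the asm loop runs at least once, so nSize is expected to be >= 1.
;
;     #include <cstdint>
;
;     uint64_t adc_ref(uint64_t* p1, const uint64_t* p2, uint64_t n, uint64_t carry)
;     {
;         carry = (carry != 0);
;         for (uint64_t i = 0; i < n; ++i) {
;             uint64_t s = p1[i] + p2[i] + carry;
;             carry = carry ? (s <= p2[i]) : (s < p2[i]);   // unsigned overflow test
;             p1[i] = s;
;         }
;         return carry;
;     }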

;----------------------------------------

    ALIGN 8

;----------------------------------------

ttmath_addindexed_x64 PROC

    ; rcx = p1
    ; rdx = nSize
    ; r8 = nPos
    ; r9 = nValue

    xor rax, rax ; rax = result
    sub rdx, r8 ; rdx = remaining count of uints

    add qword ptr [rcx + r8 * 8], r9
    jc next1

    ret

next1:
    mov r9, 1

    ALIGN 16
loop1:
    dec rdx
    jz done_with_cy
    lea r8, [r8+1]
    add qword ptr [rcx + r8 * 8], r9
    jc loop1

    ret

done_with_cy:
    lea rax, [rax+1] ; rax = 1

    ret

ttmath_addindexed_x64 ENDP
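
; a C++ sketch of the routine above (for illustration only, not the library's
; actual code): add nValue at position nPos and propagate the carry upwards;
; the return value is 1 only if a carry leaves the table.
;
;     #include <cstdint>
;
;     uint64_t addindexed_ref(uint64_t* p1, uint64_t size, uint64_t pos, uint64_t value)
;     {
;         for (uint64_t i = pos; i < size; ++i) {
;             p1[i] += value;
;             if (p1[i] >= value)    // no wrap-around: the carry has been absorbed
;                 return 0;
;             value = 1;             // from now on only a carry of one is propagated
;         }
;         return 1;                  // carry out of the table
;     }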

;----------------------------------------

    ALIGN 8

;----------------------------------------

ttmath_addindexed2_x64 PROC

    ; rcx = p1 (pointer)
    ; rdx = b (value size)
    ; r8 = nPos
    ; r9 = nValue1
    ; [rsp+0x28] = nValue2

    xor rax, rax ; return value
    mov r11, rcx ; table
    sub rdx, r8 ; rdx = remaining count of uints
    mov r10, [rsp+028h] ; r10 = nValue2

    add qword ptr [r11 + r8 * 8], r9
    lea r8, [r8+1]
    lea rdx, [rdx-1]
    adc qword ptr [r11 + r8 * 8], r10
    jc next
    ret

    ALIGN 16
loop1:
    lea r8, [r8+1]
    add qword ptr [r11 + r8 * 8], 1
    jc next
    ret

next:
    dec rdx ; dec does not modify the carry flag
    jnz loop1
    lea rax, [rax+1]
    ret

ttmath_addindexed2_x64 ENDP
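
; a C++ sketch of the routine above (for illustration only, not the library's
; actual code): add nValue1 at nPos and nValue2 at nPos+1, then propagate the
; carry; as in the asm, b - nPos is expected to be at least 2.
;
;     #include <cstdint>
;
;     // adds a + b + c (c is 0 or 1) into *r and returns the carry out
;     static uint64_t addc(uint64_t a, uint64_t b, uint64_t c, uint64_t* r)
;     {
;         uint64_t s = a + b + c;
;         *r = s;
;         return c ? (s <= b) : (s < b);
;     }
;
;     uint64_t addindexed2_ref(uint64_t* p1, uint64_t b, uint64_t pos,
;                              uint64_t v1, uint64_t v2)
;     {
;         uint64_t c = addc(p1[pos],     v1, 0, &p1[pos]);
;         c          = addc(p1[pos + 1], v2, c, &p1[pos + 1]);
;         for (uint64_t i = pos + 2; c != 0 && i < b; ++i)
;             c = addc(p1[i], 0, c, &p1[i]);
;         return c;
;     }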



;----------------------------------------

    ALIGN 8

;----------------------------------------


ttmath_addvector_x64 PROC
    ; rcx = ss1
    ; rdx = ss2
    ; r8 = ss1_size
    ; r9 = ss2_size
    ; [rsp+0x28] = result

    mov r10, [rsp+028h]
    sub r8, r9
    xor r11, r11 ; r11=0, cf=0

    ALIGN 16
    loop1:
    mov rax, qword ptr [rcx + r11 * 8]
    adc rax, qword ptr [rdx + r11 * 8]
    mov qword ptr [r10 + r11 * 8], rax
    inc r11
    dec r9
    jnz loop1

    adc r9, r9 ; r9 has the cf state

    or r8, r8
    jz done

    neg r9 ; setting cf from r9
    mov r9, 0 ; don't use xor here (cf is used)
    loop2:
    mov rax, qword ptr [rcx + r11 * 8]
    adc rax, r9
    mov qword ptr [r10 + r11 * 8], rax
    inc r11
    dec r8
    jnz loop2

    adc r8, r8
    mov rax, r8

    ret

done:
    mov rax, r9
    ret

ttmath_addvector_x64 ENDP
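
; a C++ sketch of the routine above (for illustration only, not the library's
; actual code); as the asm requires, ss2_size >= 1 and ss1_size >= ss2_size:
;
;     #include <cstdint>
;
;     uint64_t addvector_ref(const uint64_t* ss1, const uint64_t* ss2,
;                            uint64_t ss1_size, uint64_t ss2_size, uint64_t* result)
;     {
;         uint64_t c = 0;
;         for (uint64_t i = 0; i < ss2_size; ++i) {
;             uint64_t s = ss1[i] + ss2[i] + c;
;             c = c ? (s <= ss2[i]) : (s < ss2[i]);
;             result[i] = s;
;         }
;         for (uint64_t i = ss2_size; i < ss1_size; ++i) {
;             uint64_t s = ss1[i] + c;
;             c = (c != 0 && s == 0);
;             result[i] = s;
;         }
;         return c;                  // 0 or 1
;     }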


;----------------------------------------

    ALIGN 8

;----------------------------------------

ttmath_sbb_x64 PROC

    ; rcx = p1
    ; rdx = p2
    ; r8 = nCount
    ; r9 = nCarry

    xor rax, rax
    xor r11, r11
    sub rax, r9 ; sets CARRY if r9 != 0

    ALIGN 16
    loop1:
    mov rax, qword ptr [rdx + r11 * 8]
    sbb qword ptr [rcx + r11 * 8], rax
    lea r11, [r11+1]
    dec r8
    jnz loop1

    setc al
    movzx rax, al

    ret

ttmath_sbb_x64 ENDP
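
; a C++ sketch of the routine above (for illustration only, not the library's
; actual code): p1[i] -= p2[i] with a borrow chain, returning the final borrow.
;
;     #include <cstdint>
;
;     uint64_t sbb_ref(uint64_t* p1, const uint64_t* p2, uint64_t n, uint64_t borrow)
;     {
;         borrow = (borrow != 0);
;         for (uint64_t i = 0; i < n; ++i) {
;             uint64_t a = p1[i], b = p2[i];
;             p1[i] = a - b - borrow;
;             borrow = borrow ? (a <= b) : (a < b);
;         }
;         return borrow;
;     }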

;----------------------------------------

    ALIGN 8

;----------------------------------------

ttmath_subindexed_x64 PROC
    ; rcx = p1
    ; rdx = nSize
    ; r8 = nPos
    ; r9 = nValue

    sub rdx, r8 ; rdx = remaining count of uints

    ALIGN 16
loop1:
    sub qword ptr [rcx + r8 * 8], r9
    jnc done

    lea r8, [r8+1]
    mov r9, 1
    dec rdx
    jnz loop1

    mov rax, 1
    ret

done:
    xor rax, rax
    ret

ttmath_subindexed_x64 ENDP
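
; a C++ sketch of the routine above (for illustration only, not the library's
; actual code): subtract nValue at position nPos and propagate the borrow;
; the return value is 1 only if a borrow leaves the table.
;
;     #include <cstdint>
;
;     uint64_t subindexed_ref(uint64_t* p1, uint64_t size, uint64_t pos, uint64_t value)
;     {
;         for (uint64_t i = pos; i < size; ++i) {
;             uint64_t old = p1[i];
;             p1[i] -= value;
;             if (old >= value)      // no borrow: done
;                 return 0;
;             value = 1;             // from now on only a borrow of one is propagated
;         }
;         return 1;                  // borrow out of the table
;     }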



;----------------------------------------

    ALIGN 8

;----------------------------------------

; the same asm code as in ttmath_addvector_x64, only the two 'adc' instructions changed to 'sbb'

ttmath_subvector_x64 PROC
    ; rcx = ss1
    ; rdx = ss2
    ; r8 = ss1_size
    ; r9 = ss2_size
    ; [rsp+0x28] = result

    mov r10, [rsp+028h]
    sub r8, r9
    xor r11, r11 ; r11=0, cf=0

    ALIGN 16
    loop1:
    mov rax, qword ptr [rcx + r11 * 8]
    sbb rax, qword ptr [rdx + r11 * 8]
    mov qword ptr [r10 + r11 * 8], rax
    inc r11
    dec r9
    jnz loop1

    adc r9, r9 ; r9 has the cf state

    or r8, r8
    jz done

    neg r9 ; setting cf from r9
    mov r9, 0 ; don't use xor here (cf is used)
    loop2:
    mov rax, qword ptr [rcx + r11 * 8]
    sbb rax, r9
    mov qword ptr [r10 + r11 * 8], rax
    inc r11
    dec r8
    jnz loop2

    adc r8, r8
    mov rax, r8

    ret

done:
    mov rax, r9
    ret

ttmath_subvector_x64 ENDP
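
; a C++ sketch of the routine above (for illustration only, not the library's
; actual code); as the asm requires, ss2_size >= 1 and ss1_size >= ss2_size:
;
;     #include <cstdint>
;
;     uint64_t subvector_ref(const uint64_t* ss1, const uint64_t* ss2,
;                            uint64_t ss1_size, uint64_t ss2_size, uint64_t* result)
;     {
;         uint64_t c = 0;            // borrow
;         for (uint64_t i = 0; i < ss2_size; ++i) {
;             uint64_t a = ss1[i], b = ss2[i];
;             result[i] = a - b - c;
;             c = c ? (a <= b) : (a < b);
;         }
;         for (uint64_t i = ss2_size; i < ss1_size; ++i) {
;             uint64_t a = ss1[i];
;             result[i] = a - c;
;             c = (c != 0 && a == 0);
;         }
;         return c;                  // 0 or 1
;     }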




;----------------------------------------

    ALIGN 8

;----------------------------------------

ttmath_rcl_x64 PROC
    ; rcx = p1
    ; rdx = b
    ; r8 = nLowestBit

    mov r11, rcx ; table
    xor r10, r10
    neg r8 ; CY set if r8 <> 0

    ALIGN 16
loop1:
    rcl qword ptr [r11 + r10 * 8], 1
    lea r10, [r10+1]
    dec rdx
    jnz loop1

    setc al
    movzx rax, al

    ret

ttmath_rcl_x64 ENDP
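
; a C++ sketch of the routine above (for illustration only, not the library's
; actual code): a one-bit rotate-left through carry across the whole table;
; nLowestBit (zero or not) becomes bit 0 of the lowest word and the bit shifted
; out of the highest word is returned.
;
;     #include <cstdint>
;
;     uint64_t rcl_ref(uint64_t* p1, uint64_t n, uint64_t lowest_bit)
;     {
;         uint64_t c = (lowest_bit != 0);
;         for (uint64_t i = 0; i < n; ++i) {
;             uint64_t out = p1[i] >> 63;        // bit leaving this word
;             p1[i] = (p1[i] << 1) | c;
;             c = out;
;         }
;         return c;
;     }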

;----------------------------------------

    ALIGN 8

;----------------------------------------

ttmath_rcr_x64 PROC
    ; rcx = p1
    ; rdx = nSize
    ; r8 = nLowestBit

    xor r10, r10
    neg r8 ; CY set if r8 <> 0

    ALIGN 16
loop1:
    rcr qword ptr -8[rcx + rdx * 8], 1
    dec rdx
    jnz loop1

    setc al
    movzx rax, al

    ret

ttmath_rcr_x64 ENDP
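
; a C++ sketch of the routine above (for illustration only, not the library's
; actual code): a one-bit rotate-right through carry across the whole table;
; the third argument (called nLowestBit above) is the bit shifted in at the
; most significant end, and the bit shifted out of the lowest word is returned.
;
;     #include <cstdint>
;
;     uint64_t rcr_ref(uint64_t* p1, uint64_t n, uint64_t highest_bit)
;     {
;         uint64_t c = (highest_bit != 0);
;         for (uint64_t i = n; i-- > 0; ) {
;             uint64_t out = p1[i] & 1;          // bit leaving this word
;             p1[i] = (p1[i] >> 1) | (c << 63);
;             c = out;
;         }
;         return c;
;     }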

;----------------------------------------

    ALIGN 8

;----------------------------------------

ttmath_div_x64 PROC

    ; rcx = &Hi
    ; rdx = &Lo
    ; r8 = nDiv

    mov r11, rcx
    mov r10, rdx

    mov rdx, qword ptr [r11]
    mov rax, qword ptr [r10]
    div r8
    mov qword ptr [r10], rdx ; remainder
    mov qword ptr [r11], rax ; quotient

    ret

ttmath_div_x64 ENDP
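
; the div instruction above divides the 128-bit value Hi:Lo by nDiv; the
; quotient is stored back through the first pointer and the remainder through
; the second (Hi must be smaller than nDiv, otherwise div faults).
; a C++ sketch (for illustration only), assuming the _udiv128 intrinsic of
; newer MSVC versions:
;
;     #include <intrin.h>
;     #include <cstdint>
;
;     void div_ref(uint64_t* hi, uint64_t* lo, uint64_t divisor)
;     {
;         uint64_t remainder;
;         uint64_t quotient = _udiv128(*hi, *lo, divisor, &remainder);
;         *hi = quotient;
;         *lo = remainder;
;     }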

;----------------------------------------

    ALIGN 8

;----------------------------------------

ttmath_rcl2_x64 PROC
    ; rcx = p1
    ; rdx = nSize
    ; r8 = bits
    ; r9 = c

    push rbx

    mov r10, rcx ; r10 = p1
    xor rax, rax

    mov rcx, 64
    sub rcx, r8

    mov r11, -1
    shr r11, cl ; r11 = mask

    mov rcx, r8 ; rcx = count of bits

    mov rbx, rax ; rbx = old value = 0
    or r9, r9
    cmovnz rbx, r11 ; if (c) then old value = mask

    mov r9, rax ; r9 = index (0..nSize-1)

    ALIGN 16
loop1:
    rol qword ptr [r10+r9*8], cl
    mov rax, qword ptr [r10+r9*8]
    and rax, r11
    xor qword ptr [r10+r9*8], rax
    or qword ptr [r10+r9*8], rbx
    mov rbx, rax

    lea r9, [r9+1]
    dec rdx

    jnz loop1

    and rax, 1
    pop rbx
    ret

ttmath_rcl2_x64 ENDP
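
; a C++ sketch of the routine above (for illustration only, not the library's
; actual code): shift the whole table left by 'bits' (1..63); if c is not 0,
; ones are shifted in at the bottom; the return value is the lowest of the
; bits shifted out of the highest word.
;
;     #include <cstdint>
;
;     uint64_t rcl2_ref(uint64_t* p1, uint64_t n, uint64_t bits, uint64_t c)
;     {
;         uint64_t move = 64 - bits;
;         uint64_t in   = c ? (~0ULL >> move) : 0;   // bits entering the lowest word
;         uint64_t out  = 0;
;         for (uint64_t i = 0; i < n; ++i) {
;             out   = p1[i] >> move;                 // bits leaving this word
;             p1[i] = (p1[i] << bits) | in;
;             in    = out;
;         }
;         return out & 1;
;     }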

;----------------------------------------

    ALIGN 8

;----------------------------------------

ttmath_rcr2_x64 PROC
    ; rcx = p1
    ; rdx = nSize
    ; r8 = bits
    ; r9 = c

    push rbx
    mov r10, rcx ; r10 = p1
    xor rax, rax

    mov rcx, 64
    sub rcx, r8

    mov r11, -1
    shl r11, cl ; r11 = mask

    mov rcx, r8 ; rcx = count of bits

    mov rbx, rax ; rbx = old value = 0
    or r9, r9
    cmovnz rbx, r11 ; if (c) then old value = mask

    mov r9, rdx ; r9 = index (0..nSize-1)
    lea r9, [r9-1]

    ALIGN 16
loop1:
    ror qword ptr [r10+r9*8], cl
    mov rax, qword ptr [r10+r9*8]
    and rax, r11
    xor qword ptr [r10+r9*8], rax
    or qword ptr [r10+r9*8], rbx
    mov rbx, rax

    lea r9, [r9-1]
    dec rdx

    jnz loop1

    rol rax, 1
    and rax, 1
    pop rbx

    ret

ttmath_rcr2_x64 ENDP
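
; a C++ sketch of the routine above (for illustration only, not the library's
; actual code): shift the whole table right by 'bits' (1..63); if c is not 0,
; ones are shifted in at the top; the return value is the highest of the bits
; shifted out of the lowest word.
;
;     #include <cstdint>
;
;     uint64_t rcr2_ref(uint64_t* p1, uint64_t n, uint64_t bits, uint64_t c)
;     {
;         uint64_t move = 64 - bits;
;         uint64_t in   = c ? (~0ULL << move) : 0;   // bits entering the highest word
;         uint64_t out  = 0;
;         for (uint64_t i = n; i-- > 0; ) {
;             out   = p1[i] << move;                 // bits leaving this word
;             p1[i] = (p1[i] >> bits) | in;
;             in    = out;
;         }
;         return out >> 63;
;     }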

END