ch11_unrolled_asm... 1 .intel_mnemonic
ch11_unrolled_asm... 2 .intel_syntax noprefix
ch11_unrolled_asm... 3
ch11_unrolled_asm... 4 # unrolled comba variant
ch11_unrolled_asm... 5
ch11_unrolled_asm... 6 # Register allocation
ch11_unrolled_asm... 7 # R8 = A0
ch11_unrolled_asm... 8 # R9 = A1
ch11_unrolled_asm... 9 # R10 = A2
ch11_unrolled_asm... 10 # R11 = J,U
ch11_unrolled_asm... 11 # R12 = I in second (higher part) loop of comba
ch11_unrolled_asm... 12 # RBX = N in col = I in comba
ch11_unrolled_asm... 13 # RDI = X
ch11_unrolled_asm... 14 # RSI = Y
ch11_unrolled_asm... 15 # RCX = L at x86_64_comba_unrolled entry, XY later
ch11_unrolled_asm... 16 # RAX, RDX = (temporary)
ch11_unrolled_asm... 17
# gen_col_inner I NIter
#
# Emits (NIter - I + 1) back-to-back copies of one Comba inner-product step:
#
#     A2:A1:A0 := A2:A1:A0 + X(J) * Y(N - J)
#     J        := J + 1
#
# X and Y are indexed as arrays of 64-bit words.
#
# Reads:     rdi = X, rsi = Y, rbx = N, r11 = J
# Updates:   r8 = A0, r9 = A1, r10 = A2, r11 = J
# Clobbers:  rax, rdx, flags
#
# Both arguments must be absolute expressions at expansion time, and
# NIter must not be smaller than I.
.macro gen_col_inner I NIter
    .rept (\NIter) - (\I) + 1
        mov     rax, rbx                    # rax := N
        sub     rax, r11                    # rax := N - J
        mov     rax, [rsi + 8*rax]          # rax := Y(N - J)
        mul     qword ptr [rdi + 8*r11]     # rdx:rax := Y(N - J) * X(J)
        add     r8,  rax                    # A0, C := A0 + rax
        adc     r9,  rdx                    # A1, C := A1 + rdx + C
        adc     r10, 0                      # A2    := A2 + C
        inc     r11                         # J := J + 1
    .endr
.endm
ch11_unrolled_asm... 33
# col_finish
#
# Retires the current column: stores the low accumulator word to XY(N),
# shifts the three-word accumulator down by one word, and steps N.
#
# Reads:   rcx = XY, rbx = N, r8 = A0, r9 = A1, r10 = A2
# Writes:  the 64-bit word at [rcx + 8*rbx], r8, r9, r10, rbx, flags
.macro col_finish
    mov     [rcx + 8*rbx], r8       # XY(N) := A0
    mov     r8,  r9                 # A0    := A1
    mov     r9,  r10                # A1    := A2
    xor     r10d, r10d              # A2    := 0 (32-bit write clears all 64 bits)
    inc     rbx                     # N     := N + 1
.endm
ch11_unrolled_asm... 41
# gen_col NIter
#
# Emits one complete column of the product: the inner-product steps
# generated by gen_col_inner for the range 0 .. NIter (NIter + 1 steps),
# followed by col_finish, which stores the finished word and moves on to
# the next column.
#
# The caller sets r11 (starting J) before each expansion.
.macro gen_col NIter
    gen_col_inner 0, \NIter
    col_finish
.endm
ch11_unrolled_asm... 46
# gen_loop_low L
#
# Emits L columns for the lower half of the product.  Expansion recurses
# first, so the columns belonging to L-1 are emitted ahead of this level's
# own column; that column restarts J at 0 and performs L inner-product
# steps (gen_col with NIter = L-1).  Expands to nothing when L is 0.
.macro gen_loop_low L
    .ifne \L
        gen_loop_low "(\L-1)"
        xor     r11d, r11d          # U := 0 (32-bit write clears all 64 bits)
        gen_col \L-1
    .endif
.endm
ch11_unrolled_asm... 54
# gen_loop_high_inner I L
#
# Emits the upper-half columns for I, I+1, ..., L-1.  Each column first
# increments r12 and copies it into r11 as the starting J, then performs
# L - I inner-product steps (gen_col with NIter = L-1-I).  The macro then
# re-invokes itself with I+1 and expands to nothing once I reaches L.
.macro gen_loop_high_inner I L
    .ifne \L-\I
        inc     r12                 # I := I + 1
        mov     r11, r12            # U := I (starting J of this column)
        gen_col "(\L-1-\I)"
        gen_loop_high_inner "(\I+1)", \L
    .endif
.endm
ch11_unrolled_asm... 63
# gen_loop_high L
#
# Emits the L-1 columns of the upper half of the product by starting
# gen_loop_high_inner at I = 1.  r12 is incremented before each column and
# used as that column's starting J, so it should be 0 when the emitted
# code begins executing.
.macro gen_loop_high L
    gen_loop_high_inner 1, \L
.endm
ch11_unrolled_asm... 67
# Operand length, in 64-bit words, that the unrolled code below is
# generated for.  Any other length passed at run time traps.
# NOTE(review): presumably has to match the caller's Karatsuba threshold;
# confirm against the calling code.
.equiv Karatsuba_Thresh, 32

#-----------------------------------------------------------------------
# x86_64_comba_unrolled
#
# Fully unrolled Comba multiplication, XY := X * Y.  X and Y are read as
# Karatsuba_Thresh 64-bit words each; 2 * Karatsuba_Thresh words are
# written to XY, lowest column first.
#
# ABI:    System V AMD64
# In:     rdi = X, rsi = Y, rdx = XY, rcx = L (must equal Karatsuba_Thresh)
# Out:    memory at XY; no meaningful register result
# Clobb:  rax, rcx, rdx, r8, r9, r10, r11, flags
#         (rbx and r12 are used, but saved and restored)
# Traps:  executes ud2 when L differs from Karatsuba_Thresh
#
# The .type, .size and .note.GNU-stack directives are ELF-specific; the
# un-prefixed global symbol suggests an ELF target (TODO confirm).
#-----------------------------------------------------------------------
    .global x86_64_comba_unrolled
    .type   x86_64_comba_unrolled, @function
x86_64_comba_unrolled:
    # Validate the length before touching the stack, so the trap fires
    # with the caller's stack pointer still intact.
    cmp     rcx, Karatsuba_Thresh
    jne     .Lcomba_unrolled_size_fail

    push    rbx                     # callee-saved, used for N
    push    r12                     # callee-saved, used for I

    mov     rcx, rdx                # RCX := XY
    xor     r12d, r12d              # I  := 0
    xor     r8d,  r8d               # A0 := 0
    xor     r9d,  r9d               # A1 := 0
    xor     r10d, r10d              # A2 := 0
    xor     ebx,  ebx               # N  := 0

    gen_loop_low  Karatsuba_Thresh  # lower-half columns
    gen_loop_high Karatsuba_Thresh  # upper-half columns
    col_finish                      # store the remaining accumulator word as the top word

    pop     r12
    pop     rbx
    ret

.Lcomba_unrolled_size_fail:
    ud2                             # unsupported operand length
    .size   x86_64_comba_unrolled, . - x86_64_comba_unrolled

    # Mark the object as not requiring an executable stack.
    .pushsection .note.GNU-stack, "", @progbits
    .popsection