# GNU as source: Intel mnemonics, Intel operand order, unprefixed registers.
.intel_mnemonic
.intel_syntax noprefix
ch11_unrolled_asm...    3 
# Unrolled Comba multiplication variant.

# Register allocation
# R8  = A0
# R9  = A1
# R10 = A2
# R11 = J, U
# R12 = I in the second (higher part) loop of comba
# RBX = N in col = I in comba
# RDI = X
# RSI = Y
# RCX = L at x86_64_comba_unrolled entry, XY later
# RAX, RDX = temporaries (RDX carries XY at entry, before it is moved to RCX)
ch11_unrolled_asm...   17 
#-----------------------------------------------------------------------
# gen_col_inner I NIter
#
# Emits (NIter - I + 1) back-to-back copies of one Comba inner step:
#
#     A2:A1:A0 := A2:A1:A0 + X(J) * Y(N - J);  J := J + 1
#
# In:    rdi = X'Address, rsi = Y'Address, rbx = N, r11 = J,
#        r10:r9:r8 = A2:A1:A0
# Out:   r10:r9:r8 updated, r11 advanced once per emitted step
# Clobb: rax, rdx, flags
#
# I is only the starting value of the assemble-time counter (gen_col
# passes 0); it never appears in the emitted instructions.  The copies
# are produced with .rept instead of macro recursion, and I > NIter is
# rejected at assembly time instead of recursing without bound.
#-----------------------------------------------------------------------
.macro gen_col_inner I NIter
.iflt (\NIter) - (\I)
.error "gen_col_inner: I must not exceed NIter"
.else
.rept (\NIter) - (\I) + 1
        lea     rdx, [rsi + 8*rbx]      # rdx := Y'Address + 8*N
        lea     rax, [8*r11]            # rax := 8*J
        sub     rdx, rax                # rdx := address of Y(N - J)
        mov     rdx, [rdx]              # rdx := Y(N - J)
        mov     rax, [rdi + 8*r11]      # rax := X(J)
        mul     rdx                     # rdx:rax := X(J) * Y(N - J)
        add     r8,  rax                # A0 += low word, carry out in CF
        adc     r9,  rdx                # A1 += high word + CF
        adc     r10, 0                  # A2 += CF
        inc     r11                     # J := J + 1
.endr
.endif
.endm
ch11_unrolled_asm...   33 
#-----------------------------------------------------------------------
# col_finish
#
# Closes the current column: stores the low accumulator word as XY(N),
# shifts the three-word accumulator down by one word and advances N.
#
# In:    rcx = XY'Address, rbx = N, r10:r9:r8 = A2:A1:A0
# Out:   r8 = old A1, r9 = old A2, r10 = 0, rbx = N + 1
# Clobb: flags
#-----------------------------------------------------------------------
.macro col_finish
        mov     [rcx + 8*rbx], r8       # XY(N) := A0
        mov     r8,  r9                 # A0 := A1
        mov     r9,  r10                # A1 := A2
        xor     r10, r10                # A2 := 0
        inc     rbx                     # N  := N + 1
.endm
ch11_unrolled_asm...   41 
#-----------------------------------------------------------------------
# gen_col NIter
#
# Emits one complete column: NIter + 1 inner multiply-accumulate steps
# (gen_col_inner counting from 0 up to NIter) followed by col_finish.
# Register usage is that of gen_col_inner and col_finish.
#-----------------------------------------------------------------------
.macro gen_col NIter
        gen_col_inner 0 \NIter
        col_finish
.endm
ch11_unrolled_asm...   46 
#-----------------------------------------------------------------------
# gen_loop_low L
#
# Emits the first L columns of the product.  The macro first expands
# itself for L-1 and then appends one more column, so the columns come
# out in ascending order; the k-th emitted column (k = 1 .. L) resets
# U (r11) to 0 and accumulates k products before col_finish.
# Expands to nothing when L is 0.
#-----------------------------------------------------------------------
.macro gen_loop_low L
.if \L
        gen_loop_low "(\L-1)"
        xor     r11, r11                # U := 0
        gen_col "(\L-1)"
.endif
.endm
ch11_unrolled_asm...   54 
#-----------------------------------------------------------------------
# gen_loop_high_inner I L
#
# Emits the columns of the upper half of the product for the
# assemble-time counter values I, I+1, ..., L-1.  Each column first
# bumps the run-time counter in r12, copies it to r11 as the starting
# U of the column, and then accumulates L - I products (gen_col with
# L-1-I).  Expands to nothing once I reaches L.
#-----------------------------------------------------------------------
.macro gen_loop_high_inner I L
.if (\L) - (\I)
        inc     r12                     # I := I + 1
        mov     r11, r12                # U := I (U in col)
        gen_col "(\L-1-\I)"
        gen_loop_high_inner "(\I+1)" \L
.endif
.endm
ch11_unrolled_asm...   63 
#-----------------------------------------------------------------------
# gen_loop_high L
#
# Emits the upper-half columns by starting gen_loop_high_inner at I = 1,
# which yields L - 1 columns.
#-----------------------------------------------------------------------
.macro gen_loop_high L
        gen_loop_high_inner 1 \L
.endm
ch11_unrolled_asm...   67 
# The only operand length, in 8-byte words, that the routine below accepts.
.equiv Karatsuba_Thresh, 32

#-----------------------------------------------------------------------
# x86_64_comba_unrolled -- fully unrolled Comba multiplication
#
# Multiplies the L-word operands X and Y and stores the 2*L-word result
# in XY(0 .. 2*L-1).  Only L = Karatsuba_Thresh is supported; any other
# value of L ends in ud2.
#
# In:    rdi = X, rsi = Y, rdx = XY, rcx = L
# Saved: rbx, r12 (pushed on entry, popped before ret)
# Clobb: rax, rcx, rdx, r8, r9, r10, r11, flags
#-----------------------------------------------------------------------
.global x86_64_comba_unrolled
x86_64_comba_unrolled:
        push    rbx                     # callee-saved; used as N
        push    r12                     # callee-saved; used as I of the high loop

        # NOTE(review): all 64 bits of rcx are compared; this assumes the
        # caller passes L as a full 64-bit value -- confirm against the
        # caller's declaration.
        cmp     rcx, Karatsuba_Thresh
        jne     size_fail

        mov     rcx, rdx                # RCX := XY (mul overwrites rdx)
        xor     rbx, rbx                # N   := 0
        xor     r8,  r8                 # A0  := 0
        xor     r9,  r9                 # A1  := 0
        xor     r10, r10                # A2  := 0
        xor     r12, r12                # I   := 0

        # Lower-half columns, then upper-half columns, then the top word.
        gen_loop_low Karatsuba_Thresh
        gen_loop_high Karatsuba_Thresh
        col_finish

        pop     r12
        pop     rbx
        ret

# Reached only when L differs from Karatsuba_Thresh: trap.
size_fail:
        ud2