.intel_mnemonic
.intel_syntax noprefix

# unrolled comba variant

# Register allocation
# R8       = A0
# R9       = A1
# R10      = A2
# R11      = J, U
# R12      = I in second (higher part) loop of comba
# RBX      = N in col = I in comba
# RDI      = X
# RSI      = Y
# RCX      = L at x86_64_comba_unrolled entry, XY later
# RAX, RDX = (temporary)

.macro gen_col_inner I NIter
.if \NIter - \I
	gen_col_inner "(\I + 1)" \NIter
.endif
	lea rdx, [rsi + 8*rbx]     # rdx := Y'Address + N*8
	lea rax, [8*r11]           # rax := 8*j
	sub rdx, rax               # rdx := rdx - j*8
	mov rdx, [rdx]             # rdx := *(rdx)
	mov rax, [rdi + 8*r11]     # rax := X(j) := *(X'Address + j*8)
	mul rdx                    # rdx:rax := rax*rdx
	add r8, rax                # A0, C := A0 + rax
	adc r9, rdx                # A1, C := A1 + rdx + C
	adc r10, 0                 # A2, [C=0] := A2 + 0 + C
	inc r11                    # J := J + 1
.endm

.macro col_finish
	mov [rcx + 8*rbx], r8      # XY(N) := A0
	mov r8, r9                 # A0 := A1
	mov r9, r10                # A1 := A2
	xor r10, r10               # A2 := 0
	inc rbx                    # N := N + 1
.endm

.macro gen_col NIter
	gen_col_inner 0 \NIter
	col_finish
.endm

.macro gen_loop_low L
.if \L
	gen_loop_low "(\L-1)"
	xor r11, r11               # U := 0
	gen_col "(\L-1)"
.endif
.endm

.macro gen_loop_high_inner I L
.if \L-\I
	inc r12                    # I := I + 1
	mov r11, r12               # U := I (U in col)
	gen_col "(\L-1-\I)"
	gen_loop_high_inner "(\I+1)" \L
.endif
.endm

.macro gen_loop_high L
	gen_loop_high_inner 1 \L
.endm

.equiv Karatsuba_Thresh, 32

# Arguments
# RDI: X
# RSI: Y
# RDX: XY
# RCX: L
.global x86_64_comba_unrolled
x86_64_comba_unrolled:
	push rbx
	push r12

	cmp rcx, Karatsuba_Thresh
	jne size_fail

	mov rcx, rdx               # RCX := XY
	xor r12, r12               # I := 0
	xor r8, r8                 # A0 := 0
	xor r9, r9                 # A1 := 0
	xor r10, r10               # A2 := 0
	xor rbx, rbx               # N := 0

	gen_loop_low Karatsuba_Thresh
	gen_loop_high Karatsuba_Thresh
	col_finish

	pop r12
	pop rbx
	ret

size_fail:
	ud2
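
# ---------------------------------------------------------------------------
# Reference sketch (kept in comments so this file still assembles): a hedged
# C equivalent of what the macros above expand to, assuming 64-bit limbs, an
# L-limb X and Y, and a 2*L-limb XY.  It only illustrates the column scheme
# (three-word accumulator A2:A1:A0, low columns N = 0..L-1 with j = 0..N,
# high columns N = L..2L-2 with j = N-L+1..L-1, trailing store of the last
# column); the name comba_mul and the use of unsigned __int128 are
# illustrative assumptions, not part of this file's interface.
#
#   #include <stddef.h>
#   #include <stdint.h>
#   typedef unsigned __int128 u128;                    /* GCC/Clang extension */
#
#   void comba_mul(const uint64_t *x, const uint64_t *y, uint64_t *xy, size_t L)
#   {
#       uint64_t a0 = 0, a1 = 0, a2 = 0;               /* accumulator: R8, R9, R10 */
#       for (size_t n = 0; n + 1 < 2 * L; n++) {       /* one pass per column      */
#           size_t j  = n < L ? 0 : n - L + 1;         /* first j of this column   */
#           size_t hi = n < L ? n : L - 1;             /* last j of this column    */
#           for (; j <= hi; j++) {                     /* gen_col_inner body       */
#               u128 p = (u128)x[j] * y[n - j];        /* mul rdx                  */
#               u128 s = (u128)a0 + (uint64_t)p;       /* add r8, rax              */
#               a0 = (uint64_t)s;
#               s  = (u128)a1 + (uint64_t)(p >> 64)
#                             + (uint64_t)(s >> 64);   /* adc r9, rdx              */
#               a1 = (uint64_t)s;
#               a2 += (uint64_t)(s >> 64);             /* adc r10, 0               */
#           }
#           xy[n] = a0;                                /* col_finish               */
#           a0 = a1; a1 = a2; a2 = 0;
#       }
#       xy[2 * L - 1] = a0;                            /* trailing col_finish      */
#   }
# ---------------------------------------------------------------------------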