# ----------------------------------------------------------------------
# Comba (column-wise) multiplication of two equal-length multiword
# integers, x86-64, GNU as, Intel syntax.
#
# C-equivalent signature (SysV AMD64 ABI):
#   void x86_64_comba(const uint64_t *X, const uint64_t *Y,
#                     uint64_t *XY, uint64_t L);
#
# In:    RDI = X   array of L 64-bit words, least significant word first
#        RSI = Y   array of L 64-bit words, least significant word first
#        RDX = XY  array of 2*L 64-bit words receiving X*Y
#        RCX = L   number of words in X and in Y
# Out:   XY(0 .. 2*L-1) written; nothing is returned in a register.
#        When L = 0 the routine returns without touching memory.
# Clobb: RAX, RCX, RDX, R8, R9, R10, R11, flags
#        (RBX, R12, R13 are used but saved and restored).
# Note:  XY words are stored while X and Y are still being read, so XY
#        must not overlap X or Y.
#
# Register allocation inside the routine
#   R8  = A0  \
#   R9  = A1   > three-word column accumulator (A0 least significant)
#   R10 = A2  /
#   R11 = J   running term index within a column (starts at U)
#   R12 = V   last term index of the current column
#   RBX = N   current column index (I in the outer loops)
#   RDI = X
#   RSI = Y
#   R13 = XY  (moved out of RDX because MUL overwrites RDX)
#   RCX = upper bound of the current outer loop
#   RAX, RDX = temporaries (MUL operands / result)
# ----------------------------------------------------------------------

        .intel_mnemonic
        .intel_syntax noprefix

        .text

# ----------------------------------------------------------------------
# .Lcol -- private helper, NOT ABI conformant (register interface above).
#
# Accumulates  sum over J in R11 .. R12 of  X(J) * Y(N - J)  into
# A2:A1:A0, stores A0 to XY(N), then shifts the accumulator right by
# one word (A0 := A1, A1 := A2, A2 := 0).
# Clobbers RAX, RDX, R11 and flags.
# ----------------------------------------------------------------------
.Lcol:
        jmp     .Lcol_test              # top-tested loop, test kept at the bottom
.Lcol_term:
        mov     rax, rbx                # rax := N
        sub     rax, r11                # rax := N - J
        mov     rax, [rsi + 8*rax]      # rax := Y(N - J)
        mul     qword ptr [rdi + 8*r11] # rdx:rax := Y(N - J) * X(J)
        add     r8, rax                 # A0, C := A0 + low word
        adc     r9, rdx                 # A1, C := A1 + high word + C
        adc     r10, 0                  # A2    := A2 + C
        inc     r11                     # J := J + 1
.Lcol_test:
        cmp     r11, r12                # indices are unsigned
        jbe     .Lcol_term              # continue while J <= V

        mov     [r13 + 8*rbx], r8       # XY(N) := A0
        mov     r8, r9                  # A0 := A1
        mov     r9, r10                 # A1 := A2
        xor     r10d, r10d              # A2 := 0 (32-bit write clears all 64 bits)
        ret

# ----------------------------------------------------------------------
# x86_64_comba -- see the file header for the full contract.
# ----------------------------------------------------------------------
        .global x86_64_comba
x86_64_comba:
        test    rcx, rcx                # L = 0: XY has no elements, so there
        jz      .Lcomba_ret             # is nothing to compute or to store

        push    rbx                     # callee-saved registers used below;
        push    r12                     # three pushes plus the return address
        push    r13                     # leave RSP 16-byte aligned at each call

        mov     r13, rdx                # MUL overwrites RDX, keep XY in R13
        xor     r8d, r8d                # A0 := 0
        xor     r9d, r9d                # A1 := 0
        xor     r10d, r10d              # A2 := 0
        xor     ebx, ebx                # I  := 0

        # Lower half: columns I = 0 .. L-1, terms J = 0 .. I.
        # L >= 1 here, so the body always runs at least once.
.Llower:
        xor     r11d, r11d              # U := 0
        mov     r12, rbx                # V := I
        call    .Lcol
        inc     rbx                     # I := I + 1
        cmp     rbx, rcx
        jb      .Llower                 # continue while I < L

        # Upper half: columns I = L .. 2*L-2, terms J = I-L+1 .. L-1.
        # RBX = L on entry; the range is empty when L = 1.
        lea     r12, [rcx - 1]          # V := L - 1, fixed for every column
        lea     rcx, [r12 + r12]        # RCX := 2*(L - 1), the last column
        jmp     .Lupper_test
.Lupper:
        mov     r11, rbx
        sub     r11, r12                # U := I - V = I - L + 1
        call    .Lcol                   # V already holds L - 1
        inc     rbx                     # I := I + 1
.Lupper_test:
        cmp     rbx, rcx
        jbe     .Lupper                 # continue while I <= 2*L - 2

        mov     [r13 + 8*rbx], r8       # XY(2*L - 1) := remaining low word A0

        pop     r13
        pop     r12
        pop     rbx
.Lcomba_ret:
        ret