tree checksum vpatch file split hunks
all signers: asciilifeform bvt diana_coman
antecedents: ffa_ch11_tuning_and_api.kv
press order:
patch:
(24 . 7)(24 . 7)
10 type Mode_Type is ("debug", "release");
11 Mode : Mode_Type := external ("mode", "release");
12
13 for Languages use ("Ada");
14 for Languages use ("Ada", "Asm");
15 for Source_Dirs use (".");
16 for Library_Dir use "lib";
17 for Library_Name use "FFA";
- A819415BC60308FE0B550EEE13DA2D6D4819064E4D336E9766E65A63768AEF24AC7DB9F5AC238725587F4660C2992AE17ABC85EF5047D62BEB04BA2C12D1172A(25 . 6)(25 . 24)- E85E9FC6E391E1332EC7AA9BBF4331BBA8E462D5C1996B497696C12BB26097F1A3E4F97A342C868C30534FF648D12C2989875ADDE9233507A656C2B28742418F
22
23 package body FZ_Mul is
24
25 -- Comba's multiplier fastpath: hands the operands to the imported x86_64_comba_unrolled routine. (CAUTION: UNBUFFERED)
26 procedure FZ_Mul_Comba_Fast(X : in FZ;
27 Y : in FZ;
28 XY : out FZ)
29 is
30 procedure Asm_Comba(X : in FZ; -- local binding for the external routine named in the Import pragma below
31 Y : in FZ;
32 XY : out FZ;
33 L : in Word_Index); -- operand length in words; the call below passes X'Length
34 pragma Import (C, Asm_Comba, "x86_64_comba_unrolled");
35 begin
36 pragma Assert(X'Length = Karatsuba_Thresh and -- operand sizes must match the threshold exactly
37 Y'Length = Karatsuba_Thresh and
38 XY'Length = 2*Karatsuba_Thresh); -- product is twice the operand length
39 Asm_Comba(X, Y, XY, X'Length); -- the assembly routine compares this length to its own Karatsuba_Thresh and traps on mismatch
40 end FZ_Mul_Comba_Fast;
41
42
43 -- Comba's multiplier. (CAUTION: UNBUFFERED)
44 procedure FZ_Mul_Comba(X : in FZ;
45 Y : in FZ;
(235 . 11)(253 . 14)
47
48 begin
49
50 if L <= Karatsuba_Thresh then
51
52 -- Base case:
53 FZ_Mul_Comba(X, Y, XY);
54 if L = Karatsuba_Thresh then
55
56 -- Optimized case:
57 FZ_Mul_Comba_Fast(X, Y, XY);
58 elsif L < Karatsuba_Thresh then
59
60 -- Base case
61 FZ_Mul_Comba(X, Y, XY);
62 else
63
64 -- Recursive case:
(25 . 7)(25 . 9)-
69 pragma Pure;
70
71 -- Karatsuba Threshold - at or below this many Words, we use Comba mult.
72 Karatsuba_Thresh : constant Indices := 8;
73 -- Edit the Karatsuba_Thresh in x86_64_comba.s as well after changing this
74 -- value.
75 Karatsuba_Thresh : constant Indices := 32;
76
77 -- Multiply. (CAUTION: UNBUFFERED)
78 procedure FZ_Multiply_Unbuffered(X : in FZ;
(33 . 6)(35 . 12)
80 XY : out FZ);
81 pragma Inline_Always(FZ_Multiply_Unbuffered);
82
83 -- Comba's multiplier in assembly (fastpath). (CAUTION: UNBUFFERED)
84 procedure FZ_Mul_Comba_Fast(X : in FZ; -- the body asserts X'Length = Karatsuba_Thresh
85 Y : in FZ; -- the body asserts Y'Length = Karatsuba_Thresh
86 XY : out FZ); -- the body asserts XY'Length = 2*Karatsuba_Thresh
87 pragma Inline_Always(FZ_Mul_Comba_Fast);
88
89 -- Comba's multiplier. (CAUTION: UNBUFFERED)
90 procedure FZ_Mul_Comba(X : in FZ;
91 Y : in FZ;
(0 . 0)(1 . 99)
96 .intel_mnemonic
97 .intel_syntax noprefix
98
99 # unrolled comba variant
100
101 # Register allocation
102 # R8 = A0
103 # R9 = A1
104 # R10 = A2
105 # R11 = J,U
106 # R12 = I in second (higher part) loop of comba
107 # RBX = N in col = I in comba
108 # RDI = X
109 # RSI = Y
110 # RCX = L at x86_64_comba_unrolled entry, XY later
111 # RAX, RDX = (temporary; RDX carries XY at entry until it is copied to RCX)
112
113 .macro gen_col_inner I NIter
114 .if \NIter - \I # while I has not reached NIter, first emit the copies for I + 1 .. NIter
115 gen_col_inner "(\I + 1)" \NIter
116 .endif
117 lea rdx, [rsi + 8*rbx] # rdx := Y'Address + N*8
118 lea rax, [8*r11] # rax := 8*j
119 sub rdx, rax # rdx := rdx - j*8 = Y'Address + (N - j)*8
120 mov rdx, [rdx] # rdx := Y(N - j) := *(rdx)
121 mov rax, [rdi + 8*r11] # rax := X(j) := *(X'Address + j*8)
122 mul rdx # rdx:rax := rax*rdx = X(j) * Y(N - j)
123 add r8, rax # A0, C := A0 + rax
124 adc r9, rdx # A1, C := A1 + rdx + C
125 adc r10, 0 # A2, [C=0] := A2 + 0 + C
126 inc r11 # J := J + 1
127 .endm
128
129 .macro col_finish
130 mov [rcx + 8*rbx], r8 # XY(N) := A0 (store the finished column word)
131 mov r8, r9 # A0 := A1 (shift the three-word accumulator down one word)
132 mov r9, r10 # A1 := A2
133 xor r10, r10 # A2 := 0
134 inc rbx # N := N + 1 (advance to the next column)
135 .endm
136
137 .macro gen_col NIter
138 gen_col_inner 0 \NIter # NIter + 1 multiply-accumulate steps (I = 0 .. NIter)
139 col_finish # store XY(N), shift the accumulator, N := N + 1
140 .endm
141
142 .macro gen_loop_low L
143 .if \L # recursion ends at L = 0, which emits nothing
144 gen_loop_low "(\L-1)" # emit the shorter, earlier columns first
145 xor r11, r11 # U := 0
146 gen_col \L-1 # one column made of L products
147 .endif
148 .endm
149
150 .macro gen_loop_high_inner I L
151 .if \L-\I # stop once I reaches L
152 inc r12 # I := I + 1
153 mov r11, r12 # U := I (U in col)
154 gen_col "(\L-1-\I)" # one column made of L - I products
155 gen_loop_high_inner "(\I+1)" \L # then emit the next, shorter column
156 .endif
157 .endm
158
159 .macro gen_loop_high L
160 gen_loop_high_inner 1 \L # high half: I runs from 1 up to L - 1
161 .endm
162
163 .equiv Karatsuba_Thresh, 32 # the Ada spec carries a constant of the same name that its comment says to change together with this one
164
165 # Arguments
166 # RDI: X
167 # RSI: Y
168 # RDX: XY
169 # RCX: L
170 .global x86_64_comba_unrolled
171 x86_64_comba_unrolled:
172 push rbx # RBX and R12 are used below; saved here, restored before ret
173 push r12
174
175 cmp rcx, Karatsuba_Thresh # the unrolled code is generated for exactly Karatsuba_Thresh words
176 jne size_fail # any other L traps instead of multiplying
177
178 mov rcx, rdx # RCX := XY
179 xor r12, r12 # I := 0 (incremented before each high-half column)
180 xor r8, r8 # A0 := 0
181 xor r9, r9 # A1 := 0
182 xor r10, r10 # A2 := 0
183 xor rbx, rbx # N := 0
184
185 gen_loop_low Karatsuba_Thresh # columns N = 0 .. L-1
186 gen_loop_high Karatsuba_Thresh # columns N = L .. 2L-2
187 col_finish # XY(2L-1) := A0 (last accumulated word)
188
189 pop r12
190 pop rbx
191 ret
192
193 size_fail:
194 ud2 # undefined-instruction trap; RBX/R12 are still on the stack here