Skip to content

Commit 2283fcb

Browse files
author
Rajalakshmi Srinivasaraghavan
committed
POWER10: Reduce sgemm loop unrolling
With GCC 14, unnecessary move and lxvp instructions appear when the inner loop is unrolled for larger sizes. Reducing the loop unroll factor restores performance to the level seen with GCC 11.
1 parent e4344de commit 2283fcb

File tree

1 file changed

+1
-111
lines changed

1 file changed

+1
-111
lines changed

kernel/power/sgemm_kernel_power10.c

Lines changed: 1 addition & 111 deletions
Original file line number | Diff line number | Diff line change
@@ -245,118 +245,8 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
245 245
AO += 16;
246 246
BO += 8;
247 247
temp--;
248-
BLASLONG K = temp / 64;
248+
BLASLONG K = temp / 16;
249 249
for (l = 0; l < K; l++)
250-
{
251-
vec_t *rowA = (vec_t *) & AO[0];
252-
vec_t *rowB = (vec_t *) & BO[0];
253-
KERNEL (0, 0);
254-
KERNEL (2, 4);
255-
KERNEL (4, 8);
256-
KERNEL (6, 12);
257-
KERNEL (8, 16);
258-
KERNEL (10, 20);
259-
KERNEL (12, 24);
260-
KERNEL (14, 28);
261-
KERNEL (16, 32);
262-
KERNEL (18, 36);
263-
KERNEL (20, 40);
264-
KERNEL (22, 44);
265-
KERNEL (24, 48);
266-
KERNEL (26, 52);
267-
KERNEL (28, 56);
268-
KERNEL (30, 60);
269-
KERNEL (32, 64);
270-
KERNEL (34, 68);
271-
KERNEL (36, 72);
272-
KERNEL (38, 76);
273-
KERNEL (40, 80);
274-
KERNEL (42, 84);
275-
KERNEL (44, 88);
276-
KERNEL (46, 92);
277-
KERNEL (48, 96);
278-
KERNEL (50, 100);
279-
KERNEL (52, 104);
280-
KERNEL (54, 108);
281-
KERNEL (56, 112);
282-
KERNEL (58, 116);
283-
KERNEL (60, 120);
284-
KERNEL (62, 124);
285-
KERNEL (64, 128);
286-
KERNEL (66, 132);
287-
KERNEL (68, 136);
288-
KERNEL (70, 140);
289-
KERNEL (72, 144);
290-
KERNEL (74, 148);
291-
KERNEL (76, 152);
292-
KERNEL (78, 156);
293-
KERNEL (80, 160);
294-
KERNEL (82, 164);
295-
KERNEL (84, 168);
296-
KERNEL (86, 172);
297-
KERNEL (88, 176);
298-
KERNEL (90, 180);
299-
KERNEL (92, 184);
300-
KERNEL (94, 188);
301-
KERNEL (96, 192);
302-
KERNEL (98, 196);
303-
KERNEL (100, 200);
304-
KERNEL (102, 204);
305-
KERNEL (104, 208);
306-
KERNEL (106, 212);
307-
KERNEL (108, 216);
308-
KERNEL (110, 220);
309-
KERNEL (112, 224);
310-
KERNEL (114, 228);
311-
KERNEL (116, 232);
312-
KERNEL (118, 236);
313-
KERNEL (120, 240);
314-
KERNEL (122, 244);
315-
KERNEL (124, 248);
316-
KERNEL (126, 252);
317-
AO += 1024;
318-
BO += 512;
319-
}
320-
if ((temp & 63) >> 5)
321-
{
322-
vec_t *rowA = (vec_t *) & AO[0];
323-
vec_t *rowB = (vec_t *) & BO[0];
324-
KERNEL (0, 0);
325-
KERNEL (2, 4);
326-
KERNEL (4, 8);
327-
KERNEL (6, 12);
328-
KERNEL (8, 16);
329-
KERNEL (10, 20);
330-
KERNEL (12, 24);
331-
KERNEL (14, 28);
332-
KERNEL (16, 32);
333-
KERNEL (18, 36);
334-
KERNEL (20, 40);
335-
KERNEL (22, 44);
336-
KERNEL (24, 48);
337-
KERNEL (26, 52);
338-
KERNEL (28, 56);
339-
KERNEL (30, 60);
340-
KERNEL (32, 64);
341-
KERNEL (34, 68);
342-
KERNEL (36, 72);
343-
KERNEL (38, 76);
344-
KERNEL (40, 80);
345-
KERNEL (42, 84);
346-
KERNEL (44, 88);
347-
KERNEL (46, 92);
348-
KERNEL (48, 96);
349-
KERNEL (50, 100);
350-
KERNEL (52, 104);
351-
KERNEL (54, 108);
352-
KERNEL (56, 112);
353-
KERNEL (58, 116);
354-
KERNEL (60, 120);
355-
KERNEL (62, 124);
356-
AO += 512;
357-
BO += 256;
358-
}
359-
if ((temp & 31) >> 4)
360 250
{
361 251
vec_t *rowA = (vec_t *) & AO[0];
362 252
vec_t *rowB = (vec_t *) & BO[0];

0 commit comments

Comments
 (0)