Skip to content

Commit ee2f19b

Browse files
committed
Fix inaccuracy in cunroll/cunrolli when determining which loop is innermost.
r15-919-gef27b91b62c3aa removed the 1 / 3 size reduction for the innermost loop, but it doesn't accurately remember what's "innermost" for 2 testcases in PR117888. 1) For pass_cunroll, the "innermost" loop could be an originally outer loop whose inner loop was completely unrolled by cunrolli. The patch moves the local variable cunrolli to a parameter of tree_unroll_loops_completely and passes it directly from the execute method of the pass. 2) For pass_cunrolli, cunrolli is set to false when the sibling loop of an innermost loop is completely unrolled, and it inaccurately takes the innermost loop as an "outer" loop. The patch adds another parameter, innermost, to help recognize the "original" innermost loop. gcc/ChangeLog: PR tree-optimization/117888 * tree-ssa-loop-ivcanon.cc (try_unroll_loop_completely): Use cunrolli instead of cunrolli && !loop->inner to check if it's the innermost loop. (canonicalize_loop_induction_variables): Add new parameter const_sbitmap innermost, and pass cunrolli && (unsigned) loop->num < SBITMAP_SIZE (innermost) && bitmap_bit_p (innermost, loop->num) as "cunrolli" to try_unroll_loop_completely. (canonicalize_induction_variables): Pass innermost to canonicalize_loop_induction_variables. (tree_unroll_loops_completely_1): Add new parameter const_sbitmap innermost. (tree_unroll_loops_completely): Move local variable cunrolli to parameter to indicate it's from pass cunrolli, also track all "original" innermost loops at the beginning. gcc/testsuite/ChangeLog: * gcc.dg/pr117888-2.c: New test. * gcc.dg/vect/pr117888-1.c: Ditto. * gcc.dg/tree-ssa/pr83403-1.c: Add --param max-completely-peeled-insns=300 for arm*-*-*. * gcc.dg/tree-ssa/pr83403-2.c: Ditto.
1 parent d5b3d9e commit ee2f19b

File tree

5 files changed

+151
-19
lines changed

5 files changed

+151
-19
lines changed

gcc/testsuite/gcc.dg/pr117888-2.c

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
/* { dg-do compile } */
2+
/* { dg-options "-O3 -funroll-loops -fno-tree-vectorize -fdump-tree-cunroll-details" } */
3+
4+
typedef struct {
5+
double real;
6+
double imag;
7+
} complex;
8+
9+
typedef struct { complex e[3][3]; } su3_matrix;
10+
11+
void mult_su3_nn( su3_matrix *a, su3_matrix *b, su3_matrix *c )
12+
{
13+
int i,j;
14+
double t,ar,ai,br,bi,cr,ci;
15+
for(i=0;i<3;i++)
16+
for(j=0;j<3;j++){
17+
18+
ar=a->e[i][0].real; ai=a->e[i][0].imag;
19+
br=b->e[0][j].real; bi=b->e[0][j].imag;
20+
cr=ar*br; t=ai*bi; cr -= t;
21+
ci=ar*bi; t=ai*br; ci += t;
22+
23+
ar=a->e[i][1].real; ai=a->e[i][1].imag;
24+
br=b->e[1][j].real; bi=b->e[1][j].imag;
25+
t=ar*br; cr += t; t=ai*bi; cr -= t;
26+
t=ar*bi; ci += t; t=ai*br; ci += t;
27+
28+
ar=a->e[i][2].real; ai=a->e[i][2].imag;
29+
br=b->e[2][j].real; bi=b->e[2][j].imag;
30+
t=ar*br; cr += t; t=ai*bi; cr -= t;
31+
t=ar*bi; ci += t; t=ai*br; ci += t;
32+
33+
c->e[i][j].real=cr;
34+
c->e[i][j].imag=ci;
35+
}
36+
}
37+
/* { dg-final { scan-tree-dump-times "optimized: loop with 2 iterations completely unrolled" 1 "cunroll" { target i?86-*-* x86_64-*-* } } } */

gcc/testsuite/gcc.dg/tree-ssa/pr83403-1.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
/* { dg-do compile } */
22
/* { dg-options "-O3 -funroll-loops -fdump-tree-lim2-details" } */
33
/* { dg-additional-options "--param max-completely-peeled-insns=200" { target { s390*-*-* } } } */
4+
/* { dg-additional-options "--param max-completely-peeled-insns=300" { target { arm*-*-* } } } */
45

56
#define TYPE unsigned int
67

gcc/testsuite/gcc.dg/tree-ssa/pr83403-2.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
/* { dg-do compile } */
22
/* { dg-options "-O3 -funroll-loops -fdump-tree-lim2-details" } */
33
/* { dg-additional-options "--param max-completely-peeled-insns=200" { target { s390*-*-* } } } */
4+
/* { dg-additional-options "--param max-completely-peeled-insns=300" { target { arm*-*-* } } } */
45

56
#define TYPE int
67

gcc/testsuite/gcc.dg/vect/pr117888-1.c

Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,71 @@
1+
/* { dg-do compile } */
2+
/* { dg-options "-O3 -funroll-loops -fdump-tree-vect-details" } */
3+
/* { dg-require-effective-target vect_int } */
4+
/* { dg-require-effective-target vect_shift } */
5+
/* { dg-additional-options "-mavx2" { target x86_64-*-* i?86-*-* } } */
6+
/* { dg-additional-options "--param max-completely-peeled-insns=200" { target powerpc64*-*-* } } */
7+
8+
typedef unsigned short ggml_fp16_t;
9+
static float table_f32_f16[1 << 16];
10+
11+
inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) {
12+
unsigned short s;
13+
__builtin_memcpy(&s, &f, sizeof(unsigned short));
14+
return table_f32_f16[s];
15+
}
16+
17+
typedef struct {
18+
ggml_fp16_t d;
19+
ggml_fp16_t m;
20+
unsigned char qh[4];
21+
unsigned char qs[32 / 2];
22+
} block_q5_1;
23+
24+
typedef struct {
25+
float d;
26+
float s;
27+
char qs[32];
28+
} block_q8_1;
29+
30+
void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
31+
const int qk = 32;
32+
const int nb = n / qk;
33+
34+
const block_q5_1 * restrict x = vx;
35+
const block_q8_1 * restrict y = vy;
36+
37+
float sumf = 0.0;
38+
39+
for (int i = 0; i < nb; i++) {
40+
unsigned qh;
41+
__builtin_memcpy(&qh, x[i].qh, sizeof(qh));
42+
43+
int sumi = 0;
44+
45+
if (qh) {
46+
for (int j = 0; j < qk/2; ++j) {
47+
const unsigned char xh_0 = ((qh >> (j + 0)) << 4) & 0x10;
48+
const unsigned char xh_1 = ((qh >> (j + 12)) ) & 0x10;
49+
50+
const int x0 = (x[i].qs[j] & 0xF) | xh_0;
51+
const int x1 = (x[i].qs[j] >> 4) | xh_1;
52+
53+
sumi += (x0 * y[i].qs[j]) + (x1 * y[i].qs[j + qk/2]);
54+
}
55+
}
56+
else {
57+
for (int j = 0; j < qk/2; ++j) {
58+
const int x0 = (x[i].qs[j] & 0xF);
59+
const int x1 = (x[i].qs[j] >> 4);
60+
61+
sumi += (x0 * y[i].qs[j]) + (x1 * y[i].qs[j + qk/2]);
62+
}
63+
}
64+
65+
sumf += (ggml_lookup_fp16_to_fp32(x[i].d)*y[i].d)*sumi + ggml_lookup_fp16_to_fp32(x[i].m)*y[i].s;
66+
}
67+
68+
*s = sumf;
69+
}
70+
71+
/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */

gcc/tree-ssa-loop-ivcanon.cc

Lines changed: 41 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -939,8 +939,7 @@ try_unroll_loop_completely (class loop *loop,
939939
1) It could increase register pressure.
940940
2) Big loop after completely unroll may not be vectorized
941941
by BB vectorizer. */
942-
else if ((cunrolli && !loop->inner
943-
? unr_insns : unr_insns - est_eliminated)
942+
else if ((cunrolli ? unr_insns : unr_insns - est_eliminated)
944943
> (unsigned) param_max_completely_peeled_insns)
945944
{
946945
if (dump_file && (dump_flags & TDF_DETAILS))
@@ -1248,7 +1247,9 @@ try_peel_loop (class loop *loop,
12481247
static bool
12491248
canonicalize_loop_induction_variables (class loop *loop,
12501249
bool create_iv, enum unroll_level ul,
1251-
bool try_eval, bool allow_peel, bool cunrolli)
1250+
bool try_eval, bool allow_peel,
1251+
const_sbitmap innermost,
1252+
bool cunrolli)
12521253
{
12531254
edge exit = NULL;
12541255
tree niter;
@@ -1334,8 +1335,15 @@ canonicalize_loop_induction_variables (class loop *loop,
13341335
modified |= remove_redundant_iv_tests (loop);
13351336

13361337
dump_user_location_t locus = find_loop_location (loop);
1338+
1339+
bool innermost_cunrolli_p
1340+
= cunrolli
1341+
&& (unsigned) loop->num < SBITMAP_SIZE (innermost)
1342+
&& bitmap_bit_p (innermost, loop->num);
1343+
13371344
if (try_unroll_loop_completely (loop, exit, niter, may_be_zero, ul,
1338-
maxiter, locus, allow_peel, cunrolli))
1345+
maxiter, locus, allow_peel,
1346+
innermost_cunrolli_p))
13391347
return true;
13401348

13411349
if (create_iv
@@ -1372,14 +1380,19 @@ canonicalize_induction_variables (void)
13721380
bool changed = false;
13731381
bool irred_invalidated = false;
13741382
bitmap loop_closed_ssa_invalidated = BITMAP_ALLOC (NULL);
1383+
auto_sbitmap innermost (number_of_loops (cfun));
1384+
bitmap_clear (innermost);
13751385

13761386
estimate_numbers_of_iterations (cfun);
13771387

13781388
for (auto loop : loops_list (cfun, LI_FROM_INNERMOST))
13791389
{
1380-
changed |= canonicalize_loop_induction_variables (loop,
1381-
true, UL_SINGLE_ITER,
1382-
true, false, false);
1390+
changed
1391+
|= canonicalize_loop_induction_variables (loop,
1392+
true, UL_SINGLE_ITER,
1393+
true, false,
1394+
(const_sbitmap) innermost,
1395+
false);
13831396
}
13841397
gcc_assert (!need_ssa_update_p (cfun));
13851398

@@ -1413,7 +1426,8 @@ canonicalize_induction_variables (void)
14131426

14141427
static bool
14151428
tree_unroll_loops_completely_1 (bool may_increase_size, bool unroll_outer,
1416-
bitmap father_bbs, class loop *loop, bool cunrolli)
1429+
bitmap father_bbs, class loop *loop,
1430+
const_sbitmap innermost, bool cunrolli)
14171431
{
14181432
class loop *loop_father;
14191433
bool changed = false;
@@ -1431,7 +1445,8 @@ tree_unroll_loops_completely_1 (bool may_increase_size, bool unroll_outer,
14311445
if (!child_father_bbs)
14321446
child_father_bbs = BITMAP_ALLOC (NULL);
14331447
if (tree_unroll_loops_completely_1 (may_increase_size, unroll_outer,
1434-
child_father_bbs, inner, cunrolli))
1448+
child_father_bbs, inner,
1449+
innermost, cunrolli))
14351450
{
14361451
bitmap_ior_into (father_bbs, child_father_bbs);
14371452
bitmap_clear (child_father_bbs);
@@ -1477,7 +1492,8 @@ tree_unroll_loops_completely_1 (bool may_increase_size, bool unroll_outer,
14771492
ul = UL_NO_GROWTH;
14781493

14791494
if (canonicalize_loop_induction_variables
1480-
(loop, false, ul, !flag_tree_loop_ivcanon, unroll_outer, cunrolli))
1495+
(loop, false, ul, !flag_tree_loop_ivcanon, unroll_outer,
1496+
innermost, cunrolli))
14811497
{
14821498
/* If we'll continue unrolling, we need to propagate constants
14831499
within the new basic blocks to fold away induction variable
@@ -1503,19 +1519,28 @@ tree_unroll_loops_completely_1 (bool may_increase_size, bool unroll_outer,
15031519

15041520
/* Unroll LOOPS completely if they iterate just few times. Unless
15051521
MAY_INCREASE_SIZE is true, perform the unrolling only if the
1506-
size of the code does not increase. */
1522+
size of the code does not increase.
1523+
CUNROLLI is true when the pass is cunrolli. */
15071524

15081525
static unsigned int
1509-
tree_unroll_loops_completely (bool may_increase_size, bool unroll_outer)
1526+
tree_unroll_loops_completely (bool may_increase_size, bool unroll_outer, bool cunrolli)
15101527
{
15111528
bitmap father_bbs = BITMAP_ALLOC (NULL);
15121529
bool changed;
15131530
int iteration = 0;
15141531
bool irred_invalidated = false;
1515-
bool cunrolli = true;
1532+
auto_sbitmap innermost (number_of_loops (cfun));
1533+
bitmap_clear (innermost);
15161534

15171535
estimate_numbers_of_iterations (cfun);
15181536

1537+
/* Mark all innermost loops at the beginning. */
1538+
for (auto loop : loops_list (cfun, LI_FROM_INNERMOST))
1539+
{
1540+
if (!loop->inner)
1541+
bitmap_set_bit (innermost, loop->num);
1542+
}
1543+
15191544
do
15201545
{
15211546
changed = false;
@@ -1530,14 +1555,11 @@ tree_unroll_loops_completely (bool may_increase_size, bool unroll_outer)
15301555
changed = tree_unroll_loops_completely_1 (may_increase_size,
15311556
unroll_outer, father_bbs,
15321557
current_loops->tree_root,
1558+
(const_sbitmap) innermost,
15331559
cunrolli);
15341560
if (changed)
15351561
{
15361562
unsigned i;
1537-
/* For the outer loop, considering that the inner loop is completely
1538-
unrolled, it would expose more optimization opportunities, so it's
1539-
better to keep 2/3 reduction of estimated unrolled size. */
1540-
cunrolli = false;
15411563

15421564
unloop_loops (loops_to_unloop, loops_to_unloop_nunroll,
15431565
edges_to_remove, loop_closed_ssa_invalidated,
@@ -1697,7 +1719,7 @@ pass_complete_unroll::execute (function *fun)
16971719
re-peeling the same loop multiple times. */
16981720
if (flag_peel_loops)
16991721
peeled_loops = BITMAP_ALLOC (NULL);
1700-
unsigned int val = tree_unroll_loops_completely (flag_cunroll_grow_size, true);
1722+
unsigned int val = tree_unroll_loops_completely (flag_cunroll_grow_size, true, false);
17011723
if (peeled_loops)
17021724
{
17031725
BITMAP_FREE (peeled_loops);
@@ -1753,7 +1775,7 @@ pass_complete_unrolli::execute (function *fun)
17531775
if (number_of_loops (fun) > 1)
17541776
{
17551777
scev_initialize ();
1756-
ret = tree_unroll_loops_completely (optimize >= 3, false);
1778+
ret = tree_unroll_loops_completely (optimize >= 3, false, true);
17571779
scev_finalize ();
17581780
}
17591781
loop_optimizer_finalize ();

0 commit comments

Comments
 (0)