-
Notifications
You must be signed in to change notification settings - Fork 15k
Closed
Labels
Description
I tried the following code, which takes more than 3s to compile using clang 17.0.6. (The code size is also a bit hard to accept.)
oom_manual2.c
int src();
int f1(unsigned int *b) {
#define SWITCH(in1, out1) \
unsigned int out1;\
switch (in1) { \
case 0: out1 = b[0] >> 0; break; \
case 1: out1 = b[0] >> 1; break; \
case 2: out1 = b[0] >> 2; break; \
case 3: out1 = b[0] >> 3; break; \
case 4: out1 = b[0] >> 4; break; \
case 5: out1 = b[0] >> 5; break; \
case 6: out1 = b[0] >> 6; break; \
case 7: out1 = b[0] >> 7; break; \
case 8: out1 = b[0] >> 8; break; \
case 9: out1 = b[0] >> 9; break; \
case 10: out1 = b[0] >> 10; break; \
case 11: out1 = b[0] >> 11; break; \
case 12: out1 = b[0] >> 12; break; \
case 13: out1 = b[0] >> 13; break; \
case 14: out1 = b[0] >> 14; break; \
case 15: out1 = b[0] >> 15; break; \
case 16: out1 = b[0] >> 16; break; \
case 17: out1 = b[0] >> 17; break; \
case 18: out1 = b[0] >> 18; break; \
case 19: out1 = b[0] >> 19; break; \
case 20: out1 = b[0] >> 20; break; \
case 21: out1 = b[0] >> 21; break; \
case 22: out1 = b[0] >> 22; break; \
case 23: out1 = b[0] >> 23; break; \
case 24: out1 = b[0] >> 24; break; \
case 25: out1 = b[0] >> 25; break; \
case 26: out1 = b[0] >> 26; break; \
case 27: out1 = b[0] >> 27; break; \
case 28: out1 = b[0] >> 28; break; \
case 29: out1 = b[0] >> 29; break; \
case 30: out1 = b[0] >> 30; break; \
case 31: out1 = b[0] >> 31; break; \
case 32: out1 = b[1] >> 0; break; \
case 33: out1 = b[1] >> 1; break; \
case 34: out1 = b[1] >> 2; break; \
case 35: out1 = b[1] >> 3; break; \
case 36: out1 = b[1] >> 4; break; \
case 37: out1 = b[1] >> 5; break; \
case 38: out1 = b[1] >> 6; break; \
case 39: out1 = b[1] >> 7; break; \
case 40: out1 = b[1] >> 8; break; \
case 41: out1 = b[1] >> 9; break; \
case 42: out1 = b[1] >> 10; break; \
case 43: out1 = b[1] >> 11; break; \
case 44: out1 = b[1] >> 12; break; \
case 45: out1 = b[1] >> 13; break; \
case 46: out1 = b[1] >> 14; break; \
case 47: out1 = b[1] >> 15; break; \
case 48: out1 = b[1] >> 16; break; \
case 49: out1 = b[1] >> 17; break; \
case 50: out1 = b[1] >> 18; break; \
case 51: out1 = b[1] >> 19; break; \
case 52: out1 = b[1] >> 20; break; \
case 53: out1 = b[1] >> 21; break; \
case 54: out1 = b[1] >> 22; break; \
case 55: out1 = b[1] >> 23; break; \
case 56: out1 = b[1] >> 24; break; \
case 57: out1 = b[1] >> 25; break; \
case 58: out1 = b[1] >> 26; break; \
case 59: out1 = b[1] >> 27; break; \
case 60: out1 = b[1] >> 28; break; \
case 61: out1 = b[1] >> 29; break; \
case 62: out1 = b[1] >> 30; break; \
case 63: out1 = b[1] >> 31; break; \
case 64: out1 = b[2] >> 0; break; \
case 65: out1 = b[2] >> 1; break; \
case 66: out1 = b[2] >> 2; break; \
case 67: out1 = b[2] >> 3; break; \
case 68: out1 = b[2] >> 4; break; \
case 69: out1 = b[2] >> 5; break; \
case 70: out1 = b[2] >> 6; break; \
case 71: out1 = b[2] >> 7; break; \
case 72: out1 = b[2] >> 8; break; \
case 73: out1 = b[2] >> 9; break; \
case 74: out1 = b[2] >> 10; break; \
case 75: out1 = b[2] >> 11; break; \
case 76: out1 = b[2] >> 12; break; \
case 77: out1 = b[2] >> 13; break; \
case 78: out1 = b[2] >> 14; break; \
case 79: out1 = b[2] >> 15; break; \
case 80: out1 = b[2] >> 16; break; \
case 81: out1 = b[2] >> 17; break; \
case 82: out1 = b[2] >> 18; break; \
case 83: out1 = b[2] >> 19; break; \
case 84: out1 = b[2] >> 20; break; \
case 85: out1 = b[2] >> 21; break; \
case 86: out1 = b[2] >> 22; break; \
case 87: out1 = b[2] >> 23; break; \
case 88: out1 = b[2] >> 24; break; \
case 89: out1 = b[2] >> 25; break; \
case 90: out1 = b[2] >> 26; break; \
case 91: out1 = b[2] >> 27; break; \
case 92: out1 = b[2] >> 28; break; \
case 93: out1 = b[2] >> 29; break; \
case 94: out1 = b[2] >> 30; break; \
case 95: out1 = b[2] >> 31; break; \
case 96: out1 = b[2] >> 0; break; \
case 97: out1 = b[2] >> 1; break; \
case 98: out1 = b[2] >> 2; break; \
case 99: out1 = b[2] >> 3; break; \
case 100: out1 = b[2] >> 4; break; \
case 101: out1 = b[2] >> 5; break; \
case 102: out1 = b[2] >> 6; break; \
case 103: out1 = b[2] >> 7; break; \
case 104: out1 = b[2] >> 8; break; \
case 105: out1 = b[2] >> 9; break; \
case 106: out1 = b[2] >> 10; break; \
case 107: out1 = b[2] >> 11; break; \
case 108: out1 = b[2] >> 12; break; \
case 109: out1 = b[2] >> 13; break; \
case 110: out1 = b[2] >> 14; break; \
case 111: out1 = b[2] >> 15; break; \
case 112: out1 = b[2] >> 16; break; \
case 113: out1 = b[2] >> 17; break; \
case 114: out1 = b[2] >> 18; break; \
case 115: out1 = b[2] >> 19; break; \
case 116: out1 = b[2] >> 20; break; \
case 117: out1 = b[2] >> 21; break; \
case 118: out1 = b[2] >> 22; break; \
case 119: out1 = b[2] >> 23; break; \
case 120: out1 = b[2] >> 24; break; \
case 121: out1 = b[2] >> 25; break; \
case 122: out1 = b[2] >> 26; break; \
case 123: out1 = b[2] >> 27; break; \
case 124: out1 = b[2] >> 28; break; \
case 125: out1 = b[2] >> 29; break; \
case 126: out1 = b[2] >> 30; break; \
case 127: out1 = b[2] >> 31; break; \
default: __builtin_unreachable(); break; \
}
unsigned int r = src();
SWITCH(r >> 1 & 0x7fU, v1);
SWITCH(r >> 27 | (r << 5 & 0x60U), v2);
SWITCH(r >> 2 & 0x7fu, v3);
SWITCH(r >> 9 & 0x7fu, v4);
SWITCH(r >> 16 & 0x7fu, v5);
SWITCH(r >> 23 & 0x7fu, v6);
SWITCH(r >> 30 | (r << 2 & 0x7fu), v7);
SWITCH(r >> 7 & 0x7fu, v8);
SWITCH(r >> 12 & 0x7fu, v9);
SWITCH(r << 4 & 0x7fu, v10);
SWITCH(r >> 10 & 0x7fu, v11);
SWITCH(r >> 24 & 0x7fu, v12);
SWITCH(r >> 31 | (r << 1 & 0x7eu), v13);
SWITCH(r >> 6 & 0x7fu, v14);
SWITCH(r >> 13 & 0x7fu, v15);
SWITCH(r >> 20 & 0x7fu, v16);
SWITCH(r >> 27 | (r << 5 & 0x60u), v17);
SWITCH(r >> 3 & 0x7fu, v18);
SWITCH(r >> 9 & 0x7fu, v19);
SWITCH(r >> 16 & 0x7fu, v20);
SWITCH(r >> 23 & 0x7fu, v21);
SWITCH(r >> 30 | (r << 2 & 0x7cu), v22);
SWITCH(r >> 7 & 0x7fu, v23);
SWITCH(r >> 12 & 0x7fu, v24);
SWITCH(r >> 14 & 0x7fu, v25);
SWITCH(r >> 15 & 0x7fu, v26);
return (v1 | v2 | v3 | v4 | v5 | v6 | v7 | v8 | v9 | v10 | v11 | v12 | v13 |
v14 | v15 | v16 | v17 | v18 | v19 | v20 | v21 | v22 | v23 | v24 | v25 | v26);
}If I change the 127 branch to the default branch, the time returns to normal.
diff --git a/oom_manual2.c b/oom_manual2.c
index aab82cf..7da3f79 100644
--- a/oom_manual2.c
+++ b/oom_manual2.c
@@ -130,8 +130,7 @@ int f1(unsigned int *b) {
case 124: out1 = b[2] >> 28; break; \
case 125: out1 = b[2] >> 29; break; \
case 126: out1 = b[2] >> 30; break; \
- case 127: out1 = b[2] >> 31; break; \
- default: __builtin_unreachable(); break; \
+ default: out1 = b[2] >> 31; break; \
}
unsigned int r = src();
SWITCH(r >> 1 & 0x7fU, v1);The complete command I used is as follows:
$ time clang -O1 oom_manual.c main.c -o oom_manual
clang -O1 oom_manual.c main.c -o oom_manual 0.31s user 0.04s system 100% cpu 0.347 total
$ time clang -O1 oom_manual2.c main.c -o oom_manual2
clang -O1 oom_manual2.c main.c -o oom_manual2 3.72s user 0.24s system 100% cpu 3.962 total
$ ls -l
total 272
-rwxr-xr-x 1 dianqk users 56K Jan 21 16:22 oom_manual
-rwxr-xr-x 1 dianqk users 192K Jan 21 16:22 oom_manual2main.c
int src(void) {
return 0;
}
int main(int argc, char **argv) {
extern int f1(unsigned int *b, unsigned int r);
f1(0, 0);
return 0;
}I've found that the main time-consumption is Early Tail Duplication.
$ time clang -O1 -c oom_manual2.c -emit-llvm -o oom_manual2.bc
clang -O1 -c oom_manual2.c -emit-llvm -o oom_manual2.bc 0.07s user 0.01s system 99% cpu 0.086 total
$ time llc -O1 oom_manual2.bc -filetype=null
llc -O1 oom_manual2.bc -filetype=null 3.65s user 0.22s system 99% cpu 3.868 total
$ llc -O1 oom_manual2.bc -filetype=null -time-trace
$ cat out.time-trace | jq -r '.traceEvents[] | select(.name == "RunPass" or .name == "Total RunPass") | "\(.dur) \(.name) \(.args.detail)"' | sed 's/"//g' | sort -nr | head -n5
3800384 Total RunPass null
1221923 RunPass Early Tail Duplication
1100348 RunPass Eliminate PHI nodes for register allocation
473867 RunPass Register Coalescer
120407 RunPass Live Variable AnalysisBy adding the new TimeTraceScope, I can see that TailDuplicator::tailDuplicateAndUpdate and TailDuplicator::shouldTailDuplicate take up most of the time.
TimeTraceScope
diff --git a/llvm/lib/CodeGen/TailDuplicator.cpp b/llvm/lib/CodeGen/TailDuplicator.cpp
index 5ed67bd0a121..80401e75c85d 100644
--- a/llvm/lib/CodeGen/TailDuplicator.cpp
+++ b/llvm/lib/CodeGen/TailDuplicator.cpp
@@ -158,6 +158,7 @@ bool TailDuplicator::tailDuplicateAndUpdate(
SmallVectorImpl<MachineBasicBlock*> *DuplicatedPreds,
function_ref<void(MachineBasicBlock *)> *RemovalCallback,
SmallVectorImpl<MachineBasicBlock *> *CandidatePtr) {
+ llvm::TimeTraceScope TimeScope("TailDuplicator::tailDuplicateAndUpdate");
// Save the successors list.
SmallSetVector<MachineBasicBlock *, 8> Succs(MBB->succ_begin(),
MBB->succ_end());
@@ -555,6 +556,7 @@ void TailDuplicator::updateSuccessorsPHIs(
/// Determine if it is profitable to duplicate this block.
bool TailDuplicator::shouldTailDuplicate(bool IsSimple,
MachineBasicBlock &TailBB) {
+ llvm::TimeTraceScope TimeScope("TailDuplicator::shouldTailDuplicate");
// When doing tail-duplication during layout, the block ordering is in flux,
// so canFallThrough returns a result based on incorrect information and
// should just be ignored.The original question is as follows (I've abbreviated it a bit)
int src();
int f1(unsigned int *b) {
#define SWITCH(in1, out1) \
unsigned int out1;\
switch (in1) { \
case 0: out1 = b[0] >> 0; break; \
...
case 126: out1 = b[2] >> 30; break; \
default: out1 = b[2] >> 31; break; \
}
unsigned int r = src();
SWITCH(r >> 1 & 0x7fU, v1);
...
SWITCH(r >> 15 & 0x7fu, v26);
return (v1 | v2 | v3 | v4 | v5 | v6 | v7 | v8 | v9 | v10 | v11 | v12 | v13 |
v14 | v15 | v16 | v17 | v18 | v19 | v20 | v21 | v22 | v23 | v24 | v25 | v26);
}
OLD=`/usr/bin/time -f '%M' clang_before -cc1 -O1 -emit-obj oom_manual.cc -o /dev/null 2>&1`
NEW=`/usr/bin/time -f '%M' clang_after -cc1 -O1 -emit-obj oom_manual.cc -o /dev/null 2>&1`
echo $OLD
echo $NEW
expr $OLD "*" 10 "<" $NEW > /dev/null
$ ./test_manual.sh
119816
1248148
$ echo $?
0
Originally posted by @dwblaikie in #76669 (comment)