From 8c3e6816149227bd78291b4d5476fd5c3ef51527 Mon Sep 17 00:00:00 2001 From: Julien Villette Date: Wed, 5 Feb 2025 12:07:56 +0100 Subject: [PATCH 1/3] [mca] New option -scheduling-info Outputs micro ops, latency, bypass latency, throughput, llvm opcode name, used resources and parsed assembly instruction with comments. This option is used to compare scheduling info with micro architecture documents. Reference scheduling information (from Architecture and micro architecture) is in the comment section after each instruction (// or /* */). This information may be generated from an Architecture Description Language (ADL). This way, it is easy to compare information from llvm and from documentation/ADL. The LLVM opcode name helps to find the right instruction regexp to fix in the Target Scheduling Info specification. Example: Input: abs D20, D11 // ABS , \\ ASIMD arith, basic \\ 1 2 2 4.0 V1UnitV Output: 1 | 2 | 2 | 4.00 | V1UnitV | ABSv1i64 | abs d20, d11 // ABS , \\ ASIMD arith, basic \\ 1 2 2 4.0 V1UnitV --- llvm/docs/CommandGuide/llvm-mca.rst | 14 + llvm/include/llvm/MC/MCSchedule.h | 4 + llvm/lib/MC/MCSchedule.cpp | 37 + .../AArch64/Neoverse/V1-scheduling-info.s | 7588 +++++++++++++++++ llvm/tools/llvm-mca/CMakeLists.txt | 1 + .../llvm-mca/Views/InstructionInfoView.h | 1 + .../llvm-mca/Views/SchedulingInfoView.cpp | 212 + .../tools/llvm-mca/Views/SchedulingInfoView.h | 97 + llvm/tools/llvm-mca/llvm-mca.cpp | 42 +- 9 files changed, 7985 insertions(+), 11 deletions(-) create mode 100644 llvm/test/tools/llvm-mca/AArch64/Neoverse/V1-scheduling-info.s create mode 100644 llvm/tools/llvm-mca/Views/SchedulingInfoView.cpp create mode 100644 llvm/tools/llvm-mca/Views/SchedulingInfoView.h diff --git a/llvm/docs/CommandGuide/llvm-mca.rst b/llvm/docs/CommandGuide/llvm-mca.rst index f610ea2f21682..5c945907785ac 100644 --- a/llvm/docs/CommandGuide/llvm-mca.rst +++ b/llvm/docs/CommandGuide/llvm-mca.rst @@ -170,6 +170,20 @@ option specifies "``-``", then the output will also be sent to standard output. 
Enable extra scheduler statistics. This view collects and analyzes instruction issue events. This view is disabled by default. +.. option:: -scheduling-info + + Enable scheduling info view. This view reports scheduling information defined + in the LLVM target description in the form: + uOps | Latency | Bypass Latency | Throughput | LLVM OpcodeName | Resources + units | assembly instruction and its comment (// or /* */) if defined. + It allows comparing scheduling info with architecture documents and fixing it + in the target description by fixing InstrRW for the reported LLVM opcode. + Scheduling information can be defined in the same order in each instruction + comment to easily check reported against reference scheduling information. + Suggested information in comment: + ``// \\ \\ + , , , , `` + .. option:: -retire-stats Enable extra retire control unit statistics. This view is disabled by default. diff --git a/llvm/include/llvm/MC/MCSchedule.h b/llvm/include/llvm/MC/MCSchedule.h index fe731d086f70a..4e72f633596a3 100644 --- a/llvm/include/llvm/MC/MCSchedule.h +++ b/llvm/include/llvm/MC/MCSchedule.h @@ -402,6 +402,10 @@ struct MCSchedModel { static unsigned getForwardingDelayCycles(ArrayRef Entries, unsigned WriteResourceIdx = 0); + /// Returns the maximum forwarding delay for maximum write latency. + static unsigned getForwardingDelayCycles(const MCSubtargetInfo &STI, + const MCSchedClassDesc &SCDesc); + /// Returns the default initialized model. 
static const MCSchedModel Default; }; diff --git a/llvm/lib/MC/MCSchedule.cpp b/llvm/lib/MC/MCSchedule.cpp index ed243cecabb76..36147f1fa9983 100644 --- a/llvm/lib/MC/MCSchedule.cpp +++ b/llvm/lib/MC/MCSchedule.cpp @@ -174,3 +174,40 @@ MCSchedModel::getForwardingDelayCycles(ArrayRef Entries, return std::abs(DelayCycles); }
+
+/// Returns the read-advance (bypass) cycles recorded for the write resource
+/// that has the largest write latency in \p SCDesc, or 0 when the scheduling
+/// class has no read-advance entries, reports an invalid (negative) write
+/// latency, or has no read-advance entry for that write resource.
+unsigned
+MCSchedModel::getForwardingDelayCycles(const MCSubtargetInfo &STI,
+                                       const MCSchedClassDesc &SCDesc) {
+  ArrayRef<MCReadAdvanceEntry> Entries = STI.getReadAdvanceEntries(SCDesc);
+  if (Entries.empty())
+    return 0;
+
+  // Pick the write with the largest latency (the first one wins on ties).
+  unsigned MaxLatency = 0;
+  unsigned WriteResourceID = 0;
+  for (unsigned DefIdx = 0, DefEnd = SCDesc.NumWriteLatencyEntries;
+       DefIdx != DefEnd; ++DefIdx) {
+    const MCWriteLatencyEntry *WLEntry =
+        STI.getWriteLatencyEntry(&SCDesc, DefIdx);
+    // An invalid (negative) latency means no bypass can be reported.
+    if (WLEntry->Cycles < 0)
+      return 0;
+    const unsigned Cycles = static_cast<unsigned>(WLEntry->Cycles);
+    if (Cycles > MaxLatency) {
+      MaxLatency = Cycles;
+      WriteResourceID = WLEntry->WriteResourceID;
+    }
+  }
+
+  for (const MCReadAdvanceEntry &E : Entries)
+    if (E.WriteResourceID == WriteResourceID)
+      return E.Cycles;
+
+  // Report no bypass instead of aborting when no read-advance entry targets
+  // the selected write resource.
+  return 0;
+}
diff --git a/llvm/test/tools/llvm-mca/AArch64/Neoverse/V1-scheduling-info.s b/llvm/test/tools/llvm-mca/AArch64/Neoverse/V1-scheduling-info.s new file mode 100644 index 0000000000000..c421166f22ea4 --- /dev/null +++ b/llvm/test/tools/llvm-mca/AArch64/Neoverse/V1-scheduling-info.s @@ -0,0 +1,7588 @@ +# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py +# RUN: llvm-mca -mtriple=aarch64 -mcpu=neoverse-v1 -scheduling-info < %s | FileCheck %s + + .text + .file "V1-scheduling-info.s" + .globl test + .p2align 4 + .type test,@function +test: + .cfi_startproc + abs D15, D11 /* ABS , \\ ASIMD arith, basic \\ 1 2
2 4.0 V1UnitV */ + abs V25.2S, V25.2S // ABS ., . \\ ASIMD arith, basic \\ 1 2 2 4.0 V1UnitV + abs Z26.B, P6/M, Z27.B // ABS ., /M, . \\ Arithmetic, basic \\ 1 2 2 2.0 V1UnitV01 + adc W13, W6, W4 // ADC , , \\ ALU, basic \\ 1 1 1 4.0 V1UnitI + adc X8, X12, X10 // ADC , , \\ ALU, basic \\ 1 1 1 4.0 V1UnitI + adcs W29, W7, W30 // ADCS , , \\ ALU, basic, flagset \\ 1 1 1 3.00 V1UnitI,V1UnitFlg + adcs X11, X3, X5 // ADCS , , \\ ALU, basic, flagset \\ 1 1 1 3.00 V1UnitI,V1UnitFlg + add WSP, WSP, W10 // ADD , , \\ ALU, basic, unconditional, no flagset \\ 1 2 2 2.00 V1UnitI + add WSP, WSP, W2, UXTB // ADD , , , \\ ALU, basic, unconditional, no flagset \\ 1 2 2 2.00 V1UnitI + add WSP, WSP, W13, UXTH #4 // ADD , , , # \\ ALU, basic, unconditional, no flagset \\ 1 2 2 2.00 V1UnitI + add WSP, WSP, W13, LSL #4 // ADD , , , LSL # \\ Arithmetic, LSL shift, shift <= 4 \\ 1 2 2 2.00 V1UnitI + add X22, X2, X27 // ADD , , X \\ ALU, basic \\ 1 1 1 4.0 V1UnitI + add X25, X9, W25, UXTB // ADD , , , \\ ALU, basic \\ 1 2 2 2.00 V1UnitI + add X4, X28, W3, UXTB #3 // ADD , , , # \\ ALU, extend and shift \\ 1 2 2 2.0 V1UnitM + add X0, X28, X26, LSL #3 // ADD , , X, LSL # \\ Arithmetic, LSL shift, shift <= 4 \\ 1 1 1 4.0 V1UnitI + add WSP, WSP, #3765 // ADD , , # \\ ALU, basic \\ 1 1 1 4.0 V1UnitI + add WSP, WSP, #3547, LSL #12 // ADD , , #, \\ ALU, basic \\ 1 1 1 4.0 V1UnitI + add X7, X30, #803 // ADD , , # \\ ALU, basic \\ 1 1 1 4.0 V1UnitI + add X7, X2, #319, LSL #12 // ADD , , #, \\ ALU, basic \\ 1 1 1 4.0 V1UnitI + add Z13.D, Z13.D, #245 // ADD ., ., # \\ Arithmetic, basic \\ 1 2 2 2.0 V1UnitV01 + add Z16.D, Z16.D, #233, LSL #8 // ADD ., ., #, \\ Arithmetic, basic \\ 1 2 2 2.0 V1UnitV01 + add W3, W2, W21, LSL #3 // ADD , , , LSL # \\ Arithmetic, LSL shift by immed, shift <= 4, unconditional, no flagset \\ 1 1 1 4.0 V1UnitI + add W6, W21, W17, LSL #15 // ADD , , , LSL # \\ Arithmetic, LSR/ASR/ROR shift by immed or LSL shift by immed > 4, unconditional \\ 1 2 2 2.0 V1UnitM + add W28, W30, 
W19, ASR #30 // ADD , , , # \\ Arithmetic, LSR/ASR/ROR shift by immed or LSL shift by immed > 4, unconditional \\ 1 2 2 2.0 V1UnitM + add X8, X3, X28, LSL #3 // ADD , , , LSL # \\ Arithmetic, LSL shift, shift <= 4 \\ 1 1 1 4.0 V1UnitI + add X12, X13, X0, LSL #44 // ADD , , , LSL # \\ Arithmetic, LSR/ASR/ROR shift or LSL shift > 4 \\ 1 2 2 2.0 V1UnitM + add X5, X20, X28, LSR #16 // ADD , , , # \\ Arithmetic, LSR/ASR/ROR shift or LSL shift > 4 \\ 1 2 2 2.0 V1UnitM + add D0, D23, D21 // ADD , , \\ ASIMD arith, basic \\ 1 2 2 4.0 V1UnitV + add V19.4S, V24.4S, V15.4S // ADD ., ., . \\ ASIMD arith, basic \\ 1 2 2 4.0 V1UnitV + add Z29.D, P5/M, Z29.D, Z29.D // ADD ., /M, ., . \\ Arithmetic, basic \\ 1 2 2 2.0 V1UnitV01 + add Z10.H, Z22.H, Z13.H // ADD ., ., . \\ Arithmetic, basic \\ 1 2 2 2.0 V1UnitV01 + addhn V26.4H, V5.4S, V9.4S // ADDHN ., ., . \\ ASIMD arith, complex \\ 1 2 2 4.0 V1UnitV + addhn2 V1.16B, V19.8H, V6.8H // ADDHN2 ., ., . \\ ASIMD arith, complex \\ 1 2 2 4.0 V1UnitV + addp D1, V14.2D // ADDP , . \\ ASIMD arith, pair-wise \\ 1 2 2 4.0 V1UnitV + addp V7.2S, V1.2S, V2.2S // ADDP ., ., . 
\\ ASIMD arith, pair-wise \\ 1 2 2 4.0 V1UnitV + addpl X27, X6, #-6 // ADDPL , , # \\ Predicate counting scalar \\ 1 2 2 1.0 V1UnitM0 + adds W17, WSP, W25 // ADDS , , \\ ALU, basic, unconditional, flagset \\ 1 2 2 2.00 V1UnitI,V1UnitFlg + adds W6, WSP, W15, UXTH // ADDS , , , \\ ALU, basic, unconditional, flagset \\ 1 2 2 2.00 V1UnitI,V1UnitFlg + adds W22, WSP, W30, UXTB #2 // ADDS , , , # \\ ALU, basic, unconditional, flagset \\ 1 1 1 3.00 V1UnitI,V1UnitFlg + adds W12, WSP, W29, LSL #4 // ADDS , , , LSL # \\ Arithmetic, LSL shift by immed, shift <= 4, unconditional, flagset \\ 1 2 2 2.00 V1UnitI,V1UnitFlg + adds X14, X0, X10 // ADDS , , X \\ ALU, basic, flagset \\ 1 1 1 3.00 V1UnitI,V1UnitFlg + adds X13, X23, W8, UXTB // ADDS , , , \\ ALU, basic, flagset \\ 1 1 1 3.00 V1UnitI,V1UnitFlg + adds X4, X26, W28, UXTB #1 // ADDS , , , # \\ ALU, flagset, extend and shift \\ 1 1 1 3.00 V1UnitFlg, V1UnitI + adds X10, X3, X29, LSL #2 // ADDS , , X, LSL # \\ Arithmetic, flagset, LSL shift, shift <= 4 \\ 1 1 1 3.00 V1UnitI,V1UnitFlg + adds W23, WSP, #502 // ADDS , , # \\ ALU, basic, unconditional, flagset \\ 1 1 1 3.00 V1UnitI,V1UnitFlg + adds W2, WSP, #2980, LSL #12 // ADDS , , #, \\ Arithmetic, flagset, LSR/ASR/ROR shift by immed or LSL shift by immed > 4, unconditional \\ 1 1 1 3.00 V1UnitFlg, V1UnitI + adds X12, X4, #1345 // ADDS , , # \\ ALU, basic, flagset \\ 1 1 1 3.00 V1UnitI,V1UnitFlg + adds X25, X18, #3037, LSL #12 // ADDS , , #, \\ Arithmetic, flagset, LSR/ASR/ROR shift or LSL shift > 4 \\ 1 1 1 3.00 V1UnitFlg, V1UnitI + adds W12, W13, W26 // ADDS , , \\ ALU, basic, unconditional, flagset \\ 1 1 1 3.00 V1UnitI,V1UnitFlg + adds W0, W23, W20, LSL #0 // ADDS , , , LSL # \\ Arithmetic, LSL shift by immed, shift <= 4, unconditional, flagset \\ 1 1 1 3.00 V1UnitI,V1UnitFlg + adds W13, W16, W12, LSL #28 // ADDS , , , LSL # \\ Arithmetic, flagset, LSR/ASR/ROR shift by immed or LSL shift by immed > 4, unconditional \\ 1 2 2 2.00 V1UnitM,V1UnitFlg + adds W20, W19, W16, ASR #0 
// ADDS , , , # \\ Arithmetic, flagset, LSR/ASR/ROR shift by immed or LSL shift by immed > 4, unconditional \\ 1 2 2 2.00 V1UnitM,V1UnitFlg + adds X23, X12, X4 // ADDS , , \\ ALU, basic, flagset \\ 1 1 1 3.00 V1UnitI,V1UnitFlg + adds X0, X13, X4, LSL #2 // ADDS , , , LSL # \\ Arithmetic, flagset, LSL shift, shift <= 4 \\ 1 1 1 3.00 V1UnitI,V1UnitFlg + adds X4, X7, X6, LSL #31 // ADDS , , , LSL # \\ Arithmetic, flagset, LSR/ASR/ROR shift or LSL shift > 4 \\ 1 2 2 2.00 V1UnitM,V1UnitFlg + adds X9, X8, X9, ASR #41 // ADDS , , , # \\ Arithmetic, flagset, LSR/ASR/ROR shift or LSL shift > 4 \\ 1 2 2 2.00 V1UnitM,V1UnitFlg + addv B0, V28.8B // ADDV B, .8B \\ ASIMD arith, reduce, 8B/8H \\ 2 4 4 2.00 V1UnitV13 + addv B1, V26.16B // ADDV B, .16B \\ ASIMD arith, reduce, 16B \\ 2 4 4 1.00 V1UnitV13[2] + addv H18, V13.4H // ADDV H, .4H \\ ASIMD arith, reduce, 4H/4S \\ 1 2 2 2.0 V1UnitV13 + addv H29, V17.8H // ADDV H, .8H \\ ASIMD arith, reduce, 8B/8H \\ 2 4 4 2.00 V1UnitV13 + addv S22, V18.4S // ADDV S, .4S \\ ASIMD arith, reduce, 4H/4S \\ 1 2 2 2.0 V1UnitV13 + addvl X1, X27, #-8 // ADDVL , , # \\ Predicate counting scalar \\ 1 2 2 1.0 V1UnitM0 + adr X3, test // ADR ,