From ff30fa7ab05b7dacb94620a480f37e32d8efe5b3 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim
Date: Mon, 6 May 2024 23:12:30 +0100
Subject: [PATCH] [X86] Enable TuningSlowDivide64 on
 Barcelona/Bobcat/Bulldozer/Ryzen Families

Despite most AMD CPUs having a lower latency for i64 divisions that
converge early, we are still better off testing for values representable
as i32 and performing an i32 division if possible.

All AMD CPUs appear to have been missed when we added the
"idivq-to-divl" attribute - this now matches the behaviour of most Intel
CPUs (and the x86-64/v2/v3/v4 levels).

Unfortunately, the difference in code scheduling means I've had to stop
using the update_llc_test_checks script and just use old-fashioned
CHECK-DAG checks for the divl/divq pairs.

Fixes #90985
---
 llvm/lib/Target/X86/X86.td                       |  5 +
 .../CodeGen/X86/bypass-slow-division-64.ll       | 95 +++++++++----------
 2 files changed, 52 insertions(+), 48 deletions(-)

diff --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td
index 25ab08187cf15f..9f5b58d78fcce7 100644
--- a/llvm/lib/Target/X86/X86.td
+++ b/llvm/lib/Target/X86/X86.td
@@ -1350,6 +1350,7 @@ def ProcessorFeatures {
                                               FeatureCMOV,
                                               FeatureX86_64];
   list<SubtargetFeature> BarcelonaTuning = [TuningFastScalarShiftMasks,
+                                            TuningSlowDivide64,
                                             TuningSlowSHLD,
                                             TuningSBBDepBreaking,
                                             TuningInsertVZEROUPPER];
@@ -1372,6 +1373,7 @@ def ProcessorFeatures {
   list<SubtargetFeature> BtVer1Tuning = [TuningFast15ByteNOP,
                                          TuningFastScalarShiftMasks,
                                          TuningFastVectorShiftMasks,
+                                         TuningSlowDivide64,
                                          TuningSlowSHLD,
                                          TuningFastImm16,
                                          TuningSBBDepBreaking,
@@ -1396,6 +1398,7 @@ def ProcessorFeatures {
                                                    TuningFastMOVBE,
                                                    TuningFastImm16,
                                                    TuningSBBDepBreaking,
+                                                   TuningSlowDivide64,
                                                    TuningSlowSHLD];
   list<SubtargetFeature> BtVer2Features =
     !listconcat(BtVer1Features, BtVer2AdditionalFeatures);
@@ -1420,6 +1423,7 @@ def ProcessorFeatures {
                                           FeatureLWP,
                                           FeatureLAHFSAHF64];
   list<SubtargetFeature> BdVer1Tuning = [TuningSlowSHLD,
+                                         TuningSlowDivide64,
                                          TuningFast11ByteNOP,
                                          TuningFastScalarShiftMasks,
                                          TuningBranchFusion,
@@ -1500,6 +1504,7 @@ def ProcessorFeatures {
                                      TuningFastVariablePerLaneShuffle,
                                      TuningFastMOVBE,
                                      TuningFastImm16,
+                                     TuningSlowDivide64,
                                      TuningSlowSHLD,
                                      TuningSBBDepBreaking,
                                      TuningInsertVZEROUPPER,
diff --git a/llvm/test/CodeGen/X86/bypass-slow-division-64.ll b/llvm/test/CodeGen/X86/bypass-slow-division-64.ll
index 66d7082d9b7c55..6e0cfdd26a7866 100644
--- a/llvm/test/CodeGen/X86/bypass-slow-division-64.ll
+++ b/llvm/test/CodeGen/X86/bypass-slow-division-64.ll
@@ -1,4 +1,3 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; Check that 64-bit division is bypassed correctly.
 ; RUN: llc < %s -mtriple=x86_64-- -mattr=-idivq-to-divl | FileCheck %s --check-prefixes=CHECK,FAST-DIVQ
 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+idivq-to-divl | FileCheck %s --check-prefixes=CHECK,SLOW-DIVQ
@@ -13,17 +12,17 @@
 ; RUN: llc < %s -mtriple=x86_64-- -mcpu=skylake | FileCheck %s --check-prefixes=CHECK,SLOW-DIVQ
 ; RUN: llc < %s -mtriple=x86_64-- -mcpu=alderlake | FileCheck %s --check-prefixes=CHECK,SLOW-DIVQ
 ; AMD
-; RUN: llc < %s -mtriple=x86_64-- -mcpu=barcelona | FileCheck %s --check-prefixes=CHECK,FAST-DIVQ
-; RUN: llc < %s -mtriple=x86_64-- -mcpu=btver1 | FileCheck %s --check-prefixes=CHECK,FAST-DIVQ
-; RUN: llc < %s -mtriple=x86_64-- -mcpu=btver2 | FileCheck %s --check-prefixes=CHECK,FAST-DIVQ
-; RUN: llc < %s -mtriple=x86_64-- -mcpu=bdver1 | FileCheck %s --check-prefixes=CHECK,FAST-DIVQ
-; RUN: llc < %s -mtriple=x86_64-- -mcpu=bdver2 | FileCheck %s --check-prefixes=CHECK,FAST-DIVQ
-; RUN: llc < %s -mtriple=x86_64-- -mcpu=bdver3 | FileCheck %s --check-prefixes=CHECK,FAST-DIVQ
-; RUN: llc < %s -mtriple=x86_64-- -mcpu=bdver4 | FileCheck %s --check-prefixes=CHECK,FAST-DIVQ
-; RUN: llc < %s -mtriple=x86_64-- -mcpu=znver1 | FileCheck %s --check-prefixes=CHECK,FAST-DIVQ
-; RUN: llc < %s -mtriple=x86_64-- -mcpu=znver2 | FileCheck %s --check-prefixes=CHECK,FAST-DIVQ
-; RUN: llc < %s -mtriple=x86_64-- -mcpu=znver3 | FileCheck %s --check-prefixes=CHECK,FAST-DIVQ
-; RUN: llc < %s -mtriple=x86_64-- -mcpu=znver4 | FileCheck %s --check-prefixes=CHECK,FAST-DIVQ
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=barcelona | FileCheck %s --check-prefixes=CHECK,SLOW-DIVQ
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=btver1 | FileCheck %s --check-prefixes=CHECK,SLOW-DIVQ
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=btver2 | FileCheck %s --check-prefixes=CHECK,SLOW-DIVQ
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=bdver1 | FileCheck %s --check-prefixes=CHECK,SLOW-DIVQ
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=bdver2 | FileCheck %s --check-prefixes=CHECK,SLOW-DIVQ
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=bdver3 | FileCheck %s --check-prefixes=CHECK,SLOW-DIVQ
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=bdver4 | FileCheck %s --check-prefixes=CHECK,SLOW-DIVQ
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=znver1 | FileCheck %s --check-prefixes=CHECK,SLOW-DIVQ
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=znver2 | FileCheck %s --check-prefixes=CHECK,SLOW-DIVQ
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=znver3 | FileCheck %s --check-prefixes=CHECK,SLOW-DIVQ
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=znver4 | FileCheck %s --check-prefixes=CHECK,SLOW-DIVQ
 
 ; Additional tests for 64-bit divide bypass
 
@@ -41,18 +40,18 @@ define i64 @sdiv_quotient(i64 %a, i64 %b) nounwind {
 ;
 ; SLOW-DIVQ-LABEL: sdiv_quotient:
 ; SLOW-DIVQ:       # %bb.0:
-; SLOW-DIVQ-NEXT:    movq %rdi, %rax
-; SLOW-DIVQ-NEXT:    movq %rdi, %rcx
-; SLOW-DIVQ-NEXT:    orq %rsi, %rcx
-; SLOW-DIVQ-NEXT:    shrq $32, %rcx
+; SLOW-DIVQ-DAG:     movq %rdi, %rax
+; SLOW-DIVQ-DAG:     movq %rdi, %rcx
+; SLOW-DIVQ-DAG:     orq %rsi, %rcx
+; SLOW-DIVQ-DAG:     shrq $32, %rcx
 ; SLOW-DIVQ-NEXT:    je .LBB0_1
 ; SLOW-DIVQ-NEXT:  # %bb.2:
 ; SLOW-DIVQ-NEXT:    cqto
 ; SLOW-DIVQ-NEXT:    idivq %rsi
 ; SLOW-DIVQ-NEXT:    retq
 ; SLOW-DIVQ-NEXT:  .LBB0_1:
-; SLOW-DIVQ-NEXT:    # kill: def $eax killed $eax killed $rax
-; SLOW-DIVQ-NEXT:    xorl %edx, %edx
+; SLOW-DIVQ-DAG:     # kill: def $eax killed $eax killed $rax
+; SLOW-DIVQ-DAG:     xorl %edx, %edx
 ; SLOW-DIVQ-NEXT:    divl %esi
 ; SLOW-DIVQ-NEXT:    # kill: def $eax killed $eax def $rax
 ; SLOW-DIVQ-NEXT:    retq
@@ -93,10 +92,10 @@ define i64 @sdiv_remainder(i64 %a, i64 %b) nounwind {
 ;
 ; SLOW-DIVQ-LABEL: sdiv_remainder:
 ; SLOW-DIVQ:       # %bb.0:
-; SLOW-DIVQ-NEXT:    movq %rdi, %rax
-; SLOW-DIVQ-NEXT:    movq %rdi, %rcx
-; SLOW-DIVQ-NEXT:    orq %rsi, %rcx
-; SLOW-DIVQ-NEXT:    shrq $32, %rcx
+; SLOW-DIVQ-DAG:     movq %rdi, %rax
+; SLOW-DIVQ-DAG:     movq %rdi, %rcx
+; SLOW-DIVQ-DAG:     orq %rsi, %rcx
+; SLOW-DIVQ-DAG:     shrq $32, %rcx
 ; SLOW-DIVQ-NEXT:    je .LBB3_1
 ; SLOW-DIVQ-NEXT:  # %bb.2:
 ; SLOW-DIVQ-NEXT:    cqto
@@ -104,8 +103,8 @@ define i64 @sdiv_remainder(i64 %a, i64 %b) nounwind {
 ; SLOW-DIVQ-NEXT:    movq %rdx, %rax
 ; SLOW-DIVQ-NEXT:    retq
 ; SLOW-DIVQ-NEXT:  .LBB3_1:
-; SLOW-DIVQ-NEXT:    # kill: def $eax killed $eax killed $rax
-; SLOW-DIVQ-NEXT:    xorl %edx, %edx
+; SLOW-DIVQ-DAG:     # kill: def $eax killed $eax killed $rax
+; SLOW-DIVQ-DAG:     xorl %edx, %edx
 ; SLOW-DIVQ-NEXT:    divl %esi
 ; SLOW-DIVQ-NEXT:    movl %edx, %eax
 ; SLOW-DIVQ-NEXT:    retq
@@ -148,10 +147,10 @@ define i64 @sdiv_quotient_and_remainder(i64 %a, i64 %b) nounwind {
 ;
 ; SLOW-DIVQ-LABEL: sdiv_quotient_and_remainder:
 ; SLOW-DIVQ:       # %bb.0:
-; SLOW-DIVQ-NEXT:    movq %rdi, %rax
-; SLOW-DIVQ-NEXT:    movq %rdi, %rcx
-; SLOW-DIVQ-NEXT:    orq %rsi, %rcx
-; SLOW-DIVQ-NEXT:    shrq $32, %rcx
+; SLOW-DIVQ-DAG:     movq %rdi, %rax
+; SLOW-DIVQ-DAG:     movq %rdi, %rcx
+; SLOW-DIVQ-DAG:     orq %rsi, %rcx
+; SLOW-DIVQ-DAG:     shrq $32, %rcx
 ; SLOW-DIVQ-NEXT:    je .LBB6_1
 ; SLOW-DIVQ-NEXT:  # %bb.2:
 ; SLOW-DIVQ-NEXT:    cqto
@@ -159,8 +158,8 @@ define i64 @sdiv_quotient_and_remainder(i64 %a, i64 %b) nounwind {
 ; SLOW-DIVQ-NEXT:    addq %rdx, %rax
 ; SLOW-DIVQ-NEXT:    retq
 ; SLOW-DIVQ-NEXT:  .LBB6_1:
-; SLOW-DIVQ-NEXT:    # kill: def $eax killed $eax killed $rax
-; SLOW-DIVQ-NEXT:    xorl %edx, %edx
+; SLOW-DIVQ-DAG:     # kill: def $eax killed $eax killed $rax
+; SLOW-DIVQ-DAG:     xorl %edx, %edx
 ; SLOW-DIVQ-NEXT:    divl %esi
 ; SLOW-DIVQ-NEXT:    # kill: def $edx killed $edx def $rdx
 ; SLOW-DIVQ-NEXT:    # kill: def $eax killed $eax def $rax
@@ -214,18 +213,18 @@ define i64 @udiv_quotient(i64 %a, i64 %b) nounwind {
 ;
 ; SLOW-DIVQ-LABEL: udiv_quotient:
 ; SLOW-DIVQ:       # %bb.0:
-; SLOW-DIVQ-NEXT:    movq %rdi, %rax
-; SLOW-DIVQ-NEXT:    movq %rdi, %rcx
-; SLOW-DIVQ-NEXT:    orq %rsi, %rcx
-; SLOW-DIVQ-NEXT:    shrq $32, %rcx
+; SLOW-DIVQ-DAG:     movq %rdi, %rax
+; SLOW-DIVQ-DAG:     movq %rdi, %rcx
+; SLOW-DIVQ-DAG:     orq %rsi, %rcx
+; SLOW-DIVQ-DAG:     shrq $32, %rcx
 ; SLOW-DIVQ-NEXT:    je .LBB9_1
 ; SLOW-DIVQ-NEXT:  # %bb.2:
 ; SLOW-DIVQ-NEXT:    xorl %edx, %edx
 ; SLOW-DIVQ-NEXT:    divq %rsi
 ; SLOW-DIVQ-NEXT:    retq
 ; SLOW-DIVQ-NEXT:  .LBB9_1:
-; SLOW-DIVQ-NEXT:    # kill: def $eax killed $eax killed $rax
-; SLOW-DIVQ-NEXT:    xorl %edx, %edx
+; SLOW-DIVQ-DAG:     # kill: def $eax killed $eax killed $rax
+; SLOW-DIVQ-DAG:     xorl %edx, %edx
 ; SLOW-DIVQ-NEXT:    divl %esi
 ; SLOW-DIVQ-NEXT:    # kill: def $eax killed $eax def $rax
 ; SLOW-DIVQ-NEXT:    retq
@@ -266,10 +265,10 @@ define i64 @udiv_remainder(i64 %a, i64 %b) nounwind {
 ;
 ; SLOW-DIVQ-LABEL: udiv_remainder:
 ; SLOW-DIVQ:       # %bb.0:
-; SLOW-DIVQ-NEXT:    movq %rdi, %rax
-; SLOW-DIVQ-NEXT:    movq %rdi, %rcx
-; SLOW-DIVQ-NEXT:    orq %rsi, %rcx
-; SLOW-DIVQ-NEXT:    shrq $32, %rcx
+; SLOW-DIVQ-DAG:     movq %rdi, %rax
+; SLOW-DIVQ-DAG:     movq %rdi, %rcx
+; SLOW-DIVQ-DAG:     orq %rsi, %rcx
+; SLOW-DIVQ-DAG:     shrq $32, %rcx
 ; SLOW-DIVQ-NEXT:    je .LBB12_1
 ; SLOW-DIVQ-NEXT:  # %bb.2:
 ; SLOW-DIVQ-NEXT:    xorl %edx, %edx
@@ -277,8 +276,8 @@ define i64 @udiv_remainder(i64 %a, i64 %b) nounwind {
 ; SLOW-DIVQ-NEXT:    movq %rdx, %rax
 ; SLOW-DIVQ-NEXT:    retq
 ; SLOW-DIVQ-NEXT:  .LBB12_1:
-; SLOW-DIVQ-NEXT:    # kill: def $eax killed $eax killed $rax
-; SLOW-DIVQ-NEXT:    xorl %edx, %edx
+; SLOW-DIVQ-DAG:     # kill: def $eax killed $eax killed $rax
+; SLOW-DIVQ-DAG:     xorl %edx, %edx
 ; SLOW-DIVQ-NEXT:    divl %esi
 ; SLOW-DIVQ-NEXT:    movl %edx, %eax
 ; SLOW-DIVQ-NEXT:    retq
@@ -321,10 +320,10 @@ define i64 @udiv_quotient_and_remainder(i64 %a, i64 %b) nounwind {
 ;
 ; SLOW-DIVQ-LABEL: udiv_quotient_and_remainder:
 ; SLOW-DIVQ:       # %bb.0:
-; SLOW-DIVQ-NEXT:    movq %rdi, %rax
-; SLOW-DIVQ-NEXT:    movq %rdi, %rcx
-; SLOW-DIVQ-NEXT:    orq %rsi, %rcx
-; SLOW-DIVQ-NEXT:    shrq $32, %rcx
+; SLOW-DIVQ-DAG:     movq %rdi, %rax
+; SLOW-DIVQ-DAG:     movq %rdi, %rcx
+; SLOW-DIVQ-DAG:     orq %rsi, %rcx
+; SLOW-DIVQ-DAG:     shrq $32, %rcx
 ; SLOW-DIVQ-NEXT:    je .LBB15_1
 ; SLOW-DIVQ-NEXT:  # %bb.2:
 ; SLOW-DIVQ-NEXT:    xorl %edx, %edx
@@ -332,8 +331,8 @@ define i64 @udiv_quotient_and_remainder(i64 %a, i64 %b) nounwind {
 ; SLOW-DIVQ-NEXT:    addq %rdx, %rax
 ; SLOW-DIVQ-NEXT:    retq
 ; SLOW-DIVQ-NEXT:  .LBB15_1:
-; SLOW-DIVQ-NEXT:    # kill: def $eax killed $eax killed $rax
-; SLOW-DIVQ-NEXT:    xorl %edx, %edx
+; SLOW-DIVQ-DAG:     # kill: def $eax killed $eax killed $rax
+; SLOW-DIVQ-DAG:     xorl %edx, %edx
 ; SLOW-DIVQ-NEXT:    divl %esi
 ; SLOW-DIVQ-NEXT:    # kill: def $edx killed $edx def $rdx
 ; SLOW-DIVQ-NEXT:    # kill: def $eax killed $eax def $rax
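
Note: the bypass that TuningSlowDivide64 (the "idivq-to-divl" attribute)
enables corresponds roughly to the C sketch below. This is illustrative
only - the helper name udiv64_bypass is made up, not anything in LLVM.
If the high halves of both operands OR to zero, the quotient and
remainder fit in 32 bits, so the cheap divl can stand in for the slow
divq:

    #include <stdint.h>

    /* Hypothetical sketch of the generated control flow: the
       ((a | b) >> 32) test matches the orq/shrq/je sequence in the
       SLOW-DIVQ check lines above. */
    static uint64_t udiv64_bypass(uint64_t a, uint64_t b) {
      if (((a | b) >> 32) == 0)
        return (uint32_t)a / (uint32_t)b;  /* 32-bit divl path */
      return a / b;                        /* 64-bit divq path */
    }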