Skip to content

Commit f8ebcd7

Browse files
[Performance] Dynamic cpu kernel V3 for SpMMSumCsr all Ops (dmlc#2309)
* support AVX512 * env DGL_CPU_INTEL_KERNEL_ENABLED=1 * env DGL_CPU_INTEL_KERNEL_LOG=1 * Add unittest test_spmm.cc Co-authored-by: Izabela Mazur <[email protected]> Co-authored-by: Michal Szarmach <[email protected]> Review patch
1 parent 62b4bbb commit f8ebcd7

File tree

9 files changed

+1003
-212
lines changed

9 files changed

+1003
-212
lines changed

.gitmodules

+3
Original file line numberDiff line numberDiff line change
@@ -23,3 +23,6 @@
2323
[submodule "third_party/thrust"]
2424
path = third_party/thrust
2525
url = https://github.com/NVIDIA/thrust.git
26+
[submodule "third_party/xbyak"]
27+
path = third_party/xbyak
28+
url = https://github.com/herumi/xbyak

CMakeLists.txt

+1
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,7 @@ include_directories("third_party/dmlc-core/include")
5959
include_directories("third_party/minigun/minigun")
6060
include_directories("third_party/minigun/third_party/moderngpu/src")
6161
include_directories("third_party/phmap/")
62+
include_directories("third_party/xbyak/")
6263

6364
# initial variables
6465
set(DGL_LINKER_LIBS "")

docs/source/env_var.rst

+12
Original file line numberDiff line numberDiff line change
@@ -22,3 +22,15 @@ Data Repository
2222
* ``DGL_DOWNLOAD_DIR``:
2323
* Values: String (default="${HOME}/.dgl")
2424
* The local directory to cache the downloaded data.
25+
26+
Intel CPU Performance Options
27+
-----------------------------
28+
* ``DGL_CPU_INTEL_KERNEL_ENABLED``:
29+
* Values: int (default='0')
30+
* Use dynamic cpu kernels.
31+
* Suggested values: 1
32+
33+
* ``DGL_CPU_INTEL_KERNEL_LOG``:
34+
* Values: int (default='0')
35+
* Show diagnostic message (debug mode).
36+
* Suggested values: 1

include/intel/cpu_support.h

+332
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,332 @@
1+
/*!
2+
* Copyright (c) 2019 by Contributors
3+
* \file intel/cpu_support.h
4+
* \brief Intel CPU support
5+
* \author Pawel Piotrowicz <[email protected]>
6+
*/
7+
#ifndef INTEL_CPU_SUPPORT_H_
8+
#define INTEL_CPU_SUPPORT_H_
9+
#include <memory>
10+
#include <tuple>
11+
#include <type_traits>
12+
#include "dmlc/logging.h"
13+
#include "meta_utils.h"
14+
#include "xbyak/xbyak.h"
15+
#include "xbyak/xbyak_util.h"
16+
17+
namespace dgl {
18+
19+
/*! \brief Element types accepted by the kernels declared in this header. */
using supported_types = std::tuple<float, double>;
20+
21+
#ifndef log_intel
/*!
 * \brief Emit a diagnostic message through LOG(INFO) when Intel kernel logging
 *        is enabled (see IntelKernel<>::IsLogEnabled()).
 * \param x Expression chain streamed into the log, e.g. `"a=" << a`.
 * \note Wrapped in do { } while (0) so the macro behaves like a single
 *       statement: it requires a trailing semicolon and is safe inside
 *       un-braced if/else bodies.
 */
#define log_intel(x)                     \
  do {                                   \
    if (IntelKernel<>::IsLogEnabled()) { \
      LOG(INFO) << x;                    \
    }                                    \
  } while (0)
#endif
27+
28+
/*!
 * \brief Build a Zmm register operand with the same register index as \p v.
 * \param v Source vector register operand.
 * \return Xbyak::Zmm constructed from v.getIdx().
 */
static inline Xbyak::Zmm make_zmm(const Xbyak::Xmm &v) {
  const auto reg_index = v.getIdx();
  return Xbyak::Zmm(reg_index);
}
31+
template <int version = 0>
32+
struct IntelKernel {
33+
static int64_t GetValue() {
34+
int64_t v = 0;
35+
const char *label = "DGL_CPU_INTEL_KERNEL_ENABLED";
36+
const char *ptr = std::getenv(label);
37+
if (ptr) {
38+
v = atoll(ptr);
39+
log_intel(label << "=>" << v);
40+
}
41+
return v;
42+
}
43+
44+
static int64_t IsEnabled() {
45+
static int64_t r = IntelKernel<version>::GetValue();
46+
return r;
47+
}
48+
49+
static int IsLogEnabled() {
50+
static int r = (std::getenv("DGL_CPU_INTEL_KERNEL_LOG")) ? 1 : 0;
51+
return r;
52+
}
53+
};
54+
55+
/*!
56+
* \brief JIT-generated kernel (emitted through Xbyak in the constructor) that
* accumulates an element-wise result into the output buffer:
* out[i] += left[i] <op> right[i] for Add/Sub/Mul/Div, and
* out[i] += left[i] / out[i] += right[i] for CopyLhs / CopyRhs.
57+
* \note Code is only generated when the CPU reports AVX512F; otherwise the
* generated function is a single `ret` and applicable() returns false.
58+
* \tparam Op Operator type; Op::type must be one of supported_types.
*/
59+
template <class Op>
60+
class ElemWiseAddUpdate : public Xbyak::CodeGenerator {
61+
public:
62+
typedef typename Op::type DType;
63+
static_assert(
64+
std::is_base_of<std::true_type,
65+
utils::has_type<DType, supported_types>>::value,
66+
"Use case fail dgl::ElemWiseAddUpdate< Operator<DType> > DType is not "
67+
"supported !");
68+
69+
protected:
70+
// Registers holding the kernel arguments; bound in the constructor to
// rdi (out), rsi (left), rdx (right) and rcx (size).
// NOTE(review): presumably chosen to match the System V AMD64 argument
// order used by run() -- confirm before targeting another ABI.
const Xbyak::Reg64 &r_out_;
71+
const Xbyak::Reg64 &r_left_;
72+
const Xbyak::Reg64 &r_right;
73+
const Xbyak::Reg64 &r_size_;
74+
75+
/* True once the AVX512F code path has been emitted, i.e. the kernel is
   usable on this machine. */
76+
bool applicable_;
77+
78+
public:
79+
// Size of one element in bytes.
static constexpr int UNIT_SIZE_BYTES = sizeof(DType);
80+
static constexpr int BITS_IN_BYTES = 8;
81+
// Width of the vector registers used by the kernel, in bits.
static constexpr int REG_BIT_SIZE = 512;
82+
// Number of DType elements processed per vector register.
static constexpr int UNIT_PER_REG =
83+
REG_BIT_SIZE / (UNIT_SIZE_BYTES * BITS_IN_BYTES);
84+
85+
// alias_* helpers: overload pairs selected on the element type that emit the
// single-precision (..ps) or double-precision (..pd) form of an instruction.
template <class TType, class R1, class R2,
86+
utils::CheckCmp<TType, float> = true>
87+
void alias_load(R1 r1, R2 r2) {
88+
vmovups(r1, r2);
89+
}
90+
template <class TType, class R1, class R2,
91+
utils::CheckCmp<TType, double> = true>
92+
void alias_load(R1 r1, R2 r2) {
93+
vmovupd(r1, r2);
94+
}
95+
96+
// Stores are emitted with the same move instruction as loads; only the
// operand order (destination first) differs at the call site.
template <class TType, class R1, class R2,
97+
utils::CheckCmp<TType, float> = true>
98+
void alias_save(R1 r1, R2 r2) {
99+
alias_load<TType>(r1, r2);
100+
}
101+
template <class TType, class R1, class R2,
102+
utils::CheckCmp<TType, double> = true>
103+
void alias_save(R1 r1, R2 r2) {
104+
alias_load<TType>(r1, r2);
105+
}
106+
107+
// r1 = r2 + r3
template <class TType, class R1, class R2, class R3,
108+
utils::CheckCmp<TType, float> = true>
109+
void alias_ADD(R1 r1, R2 r2, R3 r3) {
110+
vaddps(r1, r2, r3);
111+
}
112+
template <class TType, class R1, class R2, class R3,
113+
utils::CheckCmp<TType, double> = true>
114+
void alias_ADD(R1 r1, R2 r2, R3 r3) {
115+
vaddpd(r1, r2, r3);
116+
}
117+
118+
// r1 = r2 - r3
template <class TType, class R1, class R2, class R3,
119+
utils::CheckCmp<TType, float> = true>
120+
void alias_SUB(R1 r1, R2 r2, R3 r3) {
121+
vsubps(r1, r2, r3);
122+
}
123+
template <class TType, class R1, class R2, class R3,
124+
utils::CheckCmp<TType, double> = true>
125+
void alias_SUB(R1 r1, R2 r2, R3 r3) {
126+
vsubpd(r1, r2, r3);
127+
}
128+
129+
// r1 = r2 / r3
template <class TType, class R1, class R2, class R3,
130+
utils::CheckCmp<TType, float> = true>
131+
void alias_DIV(R1 r1, R2 r2, R3 r3) {
132+
vdivps(r1, r2, r3);
133+
}
134+
template <class TType, class R1, class R2, class R3,
135+
utils::CheckCmp<TType, double> = true>
136+
void alias_DIV(R1 r1, R2 r2, R3 r3) {
137+
vdivpd(r1, r2, r3);
138+
}
139+
140+
// r1 = r2 * r3
template <class TType, class R1, class R2, class R3,
141+
utils::CheckCmp<TType, float> = true>
142+
void alias_MUL(R1 r1, R2 r2, R3 r3) {
143+
vmulps(r1, r2, r3);
144+
}
145+
template <class TType, class R1, class R2, class R3,
146+
utils::CheckCmp<TType, double> = true>
147+
void alias_MUL(R1 r1, R2 r2, R3 r3) {
148+
vmulpd(r1, r2, r3);
149+
}
150+
151+
// full_chunk_loop_operations: body of the main loop, one overload per
// operator. Each processes UNIT_PER_REG elements starting at element index r9
// and writes the accumulated result back to out.
// CopyLhs: out += left.
template <class Operator,
152+
utils::Verify<Operator, ::dgl::aten::cpu::op::CopyLhs,
153+
supported_types> = true>
154+
void full_chunk_loop_operations() {
155+
typedef typename Operator::type IType;
156+
alias_load<IType>(zmm0, ptr[r_out_ + r9 * sizeof(IType)]);
157+
alias_load<IType>(zmm1, ptr[r_left_ + r9 * sizeof(IType)]);
158+
alias_ADD<IType>(zmm2, zmm0, zmm1);
159+
alias_save<IType>(ptr[r_out_ + r9 * sizeof(IType)], zmm2);
160+
}
161+
// CopyRhs: out += right.
template <class Operator,
162+
utils::Verify<Operator, ::dgl::aten::cpu::op::CopyRhs,
163+
supported_types> = true>
164+
void full_chunk_loop_operations() {
165+
typedef typename Operator::type IType;
166+
alias_load<IType>(zmm0, ptr[r_out_ + r9 * sizeof(IType)]);
167+
alias_load<IType>(zmm1, ptr[r_right + r9 * sizeof(IType)]);
168+
alias_ADD<IType>(zmm2, zmm0, zmm1);
169+
alias_save<IType>(ptr[r_out_ + r9 * sizeof(IType)], zmm2);
170+
}
171+
// Shared prologue for the binary operators:
// zmm0 = out chunk, zmm1 = left chunk, zmm2 = right chunk.
template <class T>
172+
void loop_pre() {
173+
alias_load<T>(zmm0, ptr[r_out_ + r9 * sizeof(T)]);
174+
alias_load<T>(zmm1, ptr[r_left_ + r9 * sizeof(T)]);
175+
alias_load<T>(zmm2, ptr[r_right + r9 * sizeof(T)]);
176+
}
177+
// Shared epilogue for the binary operators: expects the operator result in
// zmm2, adds the previous out chunk (zmm0) and stores the sum back to out.
template <class T>
178+
void loop_post() {
179+
alias_ADD<T>(zmm2, zmm0, zmm2);
180+
alias_save<T>(ptr[r_out_ + r9 * sizeof(T)], zmm2);
181+
}
182+
// Add: out += left + right.
template <class Operator, utils::Verify<Operator, ::dgl::aten::cpu::op::Add,
183+
supported_types> = true>
184+
void full_chunk_loop_operations() {
185+
typedef typename Operator::type IType;
186+
loop_pre<IType>();
187+
alias_ADD<IType>(zmm2, zmm1, zmm2);
188+
loop_post<IType>();
189+
}
190+
// Sub: out += left - right.
template <class Operator, utils::Verify<Operator, ::dgl::aten::cpu::op::Sub,
191+
supported_types> = true>
192+
void full_chunk_loop_operations() {
193+
typedef typename Operator::type IType;
194+
loop_pre<IType>();
195+
alias_SUB<IType>(zmm2, zmm1, zmm2);
196+
loop_post<IType>();
197+
}
198+
199+
// Div: out += left / right.
template <class Operator, utils::Verify<Operator, ::dgl::aten::cpu::op::Div,
200+
supported_types> = true>
201+
void full_chunk_loop_operations() {
202+
typedef typename Operator::type IType;
203+
loop_pre<IType>();
204+
alias_DIV<IType>(zmm2, zmm1, zmm2);
205+
loop_post<IType>();
206+
}
207+
208+
// Mul: out += left * right.
template <class Operator, utils::Verify<Operator, ::dgl::aten::cpu::op::Mul,
209+
supported_types> = true>
210+
void full_chunk_loop_operations() {
211+
typedef typename Operator::type IType;
212+
loop_pre<IType>();
213+
alias_MUL<IType>(zmm2, zmm1, zmm2);
214+
loop_post<IType>();
215+
}
216+
217+
// remainder_operations: handles the final partial chunk, one overload per
// operator. Loads are restricted by the opmask and the operator result is
// left in zmm2; the constructor then adds the out chunk and does the masked
// store.
// CopyLhs: zmm2 = left (masked).
template <class Operator,
218+
utils::Verify<Operator, ::dgl::aten::cpu::op::CopyLhs,
219+
supported_types> = true>
220+
void remainder_operations(const Xbyak::Opmask mask) {
221+
typedef typename Operator::type IType;
222+
alias_load<IType>(make_zmm(zmm2) | mask, ptr[r_left_ + r9 * sizeof(IType)]);
223+
}
224+
225+
// CopyRhs: zmm2 = right (masked).
template <class Operator,
226+
utils::Verify<Operator, ::dgl::aten::cpu::op::CopyRhs,
227+
supported_types> = true>
228+
void remainder_operations(const Xbyak::Opmask mask) {
229+
typedef typename Operator::type IType;
230+
alias_load<IType>(make_zmm(zmm2) | mask, ptr[r_right + r9 * sizeof(IType)]);
231+
}
232+
233+
// Masked fetch of both operands for the remainder: zmm2 = left, zmm1 = right.
// Note the register roles differ from loop_pre (there zmm1 = left,
// zmm2 = right).
template <class T>
234+
void remainder_fetch_LR(const Xbyak::Opmask mask) {
235+
alias_load<T>(make_zmm(zmm2) | mask, ptr[r_left_ + r9 * sizeof(T)]);
236+
alias_load<T>(make_zmm(zmm1) | mask, ptr[r_right + r9 * sizeof(T)]);
237+
}
238+
239+
// Mul: zmm2 = left * right.
template <class Operator, utils::Verify<Operator, ::dgl::aten::cpu::op::Mul,
240+
supported_types> = true>
241+
void remainder_operations(const Xbyak::Opmask mask) {
242+
typedef typename Operator::type IType;
243+
remainder_fetch_LR<IType>(mask);
244+
alias_MUL<IType>(zmm2, zmm2, zmm1);
245+
}
246+
247+
// Add: zmm2 = left + right.
template <class Operator, utils::Verify<Operator, ::dgl::aten::cpu::op::Add,
248+
supported_types> = true>
249+
void remainder_operations(const Xbyak::Opmask mask) {
250+
typedef typename Operator::type IType;
251+
remainder_fetch_LR<IType>(mask);
252+
alias_ADD<DType>(zmm2, zmm2, zmm1);
253+
}
254+
255+
// Div: zmm2 = left / right.
template <class Operator, utils::Verify<Operator, ::dgl::aten::cpu::op::Div,
256+
supported_types> = true>
257+
void remainder_operations(const Xbyak::Opmask mask) {
258+
typedef typename Operator::type IType;
259+
remainder_fetch_LR<IType>(mask);
260+
alias_DIV<DType>(zmm2, zmm2, zmm1);
261+
}
262+
263+
// Sub: zmm2 = left - right.
template <class Operator, utils::Verify<Operator, ::dgl::aten::cpu::op::Sub,
264+
supported_types> = true>
265+
void remainder_operations(const Xbyak::Opmask mask) {
266+
typedef typename Operator::type IType;
267+
remainder_fetch_LR<IType>(mask);
268+
alias_SUB<DType>(zmm2, zmm2, zmm1);
269+
}
270+
271+
// Emits the kernel. Register use inside the generated code:
//   r8  - number of trailing elements that do not fill a whole register
//   r9  - current element index
//   rax - scratch for building the remainder mask, k1 - the mask itself
// When AVX512F is not available only the final ret() is emitted.
ElemWiseAddUpdate()
272+
: r_out_(rdi),
273+
r_left_(rsi),
274+
r_right(rdx),
275+
r_size_(rcx),
276+
applicable_(false) {
277+
static Xbyak::util::Cpu current_cpu;
278+
279+
/* Default case for all */
280+
if (current_cpu.has(Xbyak::util::Cpu::tAVX512F)) {
281+
/* prepare REMAINDER */
282+
mov(r8, r_size_);
283+
and_(r8,
284+
UNIT_PER_REG - 1); // r8 = size % UNIT_PER_REG (a power of two)
285+
xor_(r9, r9); // r9 = 0, element index into the buffers
286+
cmp(r_size_, UNIT_PER_REG); // fewer elements than one full register ?
287+
jl("remainder");
288+
289+
/* round the size down to a whole number of registers */
290+
sub(r_size_, r8); // r_size_ = elements covered by full chunks
291+
cmp(r_size_, 0); // do we have any full chunks ?
292+
jz("remainder");
293+
294+
L("for_i");
295+
full_chunk_loop_operations<Op>();
296+
add(r9, UNIT_PER_REG); // r9 += UNIT_PER_REG
297+
cmp(r_size_, r9); // more full chunks ?
298+
jnz("for_i");
299+
300+
L("remainder");
301+
cmp(r8, 0); // do we have a remainder ?
302+
jz("done");
303+
/* prepare a bitmask for k1 */
304+
mov(rax, 1);
305+
mov(r_size_, r8); // r_size_ is rcx, so cl now holds the remainder count
306+
sal(rax, cl);
307+
dec(rax); // rax = (1 << r8) - 1
308+
kmovw(k1, eax); // set bitmask
309+
alias_load<DType>(make_zmm(zmm0) | k1,
310+
ptr[r_out_ + r9 * UNIT_SIZE_BYTES]);
311+
remainder_operations<Op>(k1);
312+
alias_ADD<DType>(zmm3, zmm2, zmm0);
313+
alias_save<DType>(ptr[r_out_ + r9 * UNIT_SIZE_BYTES],
314+
make_zmm(zmm3) | k1);
315+
L("done");
316+
applicable_ = true;
317+
log_intel("AVX512F cpu kernel is ready");
318+
}
319+
ret();
320+
}
321+
322+
// Returns true when the AVX512F kernel body was generated.
bool applicable() const { return applicable_; }
323+
324+
// Calls the generated code as a `void (*)(P...)` function with the given
// arguments. When applicable() is false the generated code is only a `ret`.
// NOTE(review): presumably expects (out, left, right, size) in that order,
// with size as a 64-bit integer, to match the register bindings above --
// confirm against callers.
template <class... P>
325+
void run(P... args) {
326+
((void (*)(P...))(this)->getCode())(args...);
327+
}
328+
};
329+
330+
} // namespace dgl
331+
332+
#endif // INTEL_CPU_SUPPORT_H_

0 commit comments

Comments
 (0)