-
Notifications
You must be signed in to change notification settings - Fork 24
/
private.h
154 lines (130 loc) · 5.2 KB
/
private.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
#ifndef MHCUDA_PRIVATE_H
#define MHCUDA_PRIVATE_H
#include "minhashcuda.h"
#include <cmath>
#include <cstdio>
#include <tuple>
#include "wrappers.h"
#define INFO(...) do { if (verbosity > 0) { printf(__VA_ARGS__); } } while (false)
#define DEBUG(...) do { if (verbosity > 1) { printf(__VA_ARGS__); } } while (false)
#define CUERRSTR() cudaGetErrorString(cudaGetLastError())
#define CUCH(cuda_call, ret, ...) \
do { \
auto __res = cuda_call; \
if (__res != 0) { \
DEBUG("%s\n", #cuda_call); \
INFO("%s:%d -> %s\n", __FILE__, __LINE__, cudaGetErrorString(__res)); \
__VA_ARGS__; \
return ret; \
} \
} while (false)
#define RETERR(call, ...) \
do { \
auto __res = call; \
if (__res != 0) { \
__VA_ARGS__; \
return __res; \
} \
} while (false)
#define FOR_EACH_DEV(...) do { for (int dev : devs) { \
CUCH(cudaSetDevice(dev), mhcudaNoSuchDevice); \
__VA_ARGS__; \
} } while(false)
#define FOR_EACH_DEVI(...) do { for (size_t devi = 0; devi < devs.size(); devi++) { \
CUCH(cudaSetDevice(devs[devi]), mhcudaNoSuchDevice); \
__VA_ARGS__; \
} } while(false)
#define SYNC_ALL_DEVS do { \
if (devs.size() > 1) { \
FOR_EACH_DEV(CUCH(cudaDeviceSynchronize(), mhcudaRuntimeError)); \
} } while (false)
#define CUMEMCPY_D2H_ASYNC(dst, dst_stride, src, src_offset, size) do { \
FOR_EACH_DEVI(CUCH(cudaMemcpyAsync( \
dst + dst_stride * devi, (src)[devi].get() + src_offset, \
(size) * sizeof(std::remove_reference<decltype(src)>::type::value_type \
::element_type), \
cudaMemcpyDeviceToHost), \
mhcudaMemoryCopyError)); \
} while(false)
#define CUMEMCPY_D2H(dst, src, size) do { \
CUMEMCPY_D2H_ASYNC(dst, src, size); \
FOR_EACH_DEV(CUCH(cudaDeviceSynchronize(), mhcudaMemoryCopyError)); \
} while(false)
#define CUMEMCPY_H2D_ASYNC(dst, dst_offset, src, size) do { \
FOR_EACH_DEVI(CUCH(cudaMemcpyAsync( \
(dst)[devi].get() + dst_offset, src, \
(size) * sizeof(std::remove_reference<decltype(dst)>::type::value_type \
::element_type), \
cudaMemcpyHostToDevice), \
mhcudaMemoryCopyError)); \
} while(false)
#define CUMEMCPY_H2D(dst, src, size) do { \
CUMEMCPY_H2D_ASYNC(dst, src, size); \
FOR_EACH_DEV(CUCH(cudaDeviceSynchronize(), mhcudaMemoryCopyError)); \
} while(false)
#define CUMEMCPY_D2D_ASYNC(dst, dst_offset, src, src_offset, size) do { \
FOR_EACH_DEVI(CUCH(cudaMemcpyAsync( \
(dst)[devi].get() + dst_offset, (src)[devi].get() + src_offset, \
(size) * sizeof(std::remove_reference<decltype(dst)>::type::value_type \
::element_type), \
cudaMemcpyDeviceToDevice), \
mhcudaMemoryCopyError)); \
} while(false)
#define CUMEMCPY_D2D(dst, dst_offset, src, src_offset, size) do { \
CUMEMCPY_D2D_ASYNC(dst, dst_offset, src, src_offset, size); \
FOR_EACH_DEV(CUCH(cudaDeviceSynchronize(), mhcudaMemoryCopyError)); \
} while(false)
#define CUMALLOC_ONEN(dest, size, name) do { \
void *__ptr; \
CUCH(cudaMalloc( \
&__ptr, \
(size) * sizeof(std::remove_reference<decltype(dest)>::type::value_type \
::element_type)), \
mhcudaMemoryAllocationFailure, \
INFO("failed to allocate %zu bytes for " name "\n", \
static_cast<size_t>(size))); \
(dest).emplace_back(reinterpret_cast<std::remove_reference<decltype(dest)> \
::type::value_type::element_type *>(__ptr)); \
} while(false)
#define CUMALLOC_ONE(dest, size) CUMALLOC_ONEN(dest, size, #dest)
#define CUMALLOCN(dest, size, name) do { \
FOR_EACH_DEV(CUMALLOC_ONEN(dest, size, name)); \
} while(false)
#define CUMALLOC(dest, size) CUMALLOCN(dest, size, #dest)
#define CUMEMSET(dst, val, size) do { \
FOR_EACH_DEVI(CUCH(cudaMemsetAsync( \
(dst)[devi].get(), val, \
size * sizeof(std::remove_reference<decltype(dst)>::type::value_type::element_type)), \
mhcudaRuntimeError)); \
FOR_EACH_DEV(CUCH(cudaDeviceSynchronize(), mhcudaRuntimeError)); \
} while(false)
#define FOR_OTHER_DEVS(...) do { \
for (size_t odevi = 0; odevi < devs.size(); odevi++) { \
if (odevi == devi) { \
continue; \
} \
__VA_ARGS__; \
} } while(false)
#define CUP2P(what, offset, size) do { \
CUCH(cudaMemcpyPeerAsync( \
(*what)[odevi].get() + offset, devs[odevi], (*what)[devi].get() + offset, \
devs[devi], (size) * sizeof(std::remove_reference<decltype(*what)>::type \
::value_type::element_type)), \
mhcudaMemoryCopyError); \
} while(false)
extern "C" {
cudaError_t gamma_(uint32_t size, const float *v1, float *v2);
cudaError_t log_(uint32_t size, float *v);
MHCUDAResult setup_weighted_minhash(
uint32_t dim, const std::vector<int> &devs, int verbosity);
MHCUDAResult weighted_minhash(
const udevptrs<float> &rs, const udevptrs<float> &ln_cs,
const udevptrs<float> &betas, const udevptrs<float> &weights,
const udevptrs<uint32_t> &cols, const udevptrs<uint32_t> &rows,
int samples, const std::vector<int> &sample_delta,
const udevptrs<int32_t> &plan, const std::vector<uint32_t> &split,
const uint32_t *original_rows, const std::vector<uint32_t> &grid_sizes,
const std::vector<int> &devs, int verbosity, udevptrs<uint32_t> *hashes);
}
#define MINHASH_BLOCK_SIZE 512
#endif // MHCUDA_PRIVATE_H