-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathregrid.cl
275 lines (254 loc) · 10.4 KB
/
regrid.cl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
//Storage for that many voxel per pixel
#define STORAGE_SIZE 64
// Function to perform an atom addition in global memory (does not exist in OpenCL)
inline void atomic_add_global_float(volatile global float *addr, float val)
{
union {
uint u32;
float f32;
} next, expected, current;
current.f32 = *addr;
do {
expected.f32 = current.f32;
next.f32 = expected.f32 + val;
current.u32 = atomic_cmpxchg( (volatile global uint *)addr,
expected.u32, next.u32);
} while( current.u32 != expected.u32 );
}
// Performs the centering/scaling in the detector plane. Image flipping must be implemented here
float2 inline calc_position_real(float2 index,
float2 center,
float pixel_size)
{
return (index - center) * pixel_size;
}
// Transforms a 2D position in the image into a 3D coordinate in the volume
float3 inline calc_position_rec(float2 index,
float2 center,
float pixel_size,
float distance,
float3 Rx,
float3 Ry,
float3 Rz)
{
float2 pos2 = calc_position_real(index, center, pixel_size);
// float d = sqrt(distance*distance + dot(pos2, pos2));
// float d = fast_length((float3)(distance, pos2));
float inv_d = rsqrt(distance*distance + dot(pos2, pos2));
float3 pos3 = (float3)(pos2.x*inv_d, pos2.y*inv_d, distance*inv_d-1.0f);
float scale = distance/pixel_size;
return scale * (float3)(dot(Rx, pos3), dot(Ry, pos3), dot(Rz, pos3));
}
/* Kernel: regid_CDI_slaby
* Regrid an image (height x width) to a 3D volume (shape³)
*
* In this kernel populates a slab along y (x and z have the full size)
* with a subset of the image, one horizontal band from y_start to y_end
*
* Slabed, store only data starting at slab_start <= y < slab_end
*
* Nota, some pixels will overlap at the slab boundary, it is advised to send a
* couple of extra lines of the image rather than missing part of the image.
*
* param image: 2D image. Only a limited number of lines by be given by y_start, y_end
* param mask: 2D mask, non zero values are ignored. The full mask is always used
* param height: the height of the full image and of the mask
* param width: the width of the image
* param y_start: the starting line number of the image 0<=y_start<height
* param y_end: the last line number of the image 0<=y_start<y_end<=height
* param pixel size: the size of one pixel in the image, assumed square, in meter
* param distance: the sample to detector distance in meter
* param phi: the rotation angle of the sample along the y-axis of the image
* param dphi: the step size of the rotation (in case of continuous rotation)
* param center_y: the coordinate of the beam center on the image along y
* param center_x: the coordinate of the beam center on the image along x
* param scale: Zooming out factor, a scale of one matches one pixel to one voxel, for 2, 2x2 pixels match to 1 voxel
* param signal: the output slab used for accumulating all signal, which size is: (shape, slaby_end-slaby_start, shape)
* param norm: the normalization array for the slab, same shape.
* param shape: size of the final volume (in each direction)
* param slaby_start: the index of begining of slab 0<= slaby_start<slaby_end <=shape
* param slaby_end: the index of end of slab: slab 0<= slaby_start<slaby_end <=shape
* param oversampling_pixel: split each pixel into a t*t sub pixel and project the according number of times. Smoother results
* param oversampling_phi: project each frame that many times beteween phi and phi+dphi. Exact only in continuous/oscilation mode !
*
* Workgroup size policy:
* 1 work-item per pixel
* Dimention 0 is x
* Dimention 1 is y (as in cuda !)
*
* The workgroup should be small (due to large register usage)
* and aligned along x which gives:
*
* global size: (width, heigth)
* workgroup size:(32, 1)
* assuming cuda device.
*
*/
kernel void regid_CDI_slaby(global float* image,
global uchar* mask,
const int height,
const int width,
const int y_start,
const int y_end,
const float pixel_size,
const float distance,
const float phi,
float dphi,
const float center_y,
const float center_x,
const float scale,
global float* signal,
global int* norm,
const int shape,
const int slaby_size,
const int slaby_start,
const int slaby_end,
const int oversampling_pixel,
const int oversampling_phi)
{
int tmp, shape_2, i, j, k, x, y_local, y_global;
ulong where_in, where_out;
float value;
float2 pos2, center = (float2)(center_x, center_y);
float3 Rx, Ry, Rz, recip;
const float D2R = M_PI_F/180.0f;
const float iscale = 1.0f/scale;
//This is local storage of voxels to be written
int last=0;
ulong index[STORAGE_SIZE]; // store the position in the slab/volume
float store_s[STORAGE_SIZE]; // store the signal
uint store_n[STORAGE_SIZE]; // store the normalization
//This is a convention, be aware when launching the kernel !
x = get_global_id(0);
y_local = get_global_id(1);
y_global = y_local + y_start;
where_in = width*y_local + x;
shape_2 = shape/2;
// Don't be stupid!
// oversampling_pixel = (oversampling_pixel<1?1:oversampling_pixel);
// oversampling_phi = (oversampling_phi<1?1:oversampling_phi);
const float delta_pix = 1.0f / oversampling_pixel;
const float delta_phi = dphi / oversampling_phi;
const float start_pix = -0.5f * (oversampling_pixel-1);
const float start_phi = -0.5f * (oversampling_phi-1);
{ //Manual mask definition
if ((x >= width) ||
(y_global >= height) ||
(y_global >= y_end))
return;
}
{ // static mask
if (mask[where_in + width*y_start])
return;
}
value = image[where_in];
{//dynamic masking
if (! isfinite(value))
return;
}
//Basic oversampling
for (int dr=0; dr<oversampling_phi; dr++)
{
float cos_phi, sin_phi, rphi;
rphi = (phi + (start_phi + dr)*delta_phi) * D2R;
cos_phi = cos(rphi);
sin_phi = sin(rphi);
Rx = (float3)(cos_phi, 0.0f, sin_phi);
Ry = (float3)(0.0f, 1.0f, 0.0f);
Rz = (float3)(-sin_phi, 0.0f, cos_phi);
for (i=0; i<oversampling_pixel; i++)
{
for (j=0; j<oversampling_pixel; j++)
{
pos2 = (float2)(x + (i + start_pix)*delta_pix,
y_global + (j + start_pix)*delta_pix);
recip = calc_position_rec(pos2, center, pixel_size, distance, Rx, Ry, Rz)*iscale;
tmp = convert_int_rtn(recip.x + 0.5f) + shape_2;
if ((tmp>=0) && (tmp<shape))
{
where_out = tmp;
tmp = convert_int_rtn(recip.y + 0.5f) + shape_2;
if ((tmp>=slaby_start) && (tmp<slaby_end))
{
where_out += (tmp-slaby_start) * shape;
tmp = convert_int_rtn(recip.z + 0.5f) + shape_2;
if ((tmp>=0) && (tmp<shape))
{
where_out += (ulong) tmp * shape * slaby_size;
//storage locally
int found = 0;
for (k=0; k<last; k++)
{
if (where_out == index[k])
{
store_s[k] += value;
store_n[k] += 1;
found = 1;
k = last;
}
}
if (found == 0)
{
if (last >= STORAGE_SIZE)
printf("Too many voxels covered by pixel\n");
else
{
index[last] = where_out;
store_s[last] = value;
store_n[last] = 1;
last++;
}
}
}
}
}
}
}
}
// Finally we update the global memory with atomic writes
for (k=0; k<last; k++)
{
atomic_add_global_float(&signal[index[k]], store_s[k]);
atomic_add(&norm[index[k]], store_n[k]);
}
}
/* Normalization kernel to be called at the very end of the processing
*
* Basically it performs an inplace normalization:
*
* signal /= norm
*
* In this kernel: size = slab_size * shape_1 * shape_2 and is uint64!
*
* WG hint: 1D with the largest workgroup size possible
*/
kernel void normalize_signal(global float* signal,
global int* norm,
const ulong size)
{
ulong idx = get_global_id(0);
if (idx<size)
{
signal[idx] /= (float) norm[idx];
}
}
/* Memset kernel called to empty the arrays
*
* signal[...] = 0.0f
* norm[...] = 0
*
* In this kernel: size = slab_size * shape_1 * shape_2 and is uint64!
*
* WG hint: 1D with the largest workgroup size possible
*/
kernel void memset_signal(global float* signal,
global int* norm,
const ulong size)
{
ulong idx = get_global_id(0);
if (idx<size)
{
signal[idx] = 0.0f;
norm[idx] = 0;
}
}