22
22
23
23
#include " cuda/vector_helpers.cuh"
24
24
25
/*
 * Pointer to a coefficient generator: a function taking a single float and
 * returning four filter weights packed into a float4.
 */
typedef float4 (*coeffs_function_t)(float);
26
+
27
/*
 * Weight of a single tap for phase t, where t is pi times the tap distance d.
 * Evaluates sin(t) * sin(t / 2) / (t * t / 2), which equals
 * sinc(d) * sinc(d / 2) with sinc(v) = sin(pi * v) / (pi * v).
 * The removable singularity at t == 0 is replaced by its limit, 1.
 */
__device__ inline float lanczos_tap_weight(float t)
{
    if (t == 0.0f)
        return 1.0f;

    return __sinf(t) * __sinf(t / 2.0f) / (t * t / 2.0f);
}

/*
 * Computes four filter weights for the taps at distances
 * x + 1, x, x - 1 and x - 2, then scales them so that they sum to 1.
 *
 * x: tap offset; presumably the fractional sample position in [0, 1)
 *    -- confirm against the caller.
 *
 * Returns the normalized weights as (x, y, z, w) in the tap order above.
 */
__device__ inline float4 lanczos_coeffs(float x)
{
    const float pi = 3.141592654f;

    float4 weights;
    weights.x = lanczos_tap_weight(pi * (x + 1));
    weights.y = lanczos_tap_weight(pi * x);
    weights.z = lanczos_tap_weight(pi * (x - 1));
    weights.w = lanczos_tap_weight(pi * (x - 2));

    /* Normalize so the four taps add up to exactly one. */
    const float total = weights.x + weights.y + weights.z + weights.w;

    return weights / total;
}
48
+
25
49
__device__ inline float4 bicubic_coeffs (float x)
26
50
{
27
51
const float A = -0 .75f ;
@@ -35,10 +59,8 @@ __device__ inline float4 bicubic_coeffs(float x)
35
59
return res;
36
60
}
37
61
38
- __device__ inline void bicubic_fast_coeffs ( float x, float *h0, float *h1, float *s)
62
+ __device__ inline void derived_fast_coeffs ( float4 coeffs, float x, float *h0, float *h1, float *s)
39
63
{
40
- float4 coeffs = bicubic_coeffs (x);
41
-
42
64
float g0 = coeffs.x + coeffs.y ;
43
65
float g1 = coeffs.z + coeffs.w ;
44
66
@@ -48,7 +70,7 @@ __device__ inline void bicubic_fast_coeffs(float x, float *h0, float *h1, float
48
70
}
49
71
50
72
template <typename V>
51
- __device__ inline V bicubic_filter (float4 coeffs, V c0, V c1, V c2, V c3)
73
+ __device__ inline V apply_coeffs (float4 coeffs, V c0, V c1, V c2, V c3)
52
74
{
53
75
V res = c0 * coeffs.x ;
54
76
res += c1 * coeffs.y ;
@@ -59,7 +81,8 @@ __device__ inline V bicubic_filter(float4 coeffs, V c0, V c1, V c2, V c3)
59
81
}
60
82
61
83
template <typename T>
62
- __device__ inline void Subsample_Bicubic (cudaTextureObject_t src_tex,
84
+ __device__ inline void Subsample_Bicubic (coeffs_function_t coeffs_function,
85
+ cudaTextureObject_t src_tex,
63
86
T *dst,
64
87
int dst_width, int dst_height, int dst_pitch,
65
88
int src_width, int src_height,
@@ -81,17 +104,17 @@ __device__ inline void Subsample_Bicubic(cudaTextureObject_t src_tex,
81
104
82
105
float factor = bit_depth > 8 ? 0xFFFF : 0xFF ;
83
106
84
- float4 coeffsX = bicubic_coeffs (fx);
85
- float4 coeffsY = bicubic_coeffs (fy);
107
+ float4 coeffsX = coeffs_function (fx);
108
+ float4 coeffsY = coeffs_function (fy);
86
109
87
110
#define PIX (x, y ) tex2D <floatT>(src_tex, (x), (y))
88
111
89
112
dst[yo * dst_pitch + xo] = from_floatN<T, floatT>(
90
- bicubic_filter <floatT>(coeffsY,
91
- bicubic_filter <floatT>(coeffsX, PIX (px - 1 , py - 1 ), PIX (px, py - 1 ), PIX (px + 1 , py - 1 ), PIX (px + 2 , py - 1 )),
92
- bicubic_filter <floatT>(coeffsX, PIX (px - 1 , py ), PIX (px, py ), PIX (px + 1 , py ), PIX (px + 2 , py )),
93
- bicubic_filter <floatT>(coeffsX, PIX (px - 1 , py + 1 ), PIX (px, py + 1 ), PIX (px + 1 , py + 1 ), PIX (px + 2 , py + 1 )),
94
- bicubic_filter <floatT>(coeffsX, PIX (px - 1 , py + 2 ), PIX (px, py + 2 ), PIX (px + 1 , py + 2 ), PIX (px + 2 , py + 2 ))
113
+ apply_coeffs <floatT>(coeffsY,
114
+ apply_coeffs <floatT>(coeffsX, PIX (px - 1 , py - 1 ), PIX (px, py - 1 ), PIX (px + 1 , py - 1 ), PIX (px + 2 , py - 1 )),
115
+ apply_coeffs <floatT>(coeffsX, PIX (px - 1 , py ), PIX (px, py ), PIX (px + 1 , py ), PIX (px + 2 , py )),
116
+ apply_coeffs <floatT>(coeffsX, PIX (px - 1 , py + 1 ), PIX (px, py + 1 ), PIX (px + 1 , py + 1 ), PIX (px + 2 , py + 1 )),
117
+ apply_coeffs <floatT>(coeffsX, PIX (px - 1 , py + 2 ), PIX (px, py + 2 ), PIX (px + 1 , py + 2 ), PIX (px + 2 , py + 2 ))
95
118
) * factor
96
119
);
97
120
@@ -101,7 +124,8 @@ __device__ inline void Subsample_Bicubic(cudaTextureObject_t src_tex,
101
124
102
125
/* This does not yield correct results. Most likely because of low internal precision in tex2D linear interpolation */
103
126
template <typename T>
104
- __device__ inline void Subsample_FastBicubic (cudaTextureObject_t src_tex,
127
+ __device__ inline void Subsample_FastBicubic (coeffs_function_t coeffs_function,
128
+ cudaTextureObject_t src_tex,
105
129
T *dst,
106
130
int dst_width, int dst_height, int dst_pitch,
107
131
int src_width, int src_height,
@@ -123,10 +147,13 @@ __device__ inline void Subsample_FastBicubic(cudaTextureObject_t src_tex,
123
147
124
148
float factor = bit_depth > 8 ? 0xFFFF : 0xFF ;
125
149
150
+ float4 coeffsX = coeffs_function (fx);
151
+ float4 coeffsY = coeffs_function (fy);
152
+
126
153
float h0x, h1x, sx;
127
154
float h0y, h1y, sy;
128
- bicubic_fast_coeffs ( fx, &h0x, &h1x, &sx);
129
- bicubic_fast_coeffs ( fy, &h0y, &h1y, &sy);
155
+ derived_fast_coeffs (coeffsX, fx, &h0x, &h1x, &sx);
156
+ derived_fast_coeffs (coeffsY, fy, &h0y, &h1y, &sy);
130
157
131
158
#define PIX (x, y ) tex2D <floatT>(src_tex, (x), (y))
132
159
@@ -157,7 +184,7 @@ extern "C" {
157
184
int src_width, int src_height, \
158
185
int bit_depth) \
159
186
{ \
160
- Subsample_Bicubic<T>(src_tex, dst, \
187
+ Subsample_Bicubic<T>(&bicubic_coeffs, src_tex, dst, \
161
188
dst_width, dst_height, dst_pitch, \
162
189
src_width, src_height, \
163
190
bit_depth); \
@@ -171,4 +198,26 @@ BICUBIC_KERNEL(ushort)
171
198
BICUBIC_KERNEL(ushort2 )
172
199
BICUBIC_KERNEL(ushort4 )
173
200
201
+
202
/*
 * Defines a __global__ kernel named Subsample_Lanczos_<T> that forwards all
 * of its arguments to Subsample_Bicubic<T>, passing lanczos_coeffs as the
 * coefficient generator.
 */
#define LANCZOS_KERNEL(T) \
    __global__ void Subsample_Lanczos_ ## T(cudaTextureObject_t src_tex, \
                                            T *dst, \
                                            int dst_width, int dst_height, int dst_pitch, \
                                            int src_width, int src_height, \
                                            int bit_depth) \
    { \
        Subsample_Bicubic<T>(&lanczos_coeffs, src_tex, dst, \
                             dst_width, dst_height, dst_pitch, \
                             src_width, src_height, \
                             bit_depth); \
    }

/* uchar-based element types */
LANCZOS_KERNEL(uchar)
LANCZOS_KERNEL(uchar2)
LANCZOS_KERNEL(uchar4)

/* ushort-based element types */
LANCZOS_KERNEL(ushort)
LANCZOS_KERNEL(ushort2)
LANCZOS_KERNEL(ushort4)
222
+
174
223
}
0 commit comments