Skip to content

Commit cfdddec

Browse files
committed
avfilter/scale_cuda: add lanczos algorithm
1 parent 98d3f23 commit cfdddec

File tree

4 files changed

+77
-17
lines changed

4 files changed

+77
-17
lines changed

compat/cuda/cuda_runtime.h

+3
Original file line numberDiff line numberDiff line change
@@ -182,4 +182,7 @@ static inline __device__ float fabsf(float a) { return __builtin_fabsf(a); }
182182
static inline __device__ float fabs(float a) { return __builtin_fabsf(a); }
183183
static inline __device__ double fabs(double a) { return __builtin_fabs(a); }
184184

185+
static inline __device__ float __sinf(float a) { return __nvvm_sin_approx_f(a); }
186+
static inline __device__ float __cosf(float a) { return __nvvm_cos_approx_f(a); }
187+
185188
#endif /* COMPAT_CUDA_CUDA_RUNTIME_H */

libavfilter/version.h

+1-1
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@
3131

3232
#define LIBAVFILTER_VERSION_MAJOR 7
3333
#define LIBAVFILTER_VERSION_MINOR 88
34-
#define LIBAVFILTER_VERSION_MICRO 101
34+
#define LIBAVFILTER_VERSION_MICRO 102
3535

3636

3737
#define LIBAVFILTER_VERSION_INT AV_VERSION_INT(LIBAVFILTER_VERSION_MAJOR, \

libavfilter/vf_scale_cuda.c

+8
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,7 @@ enum {
5959
INTERP_ALGO_NEAREST,
6060
INTERP_ALGO_BILINEAR,
6161
INTERP_ALGO_BICUBIC,
62+
INTERP_ALGO_LANCZOS,
6263

6364
INTERP_ALGO_COUNT
6465
};
@@ -293,6 +294,12 @@ static av_cold int cudascale_config_props(AVFilterLink *outlink)
293294
s->interp_use_linear = 0;
294295
s->interp_as_integer = 0;
295296
break;
297+
case INTERP_ALGO_LANCZOS:
298+
scaler_ptx = vf_scale_cuda_bicubic_ptx;
299+
function_infix = "_Lanczos";
300+
s->interp_use_linear = 0;
301+
s->interp_as_integer = 0;
302+
break;
296303
default:
297304
av_log(ctx, AV_LOG_ERROR, "Unknown interpolation algorithm\n");
298305
return AVERROR_BUG;
@@ -601,6 +608,7 @@ static const AVOption options[] = {
601608
{ "nearest", "nearest neighbour", 0, AV_OPT_TYPE_CONST, { .i64 = INTERP_ALGO_NEAREST }, 0, 0, FLAGS, "interp_algo" },
602609
{ "bilinear", "bilinear", 0, AV_OPT_TYPE_CONST, { .i64 = INTERP_ALGO_BILINEAR }, 0, 0, FLAGS, "interp_algo" },
603610
{ "bicubic", "bicubic", 0, AV_OPT_TYPE_CONST, { .i64 = INTERP_ALGO_BICUBIC }, 0, 0, FLAGS, "interp_algo" },
611+
{ "lanczos", "lanczos", 0, AV_OPT_TYPE_CONST, { .i64 = INTERP_ALGO_LANCZOS }, 0, 0, FLAGS, "interp_algo" },
604612
{ "passthrough", "Do not process frames at all if parameters match", OFFSET(passthrough), AV_OPT_TYPE_BOOL, { .i64 = 1 }, 0, 1, FLAGS },
605613
{ "force_original_aspect_ratio", "decrease or increase w/h if necessary to keep the original AR", OFFSET(force_original_aspect_ratio), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, 2, FLAGS, "force_oar" },
606614
{ "disable", NULL, 0, AV_OPT_TYPE_CONST, {.i64 = 0 }, 0, 0, FLAGS, "force_oar" },

libavfilter/vf_scale_cuda_bicubic.cu

+65-16
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,30 @@
2222

2323
#include "cuda/vector_helpers.cuh"
2424

25+
typedef float4 (*coeffs_function_t)(float);
26+
27+
__device__ inline float4 lanczos_coeffs(float x)
28+
{
29+
const float pi = 3.141592654f;
30+
31+
float4 res = make_float4(
32+
pi * (x + 1),
33+
pi * x,
34+
pi * (x - 1),
35+
pi * (x - 2));
36+
37+
res.x = res.x == 0.0f ? 1.0f :
38+
__sinf(res.x) * __sinf(res.x / 2.0f) / (res.x * res.x / 2.0f);
39+
res.y = res.y == 0.0f ? 1.0f :
40+
__sinf(res.y) * __sinf(res.y / 2.0f) / (res.y * res.y / 2.0f);
41+
res.z = res.z == 0.0f ? 1.0f :
42+
__sinf(res.z) * __sinf(res.z / 2.0f) / (res.z * res.z / 2.0f);
43+
res.w = res.w == 0.0f ? 1.0f :
44+
__sinf(res.w) * __sinf(res.w / 2.0f) / (res.w * res.w / 2.0f);
45+
46+
return res / (res.x + res.y + res.z + res.w);
47+
}
48+
2549
__device__ inline float4 bicubic_coeffs(float x)
2650
{
2751
const float A = -0.75f;
@@ -35,10 +59,8 @@ __device__ inline float4 bicubic_coeffs(float x)
3559
return res;
3660
}
3761

38-
__device__ inline void bicubic_fast_coeffs(float x, float *h0, float *h1, float *s)
62+
__device__ inline void derived_fast_coeffs(float4 coeffs, float x, float *h0, float *h1, float *s)
3963
{
40-
float4 coeffs = bicubic_coeffs(x);
41-
4264
float g0 = coeffs.x + coeffs.y;
4365
float g1 = coeffs.z + coeffs.w;
4466

@@ -48,7 +70,7 @@ __device__ inline void bicubic_fast_coeffs(float x, float *h0, float *h1, float
4870
}
4971

5072
template<typename V>
51-
__device__ inline V bicubic_filter(float4 coeffs, V c0, V c1, V c2, V c3)
73+
__device__ inline V apply_coeffs(float4 coeffs, V c0, V c1, V c2, V c3)
5274
{
5375
V res = c0 * coeffs.x;
5476
res += c1 * coeffs.y;
@@ -59,7 +81,8 @@ __device__ inline V bicubic_filter(float4 coeffs, V c0, V c1, V c2, V c3)
5981
}
6082

6183
template<typename T>
62-
__device__ inline void Subsample_Bicubic(cudaTextureObject_t src_tex,
84+
__device__ inline void Subsample_Bicubic(coeffs_function_t coeffs_function,
85+
cudaTextureObject_t src_tex,
6386
T *dst,
6487
int dst_width, int dst_height, int dst_pitch,
6588
int src_width, int src_height,
@@ -81,17 +104,17 @@ __device__ inline void Subsample_Bicubic(cudaTextureObject_t src_tex,
81104

82105
float factor = bit_depth > 8 ? 0xFFFF : 0xFF;
83106

84-
float4 coeffsX = bicubic_coeffs(fx);
85-
float4 coeffsY = bicubic_coeffs(fy);
107+
float4 coeffsX = coeffs_function(fx);
108+
float4 coeffsY = coeffs_function(fy);
86109

87110
#define PIX(x, y) tex2D<floatT>(src_tex, (x), (y))
88111

89112
dst[yo * dst_pitch + xo] = from_floatN<T, floatT>(
90-
bicubic_filter<floatT>(coeffsY,
91-
bicubic_filter<floatT>(coeffsX, PIX(px - 1, py - 1), PIX(px, py - 1), PIX(px + 1, py - 1), PIX(px + 2, py - 1)),
92-
bicubic_filter<floatT>(coeffsX, PIX(px - 1, py ), PIX(px, py ), PIX(px + 1, py ), PIX(px + 2, py )),
93-
bicubic_filter<floatT>(coeffsX, PIX(px - 1, py + 1), PIX(px, py + 1), PIX(px + 1, py + 1), PIX(px + 2, py + 1)),
94-
bicubic_filter<floatT>(coeffsX, PIX(px - 1, py + 2), PIX(px, py + 2), PIX(px + 1, py + 2), PIX(px + 2, py + 2))
113+
apply_coeffs<floatT>(coeffsY,
114+
apply_coeffs<floatT>(coeffsX, PIX(px - 1, py - 1), PIX(px, py - 1), PIX(px + 1, py - 1), PIX(px + 2, py - 1)),
115+
apply_coeffs<floatT>(coeffsX, PIX(px - 1, py ), PIX(px, py ), PIX(px + 1, py ), PIX(px + 2, py )),
116+
apply_coeffs<floatT>(coeffsX, PIX(px - 1, py + 1), PIX(px, py + 1), PIX(px + 1, py + 1), PIX(px + 2, py + 1)),
117+
apply_coeffs<floatT>(coeffsX, PIX(px - 1, py + 2), PIX(px, py + 2), PIX(px + 1, py + 2), PIX(px + 2, py + 2))
95118
) * factor
96119
);
97120

@@ -101,7 +124,8 @@ __device__ inline void Subsample_Bicubic(cudaTextureObject_t src_tex,
101124

102125
/* This does not yield correct results. Most likely because of low internal precision in tex2D linear interpolation */
103126
template<typename T>
104-
__device__ inline void Subsample_FastBicubic(cudaTextureObject_t src_tex,
127+
__device__ inline void Subsample_FastBicubic(coeffs_function_t coeffs_function,
128+
cudaTextureObject_t src_tex,
105129
T *dst,
106130
int dst_width, int dst_height, int dst_pitch,
107131
int src_width, int src_height,
@@ -123,10 +147,13 @@ __device__ inline void Subsample_FastBicubic(cudaTextureObject_t src_tex,
123147

124148
float factor = bit_depth > 8 ? 0xFFFF : 0xFF;
125149

150+
float4 coeffsX = coeffs_function(fx);
151+
float4 coeffsY = coeffs_function(fy);
152+
126153
float h0x, h1x, sx;
127154
float h0y, h1y, sy;
128-
bicubic_fast_coeffs(fx, &h0x, &h1x, &sx);
129-
bicubic_fast_coeffs(fy, &h0y, &h1y, &sy);
155+
derived_fast_coeffs(coeffsX, fx, &h0x, &h1x, &sx);
156+
derived_fast_coeffs(coeffsY, fy, &h0y, &h1y, &sy);
130157

131158
#define PIX(x, y) tex2D<floatT>(src_tex, (x), (y))
132159

@@ -157,7 +184,7 @@ extern "C" {
157184
int src_width, int src_height, \
158185
int bit_depth) \
159186
{ \
160-
Subsample_Bicubic<T>(src_tex, dst, \
187+
Subsample_Bicubic<T>(&bicubic_coeffs, src_tex, dst, \
161188
dst_width, dst_height, dst_pitch, \
162189
src_width, src_height, \
163190
bit_depth); \
@@ -171,4 +198,26 @@ BICUBIC_KERNEL(ushort)
171198
BICUBIC_KERNEL(ushort2)
172199
BICUBIC_KERNEL(ushort4)
173200

201+
202+
#define LANCZOS_KERNEL(T) \
203+
__global__ void Subsample_Lanczos_ ## T(cudaTextureObject_t src_tex, \
204+
T *dst, \
205+
int dst_width, int dst_height, int dst_pitch, \
206+
int src_width, int src_height, \
207+
int bit_depth) \
208+
{ \
209+
Subsample_Bicubic<T>(&lanczos_coeffs, src_tex, dst, \
210+
dst_width, dst_height, dst_pitch, \
211+
src_width, src_height, \
212+
bit_depth); \
213+
}
214+
215+
LANCZOS_KERNEL(uchar)
216+
LANCZOS_KERNEL(uchar2)
217+
LANCZOS_KERNEL(uchar4)
218+
219+
LANCZOS_KERNEL(ushort)
220+
LANCZOS_KERNEL(ushort2)
221+
LANCZOS_KERNEL(ushort4)
222+
174223
}

0 commit comments

Comments
 (0)