Skip to content

Commit

Permalink
added cpu optimization detection and avx optimization
Browse files Browse the repository at this point in the history
  • Loading branch information
manuvi committed Nov 14, 2023
1 parent a74ffd3 commit 6e0b494
Show file tree
Hide file tree
Showing 19 changed files with 286 additions and 58 deletions.
1 change: 1 addition & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@ clean:
rm -f common/*.d common/*.o
rm -f common/freeverb/*.d common/freeverb/*.o
rm -f common/kiss_fft/*.d common/kiss_fft/*.o
rm -f common/optimization/*.d common/optimization/*.o
rm -f dpf/utils/lv2_ttl_generator.d

# --------------------------------------------------------------
Expand Down
72 changes: 72 additions & 0 deletions common/optimization/avx.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
#include "avx.hpp"

#include <immintrin.h>

/**
 * Element-wise sum using AVX: result[i] = op1[i] + op2[i] for 0 <= i < length.
 *
 * Full groups of 8 floats are processed with 256-bit unaligned loads/stores;
 * the remaining (length % 8) elements are handled by a scalar loop, so no
 * element at index >= length is ever read or written.
 *
 * `result` may be exactly the same pointer as `op1` or `op2` (in-place use):
 * each group is fully loaded before it is stored.
 * A length <= 0 is a no-op.
 *
 * The caller must ensure the CPU supports AVX before calling this function.
 */
#if defined(__GNUC__)
/* Enable AVX for this function only, so it does not rely on a global -mavx. */
__attribute__((target("avx")))
#endif
void VSUM32FLOAT_avx(const float* op1, const float* op2, float* result, int32_t length)
{
    int32_t i = 0;

    /* Vector part: `length - i >= 8` avoids overflow that `i + 8 <= length` could hit. */
    for ( ; length - i >= 8 ; i += 8 )
    {
        const __m256 mm_op1 = _mm256_loadu_ps(op1 + i);
        const __m256 mm_op2 = _mm256_loadu_ps(op2 + i);
        _mm256_storeu_ps(result + i, _mm256_add_ps(mm_op1, mm_op2));
    }

    /* Scalar tail: fewer than 8 elements left, touch exactly those. */
    for ( ; i < length ; i++ )
    {
        result[i] = op1[i] + op2[i];
    }
}

/**
 * Element-wise product using AVX: result[i] = op1[i] * op2[i] for 0 <= i < length.
 *
 * Full groups of 8 floats are processed with 256-bit unaligned loads/stores;
 * the remaining (length % 8) elements are handled by a scalar loop, so no
 * element at index >= length is ever read or written.
 *
 * `result` may be exactly the same pointer as `op1` or `op2` (in-place use):
 * each group is fully loaded before it is stored.
 * A length <= 0 is a no-op.
 *
 * The caller must ensure the CPU supports AVX before calling this function.
 */
#if defined(__GNUC__)
/* Enable AVX for this function only, so it does not rely on a global -mavx. */
__attribute__((target("avx")))
#endif
void VMUL32FLOAT_avx(const float* op1, const float* op2, float* result, int32_t length)
{
    int32_t i = 0;

    /* Vector part: `length - i >= 8` avoids overflow that `i + 8 <= length` could hit. */
    for ( ; length - i >= 8 ; i += 8 )
    {
        const __m256 mm_op1 = _mm256_loadu_ps(op1 + i);
        const __m256 mm_op2 = _mm256_loadu_ps(op2 + i);
        _mm256_storeu_ps(result + i, _mm256_mul_ps(mm_op1, mm_op2));
    }

    /* Scalar tail: fewer than 8 elements left, touch exactly those. */
    for ( ; i < length ; i++ )
    {
        result[i] = op1[i] * op2[i];
    }
}

/**
 * Scale a vector by a constant using AVX: result[i] = value * vec[i]
 * for 0 <= i < length.
 *
 * `value` is broadcast into all 8 lanes once; full groups of 8 floats are
 * processed with 256-bit unaligned loads/stores, and the remaining
 * (length % 8) elements are handled by a scalar loop, so no element at
 * index >= length is ever read or written.
 *
 * `result` may be exactly the same pointer as `vec` (in-place scaling).
 * A length <= 0 is a no-op.
 *
 * The caller must ensure the CPU supports AVX before calling this function.
 */
#if defined(__GNUC__)
/* Enable AVX for this function only, so it does not rely on a global -mavx. */
__attribute__((target("avx")))
#endif
void VMUL32FLOAT_V_avx(const float value, const float* vec, float* result, int32_t length)
{
    const __m256 mm_value = _mm256_set1_ps(value);
    int32_t i = 0;

    /* Vector part: `length - i >= 8` avoids overflow that `i + 8 <= length` could hit. */
    for ( ; length - i >= 8 ; i += 8 )
    {
        const __m256 mm_vec = _mm256_loadu_ps(vec + i);
        _mm256_storeu_ps(result + i, _mm256_mul_ps(mm_value, mm_vec));
    }

    /* Scalar tail: fewer than 8 elements left, touch exactly those. */
    for ( ; i < length ; i++ )
    {
        result[i] = value * vec[i];
    }
}

13 changes: 13 additions & 0 deletions common/optimization/avx.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@

#ifndef OPTIMIZATION_AVX_H_INCLUDED
#define OPTIMIZATION_AVX_H_INCLUDED

#include <stdint.h>

/**
 * AVX counterpart of VSUM32FLOAT_default: adds op1 and op2 element by element
 * into result, working on 256-bit (8-float) vectors.
 * Uses AVX instructions, so it must only be called on a CPU that supports AVX.
 */
void VSUM32FLOAT_avx(const float* op1, const float* op2, float* result, int32_t length);

/**
 * AVX counterpart of VMUL32FLOAT_default: multiplies op1 and op2 element by
 * element into result, working on 256-bit (8-float) vectors.
 * Uses AVX instructions, so it must only be called on a CPU that supports AVX.
 */
void VMUL32FLOAT_avx(const float* op1, const float* op2, float* result, int32_t length);

/**
 * AVX counterpart of VMUL32FLOAT_V_default: multiplies every element of vec
 * by the scalar value and writes the products to result.
 * Uses AVX instructions, so it must only be called on a CPU that supports AVX.
 */
void VMUL32FLOAT_V_avx(const float value, const float* vec, float* result, int32_t length);

#endif
26 changes: 26 additions & 0 deletions common/optimization/default.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
#include "default.hpp"

/**
 * Portable element-wise sum: result[k] = op1[k] + op2[k] for 0 <= k < length.
 * Elements are processed in ascending index order; a length <= 0 does nothing.
 */
void VSUM32FLOAT_default(const float* op1, const float* op2, float* result, int32_t length)
{
    int32_t k = 0;

    while ( k < length )
    {
        const float sum = op1[k] + op2[k];
        result[k] = sum;
        ++k;
    }
}

/**
 * Portable element-wise product: result[k] = op1[k] * op2[k] for 0 <= k < length.
 * Elements are processed in ascending index order; a length <= 0 does nothing.
 */
void VMUL32FLOAT_default(const float* op1, const float* op2, float* result, int32_t length)
{
    int32_t k = 0;

    while ( k < length )
    {
        const float product = op1[k] * op2[k];
        result[k] = product;
        ++k;
    }
}

/**
 * Portable scalar-times-vector: result[k] = value * vec[k] for 0 <= k < length.
 * Elements are processed in ascending index order; a length <= 0 does nothing.
 */
void VMUL32FLOAT_V_default(const float value, const float* vec, float* result, int32_t length)
{
    int32_t k = 0;

    while ( k < length )
    {
        const float scaled = value * vec[k];
        result[k] = scaled;
        ++k;
    }
}

14 changes: 14 additions & 0 deletions common/optimization/default.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@

#ifndef OPTIMIZATION_DEFAULT_H_INCLUDED
#define OPTIMIZATION_DEFAULT_H_INCLUDED

#include <stdint.h>

/**
 * Plain scalar-loop implementation of the element-wise sum:
 * result[i] = op1[i] + op2[i] for 0 <= i < length.
 */
void VSUM32FLOAT_default(const float* op1, const float* op2, float* result, int32_t length);

/**
 * Plain scalar-loop implementation of the element-wise product:
 * result[i] = op1[i] * op2[i] for 0 <= i < length.
 */
void VMUL32FLOAT_default(const float* op1, const float* op2, float* result, int32_t length);

/**
 * Plain scalar-loop implementation of scalar-times-vector:
 * result[i] = value * vec[i] for 0 <= i < length.
 */
void VMUL32FLOAT_V_default(const float value, const float* vec, float* result, int32_t length);


#endif
65 changes: 65 additions & 0 deletions common/optimization/optimization.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@

#include "optimization.hpp"
#include <stdint.h>

#include "default.hpp"
#include "avx.hpp"

// Signature shared by the vector/vector kernels (element-wise sum and product).
typedef void (*VecVecKernel)(const float* op1, const float* op2, float* result, int32_t length);

// Signature of the scalar-times-vector kernel.
typedef void (*ScalarVecKernel)(const float value, const float* vec, float* result, int32_t length);

// Table of kernel entry points; one pointer per public operation, each set
// to the implementation chosen for the running CPU.
typedef struct
{
    VecVecKernel    VSUM32FLOAT;
    VecVecKernel    VMUL32FLOAT;
    ScalarVecKernel VMUL32FLOAT_V;
} _CpuOptimization;

// Set to 1 by SetupOptimization() once CpuOptimization has been filled in;
// the public entry points check it to trigger the one-time setup.
// NOTE(review): this is a plain (non-atomic) flag — if the first calls can come
// from several threads at once the lazy setup would race; confirm with callers.
int8_t OptimizationInitialized = 0;
// Active kernel table; populated by SetupOptimization().
_CpuOptimization CpuOptimization;

// Detects CPU features and fills CpuOptimization with either the AVX kernels
// (when the CPU reports AVX support) or the portable default kernels, then
// marks the table as initialized.
void SetupOptimization()
{
    __builtin_cpu_init();

    const int has_avx = __builtin_cpu_supports("avx");

    CpuOptimization.VSUM32FLOAT   = has_avx ? VSUM32FLOAT_avx   : VSUM32FLOAT_default;
    CpuOptimization.VMUL32FLOAT   = has_avx ? VMUL32FLOAT_avx   : VMUL32FLOAT_default;
    CpuOptimization.VMUL32FLOAT_V = has_avx ? VMUL32FLOAT_V_avx : VMUL32FLOAT_V_default;

    OptimizationInitialized = 1;
}


// Public element-wise sum: forwards to the kernel stored in
// CpuOptimization.VSUM32FLOAT, running SetupOptimization() first if the
// table has not been initialized yet.
void VSUM32FLOAT(const float* op1, const float* op2, float* result, int32_t length)
{
    // Lazy one-time selection of the implementation.
    if ( OptimizationInitialized == 0 )
    {
        SetupOptimization();
    }

    (*CpuOptimization.VSUM32FLOAT)(op1, op2, result, length);
}

// Public element-wise product: forwards to the kernel stored in
// CpuOptimization.VMUL32FLOAT, running SetupOptimization() first if the
// table has not been initialized yet.
void VMUL32FLOAT(const float* op1, const float* op2, float* result, int32_t length)
{
    // Lazy one-time selection of the implementation.
    if ( OptimizationInitialized == 0 )
    {
        SetupOptimization();
    }

    (*CpuOptimization.VMUL32FLOAT)(op1, op2, result, length);
}

// Public scalar-times-vector: forwards to the kernel stored in
// CpuOptimization.VMUL32FLOAT_V, running SetupOptimization() first if the
// table has not been initialized yet.
void VMUL32FLOAT_V(const float value, const float* vec, float* result, int32_t length)
{
    // Lazy one-time selection of the implementation.
    if ( OptimizationInitialized == 0 )
    {
        SetupOptimization();
    }

    (*CpuOptimization.VMUL32FLOAT_V)(value, vec, result, length);
}
13 changes: 13 additions & 0 deletions common/optimization/optimization.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@

#ifndef OPTIMIZATION_H_INCLUDED
#define OPTIMIZATION_H_INCLUDED

#include <stdint.h>

/**
 * Element-wise sum of op1 and op2 into result over `length` floats.
 * Dispatches to the AVX or the default implementation; the choice is made
 * on the first call to any of these entry points via runtime CPU detection.
 */
void VSUM32FLOAT(const float* op1, const float* op2, float* result, int32_t length);

/**
 * Element-wise product of op1 and op2 into result over `length` floats.
 * Dispatches to the AVX or the default implementation; the choice is made
 * on the first call to any of these entry points via runtime CPU detection.
 */
void VMUL32FLOAT(const float* op1, const float* op2, float* result, int32_t length);

/**
 * Multiplies each of the `length` floats in vec by the scalar value and
 * writes the products to result.
 * Dispatches to the AVX or the default implementation; the choice is made
 * on the first call to any of these entry points via runtime CPU detection.
 */
void VMUL32FLOAT_V(const float value, const float* vec, float* result, int32_t length);

#endif
16 changes: 7 additions & 9 deletions plugins/dragonfly-early-reflections/DSP.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
#include "DistrhoPlugin.hpp"
#include "DistrhoPluginInfo.h"
#include "extra/ScopedDenormalDisable.hpp"
#include "optimization/optimization.hpp"

#include "DSP.hpp"

Expand Down Expand Up @@ -91,16 +92,13 @@ void DragonflyReverbDSP::run(const float** inputs, float** outputs, uint32_t fra
buffer_frames
);

for (uint32_t i = 0; i < buffer_frames; i++) {
outputs[0][offset + i] =
dryLevel * inputs[0][offset + i] +
wetLevel * output_buffer[0][i];

outputs[1][offset + i] =
dryLevel * inputs[1][offset + i] +
wetLevel * output_buffer[1][i];
}
VMUL32FLOAT_V( dryLevel, &inputs[0][offset], dry_buffer, buffer_frames );
VMUL32FLOAT_V( wetLevel, output_buffer[0], wet_buffer, buffer_frames );
VSUM32FLOAT(dry_buffer, wet_buffer, &outputs[0][offset], buffer_frames );

VMUL32FLOAT_V( dryLevel, &inputs[1][offset], dry_buffer, buffer_frames );
VMUL32FLOAT_V( wetLevel, output_buffer[1], wet_buffer, buffer_frames );
VSUM32FLOAT(dry_buffer, wet_buffer, &outputs[1][offset], buffer_frames );
}
}

Expand Down
3 changes: 3 additions & 0 deletions plugins/dragonfly-early-reflections/DSP.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,9 @@ class DragonflyReverbDSP : public AbstractDSP {
float input_buffer[2][BUFFER_SIZE];
float output_buffer[2][BUFFER_SIZE];

float dry_buffer[BUFFER_SIZE];
float wet_buffer[BUFFER_SIZE];

void setInputLPF(float freq);
void setInputHPF(float freq);
};
Expand Down
7 changes: 5 additions & 2 deletions plugins/dragonfly-early-reflections/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,10 @@ NAME = DragonflyEarlyReflections

FILES_COMMON = DSP.cpp \
../../common/kiss_fft/kiss_fft.c \
../../common/kiss_fft/kiss_fftr.c
../../common/kiss_fft/kiss_fftr.c \
../../common/optimization/optimization.cpp \
../../common/optimization/default.cpp \
../../common/optimization/avx.cpp

ifneq ($(SYSTEM_FREEVERB3),true)
FILES_COMMON += \
Expand Down Expand Up @@ -58,7 +61,7 @@ include ../../dpf/Makefile.plugins.mk
# --------------------------------------------------------------
# Build dependencies

BUILD_CXX_FLAGS += -I../../common -DLIBFV3_FLOAT
BUILD_CXX_FLAGS += -I../../common -DLIBFV3_FLOAT -mavx
ifeq ($(SYSTEM_FREEVERB3),true)
BUILD_CXX_FLAGS += -DLIBSRATE1
BUILD_CXX_FLAGS += $(shell $(PKG_CONFIG) --cflags freeverb3-3)
Expand Down
34 changes: 18 additions & 16 deletions plugins/dragonfly-hall-reverb/DSP.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
#include "DistrhoPlugin.hpp"
#include "DistrhoPluginInfo.h"
#include "extra/ScopedDenormalDisable.hpp"
#include "optimization/optimization.hpp"

#include "DSP.hpp"

Expand Down Expand Up @@ -116,10 +117,11 @@ void DragonflyReverbDSP::run(const float** inputs, float** outputs, uint32_t fra
early_out_buffer[1],
buffer_frames);

for (uint32_t i = 0; i < buffer_frames; i++) {
late_in_buffer[0][i] = early_send * early_out_buffer[0][i] + inputs[0][offset + i];
late_in_buffer[1][i] = early_send * early_out_buffer[1][i] + inputs[1][offset + i];
}
VMUL32FLOAT_V(early_send, early_out_buffer[0], early_buffer, buffer_frames);
VSUM32FLOAT(early_buffer, &inputs[0][offset], late_in_buffer[0], buffer_frames );

VMUL32FLOAT_V(early_send, early_out_buffer[1], early_buffer, buffer_frames);
VSUM32FLOAT(early_buffer, &inputs[1][offset], late_in_buffer[1], buffer_frames );

late.processreplace(
const_cast<float *>(late_in_buffer[0]),
Expand All @@ -128,23 +130,23 @@ void DragonflyReverbDSP::run(const float** inputs, float** outputs, uint32_t fra
late_out_buffer[1],
buffer_frames);

for (uint32_t i = 0; i < buffer_frames; i++) {
outputs[0][offset + i] = dryLevel * inputs[0][offset + i];
outputs[1][offset + i] = dryLevel * inputs[1][offset + i];
}
VMUL32FLOAT_V(dryLevel, &inputs[0][offset], &outputs[0][offset], buffer_frames);
VMUL32FLOAT_V(dryLevel, &inputs[1][offset], &outputs[1][offset], buffer_frames);

if( earlyLevel > 0.0 ){
for (uint32_t i = 0; i < buffer_frames; i++) {
outputs[0][offset + i] += earlyLevel * early_out_buffer[0][i];
outputs[1][offset + i] += earlyLevel * early_out_buffer[1][i];
}
VMUL32FLOAT_V(earlyLevel, early_out_buffer[0], early_buffer, buffer_frames);
VSUM32FLOAT(&outputs[0][offset], early_buffer, &outputs[0][offset], buffer_frames);

VMUL32FLOAT_V(earlyLevel, early_out_buffer[1], early_buffer, buffer_frames);
VSUM32FLOAT(&outputs[1][offset], early_buffer, &outputs[1][offset], buffer_frames);
}

if( lateLevel > 0.0 ){
for (uint32_t i = 0; i < buffer_frames; i++) {
outputs[0][offset + i] += lateLevel * late_out_buffer[0][i];
outputs[1][offset + i] += lateLevel * late_out_buffer[1][i];
}
VMUL32FLOAT_V(lateLevel, late_out_buffer[0], late_buffer, buffer_frames);
VSUM32FLOAT(&outputs[0][offset], late_buffer, &outputs[0][offset], buffer_frames);

VMUL32FLOAT_V(lateLevel, late_out_buffer[1], late_buffer, buffer_frames);
VSUM32FLOAT(&outputs[1][offset], late_buffer, &outputs[1][offset], buffer_frames);
}
}
}
Expand Down
3 changes: 3 additions & 0 deletions plugins/dragonfly-hall-reverb/DSP.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,9 @@ class DragonflyReverbDSP : public AbstractDSP {
float early_out_buffer[2][BUFFER_SIZE];
float late_in_buffer[2][BUFFER_SIZE];
float late_out_buffer[2][BUFFER_SIZE];

float early_buffer[BUFFER_SIZE];
float late_buffer[BUFFER_SIZE];
};

#endif
7 changes: 5 additions & 2 deletions plugins/dragonfly-hall-reverb/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,10 @@ NAME = DragonflyHallReverb

FILES_COMMON = DSP.cpp \
../../common/kiss_fft/kiss_fft.c \
../../common/kiss_fft/kiss_fftr.c
../../common/kiss_fft/kiss_fftr.c \
../../common/optimization/optimization.cpp \
../../common/optimization/default.cpp \
../../common/optimization/avx.cpp

ifneq ($(SYSTEM_FREEVERB3),true)
FILES_COMMON += \
Expand Down Expand Up @@ -59,7 +62,7 @@ include ../../dpf/Makefile.plugins.mk
# --------------------------------------------------------------
# Build dependencies

BUILD_CXX_FLAGS += -I../../common -DLIBFV3_FLOAT
BUILD_CXX_FLAGS += -I../../common -DLIBFV3_FLOAT -mavx
ifeq ($(SYSTEM_FREEVERB3),true)
BUILD_CXX_FLAGS += -DLIBSRATE1
BUILD_CXX_FLAGS += $(shell $(PKG_CONFIG) --cflags freeverb3-3)
Expand Down
Loading

0 comments on commit 6e0b494

Please sign in to comment.