-
-
Notifications
You must be signed in to change notification settings - Fork 71
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
added cpu optimization detection and avx optimization
- Loading branch information
Showing
19 changed files
with
286 additions
and
58 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,72 @@ | ||
#include "avx.hpp" | ||
|
||
#include <immintrin.h> | ||
|
||
/**
 * Element-wise sum of two float vectors using 256-bit AVX:
 * result[i] = op1[i] + op2[i] for every i in [0, length).
 *
 * @param op1    first input vector, at least `length` floats
 * @param op2    second input vector, at least `length` floats
 * @param result output vector, at least `length` floats; may be the very
 *               same array as op1 or op2 (each block is loaded before it
 *               is stored)
 * @param length number of elements; zero or negative values do nothing
 *
 * Only complete groups of 8 floats go through the 256-bit path. The
 * remaining 0..7 elements are handled one at a time, so nothing outside
 * [0, length) is ever read or written. Unaligned load/store intrinsics are
 * used, so the pointers need no particular alignment.
 */
#if defined(__GNUC__) || defined(__clang__)
/* Enables AVX code generation for this function only, so it builds even
 * when AVX is not enabled for the whole translation unit. */
__attribute__((target("avx")))
#endif
void VSUM32FLOAT_avx(const float* op1, const float* op2, float* result, int32_t length)
{
    int32_t done = 0;

    /* Compare against `length - done` instead of `done + 8` to avoid
     * signed overflow when length is close to INT32_MAX. */
    while ( length - done >= 8 )
    {
        const __m256 lhs = _mm256_loadu_ps(op1 + done);
        const __m256 rhs = _mm256_loadu_ps(op2 + done);
        _mm256_storeu_ps(result + done, _mm256_add_ps(lhs, rhs));
        done += 8;
    }

    /* Scalar tail: fewer than 8 elements left. */
    for ( ; done < length ; done++ )
    {
        result[done] = op1[done] + op2[done];
    }
}
|
||
/**
 * Element-wise product of two float vectors using 256-bit AVX:
 * result[i] = op1[i] * op2[i] for every i in [0, length).
 *
 * @param op1    first input vector, at least `length` floats
 * @param op2    second input vector, at least `length` floats
 * @param result output vector, at least `length` floats; may be the very
 *               same array as op1 or op2 (each block is loaded before it
 *               is stored)
 * @param length number of elements; zero or negative values do nothing
 *
 * Only complete groups of 8 floats go through the 256-bit path. The
 * remaining 0..7 elements are handled one at a time, so nothing outside
 * [0, length) is ever read or written. Unaligned load/store intrinsics are
 * used, so the pointers need no particular alignment.
 */
#if defined(__GNUC__) || defined(__clang__)
/* Enables AVX code generation for this function only, so it builds even
 * when AVX is not enabled for the whole translation unit. */
__attribute__((target("avx")))
#endif
void VMUL32FLOAT_avx(const float* op1, const float* op2, float* result, int32_t length)
{
    int32_t done = 0;

    /* Compare against `length - done` instead of `done + 8` to avoid
     * signed overflow when length is close to INT32_MAX. */
    while ( length - done >= 8 )
    {
        const __m256 lhs = _mm256_loadu_ps(op1 + done);
        const __m256 rhs = _mm256_loadu_ps(op2 + done);
        _mm256_storeu_ps(result + done, _mm256_mul_ps(lhs, rhs));
        done += 8;
    }

    /* Scalar tail: fewer than 8 elements left. */
    for ( ; done < length ; done++ )
    {
        result[done] = op1[done] * op2[done];
    }
}
|
||
/**
 * Scales a float vector by a constant using 256-bit AVX:
 * result[i] = value * vec[i] for every i in [0, length).
 *
 * @param value  scalar factor, broadcast to all 8 lanes
 * @param vec    input vector, at least `length` floats
 * @param result output vector, at least `length` floats; may be the very
 *               same array as vec (each block is loaded before it is stored)
 * @param length number of elements; zero or negative values do nothing
 *
 * Only complete groups of 8 floats go through the 256-bit path. The
 * remaining 0..7 elements are handled one at a time, so nothing outside
 * [0, length) is ever read or written. Unaligned load/store intrinsics are
 * used, so the pointers need no particular alignment.
 */
#if defined(__GNUC__) || defined(__clang__)
/* Enables AVX code generation for this function only, so it builds even
 * when AVX is not enabled for the whole translation unit. */
__attribute__((target("avx")))
#endif
void VMUL32FLOAT_V_avx(const float value, const float* vec, float* result, int32_t length)
{
    const __m256 factor = _mm256_set1_ps(value);
    int32_t done = 0;

    /* Compare against `length - done` instead of `done + 8` to avoid
     * signed overflow when length is close to INT32_MAX. */
    while ( length - done >= 8 )
    {
        const __m256 lanes = _mm256_loadu_ps(vec + done);
        _mm256_storeu_ps(result + done, _mm256_mul_ps(factor, lanes));
        done += 8;
    }

    /* Scalar tail: fewer than 8 elements left. */
    for ( ; done < length ; done++ )
    {
        result[done] = value * vec[done];
    }
}
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,13 @@ | ||
|
||
#ifndef OPTIMIZATION_AVX_H_INCLUDED | ||
#define OPTIMIZATION_AVX_H_INCLUDED | ||
|
||
#include <stdint.h> | ||
|
||
/* AVX implementation of the element-wise sum: adds op1 and op2 lane by lane
 * and writes the sums into result; length is the element count. */
void VSUM32FLOAT_avx(const float* op1, const float* op2, float* result, int32_t length); | ||
|
||
/* AVX implementation of the element-wise product: multiplies op1 and op2
 * lane by lane and writes the products into result; length is the element
 * count. */
void VMUL32FLOAT_avx(const float* op1, const float* op2, float* result, int32_t length); | ||
|
||
/* AVX implementation of vector scaling: multiplies every element of vec by
 * value and writes the products into result; length is the element count. */
void VMUL32FLOAT_V_avx(const float value, const float* vec, float* result, int32_t length); | ||
|
||
#endif |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,26 @@ | ||
#include "default.hpp" | ||
|
||
/**
 * Portable element-wise sum: result[k] = op1[k] + op2[k] for k in [0, length).
 *
 * @param op1    first input vector
 * @param op2    second input vector
 * @param result output vector
 * @param length number of elements; zero or negative values do nothing
 */
void VSUM32FLOAT_default(const float* op1, const float* op2, float* result, int32_t length)
{
    int32_t k = 0;

    while ( k < length )
    {
        const float lhs = op1[k];
        const float rhs = op2[k];

        result[k] = lhs + rhs;
        ++k;
    }
}
|
||
/**
 * Portable element-wise product: result[k] = op1[k] * op2[k] for k in [0, length).
 *
 * @param op1    first input vector
 * @param op2    second input vector
 * @param result output vector
 * @param length number of elements; zero or negative values do nothing
 */
void VMUL32FLOAT_default(const float* op1, const float* op2, float* result, int32_t length)
{
    int32_t k = 0;

    while ( k < length )
    {
        const float lhs = op1[k];
        const float rhs = op2[k];

        result[k] = lhs * rhs;
        ++k;
    }
}
|
||
/**
 * Portable vector scaling: result[k] = value * vec[k] for k in [0, length).
 *
 * @param value  scalar factor
 * @param vec    input vector
 * @param result output vector
 * @param length number of elements; zero or negative values do nothing
 */
void VMUL32FLOAT_V_default(const float value, const float* vec, float* result, int32_t length)
{
    int32_t k = 0;

    while ( k < length )
    {
        const float element = vec[k];

        result[k] = value * element;
        ++k;
    }
}
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,14 @@ | ||
|
||
#ifndef OPTIMIZATION_DEFAULT_H_INCLUDED | ||
#define OPTIMIZATION_DEFAULT_H_INCLUDED | ||
|
||
#include <stdint.h> | ||
|
||
/* Plain-loop element-wise sum: result[i] = op1[i] + op2[i] for i in [0, length). */
void VSUM32FLOAT_default(const float* op1, const float* op2, float* result, int32_t length); | ||
|
||
/* Plain-loop element-wise product: result[i] = op1[i] * op2[i] for i in [0, length). */
void VMUL32FLOAT_default(const float* op1, const float* op2, float* result, int32_t length); | ||
|
||
/* Plain-loop vector scaling: result[i] = value * vec[i] for i in [0, length). */
void VMUL32FLOAT_V_default(const float value, const float* vec, float* result, int32_t length); | ||
|
||
|
||
#endif |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,65 @@ | ||
|
||
#include "optimization.hpp" | ||
#include <stdint.h> | ||
|
||
#include "default.hpp" | ||
#include "avx.hpp" | ||
|
||
/* Signature shared by the two-input vector kernels (sum, product). */
typedef void (*VecBinaryOpFn)(const float* op1, const float* op2, float* result, int32_t length);

/* Signature of the scalar-times-vector kernel. */
typedef void (*VecScaleOpFn)(const float value, const float* vec, float* result, int32_t length);

/* Dispatch table: one function pointer per vector operation. */
typedef struct
{
    VecBinaryOpFn VSUM32FLOAT;
    VecBinaryOpFn VMUL32FLOAT;
    VecScaleOpFn  VMUL32FLOAT_V;
} _CpuOptimization;

/* Zero until the dispatch table below has been filled in. */
int8_t OptimizationInitialized = 0;

/* Implementations currently selected for this process. */
_CpuOptimization CpuOptimization;
|
||
void SetupOptimization() | ||
{ | ||
__builtin_cpu_init(); | ||
if ( __builtin_cpu_supports("avx") ) | ||
{ | ||
CpuOptimization.VSUM32FLOAT = VSUM32FLOAT_avx; | ||
CpuOptimization.VMUL32FLOAT = VMUL32FLOAT_avx; | ||
CpuOptimization.VMUL32FLOAT_V = VMUL32FLOAT_V_avx; | ||
} | ||
else | ||
{ | ||
CpuOptimization.VSUM32FLOAT = VSUM32FLOAT_default; | ||
CpuOptimization.VMUL32FLOAT = VMUL32FLOAT_default; | ||
CpuOptimization.VMUL32FLOAT_V = VMUL32FLOAT_V_default; | ||
} | ||
OptimizationInitialized = 1; | ||
} | ||
|
||
|
||
void VSUM32FLOAT(const float* op1, const float* op2, float* result, int32_t length) | ||
{ | ||
if (!OptimizationInitialized) | ||
{ | ||
SetupOptimization(); | ||
} | ||
|
||
CpuOptimization.VSUM32FLOAT(op1,op2,result,length); | ||
} | ||
|
||
void VMUL32FLOAT(const float* op1, const float* op2, float* result, int32_t length) | ||
{ | ||
if (!OptimizationInitialized) | ||
{ | ||
SetupOptimization(); | ||
} | ||
|
||
CpuOptimization.VMUL32FLOAT(op1,op2,result,length); | ||
} | ||
|
||
void VMUL32FLOAT_V(const float value, const float* vec, float* result, int32_t length) | ||
{ | ||
if (!OptimizationInitialized) | ||
{ | ||
SetupOptimization(); | ||
} | ||
|
||
CpuOptimization.VMUL32FLOAT_V(value,vec,result,length); | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,13 @@ | ||
|
||
#ifndef OPTIMIZATION_H_INCLUDED | ||
#define OPTIMIZATION_H_INCLUDED | ||
|
||
#include <stdint.h> | ||
|
||
/* Element-wise sum of op1 and op2 into result over length elements; forwards
 * to the implementation chosen at first use based on CPU feature detection. */
void VSUM32FLOAT(const float* op1, const float* op2, float* result, int32_t length); | ||
|
||
/* Element-wise product of op1 and op2 into result over length elements;
 * forwards to the implementation chosen at first use based on CPU feature
 * detection. */
void VMUL32FLOAT(const float* op1, const float* op2, float* result, int32_t length); | ||
|
||
/* Multiplies each of the length elements of vec by value into result;
 * forwards to the implementation chosen at first use based on CPU feature
 * detection. */
void VMUL32FLOAT_V(const float value, const float* vec, float* result, int32_t length); | ||
|
||
#endif |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.