Skip to content
This repository was archived by the owner on Dec 16, 2021. It is now read-only.

Commit b5ae6b2

Browse files
committed
Split out SSE2 and SSSE3 versions
1 parent 4c5a3fb commit b5ae6b2

File tree

7 files changed

+744
-680
lines changed

7 files changed

+744
-680
lines changed

Makefile

+32-16
Original file line numberDiff line numberDiff line change
@@ -1,41 +1,57 @@
11

2-
CC=g++
3-
WARNINGS= -Wall -Wextra
4-
OPTFLAGS=-O2 -march=native
2+
CXX=g++
3+
WARNINGS=-Wall -Wextra
4+
OPTFLAGS=-std=c++11 -O2 -fPIC
55
DEBUGFLAGS=-g
66

7-
CFLAGS=$(OPTFLAGS) $(DEBUGFLAGS) $(WARNINGS)
7+
CXXFLAGS=$(OPTFLAGS) $(DEBUGFLAGS) $(WARNINGS)
88

99
PROGS = benchmark zfec test_recovery gen_test_vec
1010

11-
all: fecpp.so $(PROGS)
11+
all: fecpp.so pyfecpp.so $(PROGS)
1212

13-
libfecpp.a: fecpp.o
14-
ar crs $@ $<
13+
PYTHON_PKGCONFIG=python2
14+
15+
OBJ=fecpp.o cpuid.o fecpp_sse2.o fecpp_ssse3.o
16+
17+
libfecpp.a: $(OBJ)
18+
ar crs $@ $(OBJ)
1519

1620
fecpp.o: fecpp.cpp fecpp.h
17-
$(CC) $(CFLAGS) -I. -c $< -o $@
21+
$(CXX) $(CXXFLAGS) -I. -c $< -o $@
22+
23+
cpuid.o: cpuid.cpp fecpp.h
24+
$(CXX) $(CXXFLAGS) -I. -c $< -o $@
25+
26+
fecpp_sse2.o: fecpp_sse2.cpp fecpp.h
27+
$(CXX) $(CXXFLAGS) -msse2 -I. -c $< -o $@
28+
29+
fecpp_ssse3.o: fecpp_ssse3.cpp fecpp.h
30+
$(CXX) $(CXXFLAGS) -mssse3 -I. -c $< -o $@
1831

1932
test/%.o: test/%.cpp fecpp.h
20-
$(CC) $(CFLAGS) -I. -c $< -o $@
33+
$(CXX) $(CXXFLAGS) -I. -c $< -o $@
2134

2235
zfec: test/zfec.o libfecpp.a
23-
$(CC) $(CFLAGS) $< -L. -lfecpp -o $@
36+
$(CXX) $(CXXFLAGS) $< -L. -lfecpp -o $@
2437

2538
benchmark: test/benchmark.o libfecpp.a
26-
$(CC) $(CFLAGS) $< -L. -lfecpp -o $@
39+
$(CXX) $(CXXFLAGS) $< -L. -lfecpp -o $@
2740

2841
test_fec: test/test_fec.o libfecpp.a
29-
$(CC) $(CFLAGS) $< -L. -lfecpp -o $@
42+
$(CXX) $(CXXFLAGS) $< -L. -lfecpp -o $@
3043

3144
test_recovery: test/test_recovery.o libfecpp.a
32-
$(CC) $(CFLAGS) $< -L. -lfecpp -o $@
45+
$(CXX) $(CXXFLAGS) $< -L. -lfecpp -o $@
3346

3447
gen_test_vec: test/gen_test_vec.o libfecpp.a
35-
$(CC) $(CFLAGS) $< -L. -lfecpp -o $@
48+
$(CXX) $(CXXFLAGS) $< -L. -lfecpp -o $@
49+
50+
fecpp.so: $(OBJ) fecpp.h
51+
$(CXX) -shared -fPIC $(CXXFLAGS) $(OBJ) -o fecpp.so
3652

37-
fecpp.so: fecpp.cpp fecpp_python.cpp fecpp.h
38-
$(CC) -shared -fPIC $(CFLAGS) `pkg-config --cflags python` fecpp.cpp fecpp_python.cpp `pkg-config --libs python` -lboost_python -o fecpp.so
53+
pyfecpp.so: fecpp_python.cpp fecpp.h fecpp.so
54+
$(CXX) -shared -fPIC $(CXXFLAGS) `pkg-config --cflags $(PYTHON_PKGCONFIG)` fecpp_python.cpp `pkg-config --libs $(PYTHON_PKGCONFIG)` -lboost_python fecpp.so -o pyfecpp.so
3955

4056
# Remove everything the build produces: both shared objects (the core
# library and the Python binding), the static archive, all object files
# and the programs listed in PROGS.
clean:
	rm -f fecpp.so pyfecpp.so *.a *.o test/*.o $(PROGS)

cpuid.cpp

+10
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
2+
#include "fecpp.h"
3+
4+
namespace fecpp {

/**
* Report whether the CPU this process is running on supports SSE2.
*
* On x86 builds with GCC or Clang the answer comes from the compiler's
* runtime CPU feature query. On any other target no SIMD path can be
* validated, so the conservative answer (false) is returned.
*/
bool has_sse2()
   {
#if (defined(__GNUC__) || defined(__clang__)) && (defined(__i386__) || defined(__x86_64__))
   return __builtin_cpu_supports("sse2") != 0;
#else
   return false;
#endif
   }

/**
* Report whether the CPU this process is running on supports SSSE3.
*
* Same strategy as has_sse2(): a real runtime query on x86 with
* GCC/Clang, and false everywhere else so callers never dispatch to
* instructions the CPU may not implement.
*/
bool has_ssse3()
   {
#if (defined(__GNUC__) || defined(__clang__)) && (defined(__i386__) || defined(__x86_64__))
   return __builtin_cpu_supports("ssse3") != 0;
#else
   return false;
#endif
   }

}

fecpp.cpp

+7-660
Large diffs are not rendered by default.

fecpp.h

+20-3
Original file line numberDiff line numberDiff line change
@@ -2,13 +2,13 @@
22
* Forward error correction based on Vandermonde matrices
33
*
44
* (C) 1997-1998 Luigi Rizzo (luigi@iet.unipi.it)
5-
* (C) 2009 Jack Lloyd (lloyd@randombit.net)
5+
* (C) 2009 Jack Lloyd (jack@randombit.net)
66
*
77
* Distributed under the terms given in license.txt
88
*/
99

10-
#ifndef FECPP_H__
11-
#define FECPP_H__
10+
#ifndef FECPP_H_
11+
#define FECPP_H_
1212

1313
#include <map>
1414
#include <vector>
@@ -22,6 +22,10 @@ using std::size_t;
2222

2323
using byte = std::uint8_t;
2424

25+
#if defined(__i386__) || defined(__x86_64__)
26+
#define FECPP_IS_X86
27+
#endif
28+
2529
/**
2630
* Forward error correction code
2731
*/
@@ -63,6 +67,19 @@ class fec_code
6367
std::vector<uint8_t> enc_matrix;
6468
};
6569

70+
#if defined(FECPP_IS_X86)
71+
72+
/**
73+
* CPU runtime detection
74+
*/
75+
bool has_sse2();
76+
bool has_ssse3();
77+
78+
/**
* SSE2 kernel: for each processed byte, XORs into z[i] the product of
* x[i] and y, where the product is built by repeated doubling with
* reduction by the polynomial byte 0x1D.
*
* Only whole 64-byte blocks are processed; the return value is the
* number of trailing bytes (size modulo 64) that were NOT processed.
* z is accessed with aligned 128-bit loads/stores, so it must be
* 16-byte aligned; x is read with unaligned loads.
*/
size_t addmul_sse2(uint8_t z[], const uint8_t x[], uint8_t y, size_t size);
79+
/**
* SSSE3 counterpart of addmul_sse2.
* NOTE(review): the implementation is not visible here; presumably it
* follows the same contract (returns the unprocessed tail length) -
* confirm against fecpp_ssse3.cpp.
*/
size_t addmul_ssse3(uint8_t z[], const uint8_t x[], uint8_t y, size_t size);
80+
81+
#endif
82+
6683
}
6784

6885
#endif

fecpp_sse2.cpp

+100
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,100 @@
1+
/*
2+
* (C) 2009-2010 Jack Lloyd ([email protected])
3+
*
4+
* Distributed under the terms given in license.txt (Simplified BSD)
5+
*/
6+
7+
#include "fecpp.h"
8+
#include <emmintrin.h>
9+
10+
namespace fecpp {
11+
12+
/**
* SSE2 kernel computing z[i] ^= x[i] * y for i in whole 64-byte blocks,
* where '*' is multiplication of bytes done by shift-and-add: x is
* doubled once per bit of y, each doubling being reduced with the
* polynomial byte 0x1D whenever the top bit was set.
*
* @param z    accumulator, read and written with ALIGNED 128-bit
*             accesses, so it must be 16-byte aligned
* @param x    input block, read with unaligned loads
* @param y    the multiplier; y == 0 leaves z untouched
* @param size number of bytes available in both z and x
* @return the number of trailing bytes (size % 64) that were not
*         processed and must be handled by the caller
*/
size_t addmul_sse2(uint8_t z[], const uint8_t x[], uint8_t y, size_t size)
   {
   /*
   * Multiplying by zero adds nothing to z. It must be handled up front:
   * __builtin_clz(0) is undefined, and a resulting bit count of 0 would
   * make the "j != y_bits" loop below run (practically) forever.
   */
   if(y == 0)
      return size % 64;

   const __m128i polynomial = _mm_set1_epi8(0x1D);

   // Number of significant bits in y (1..8); higher bits need no work
   const size_t y_bits = 32 - __builtin_clz(y);

   // unrolled out to cache line size
   while(size >= 64)
      {
      __m128i x_1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(x));
      __m128i x_2 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(x + 16));
      __m128i x_3 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(x + 32));
      __m128i x_4 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(x + 48));

      __m128i z_1 = _mm_load_si128(reinterpret_cast<const __m128i*>(z));
      __m128i z_2 = _mm_load_si128(reinterpret_cast<const __m128i*>(z + 16));
      __m128i z_3 = _mm_load_si128(reinterpret_cast<const __m128i*>(z + 32));
      __m128i z_4 = _mm_load_si128(reinterpret_cast<const __m128i*>(z + 48));

      // prefetch next two x and z blocks
      _mm_prefetch(reinterpret_cast<const char*>(x + 64), _MM_HINT_T0);
      _mm_prefetch(reinterpret_cast<const char*>(z + 64), _MM_HINT_T0);
      _mm_prefetch(reinterpret_cast<const char*>(x + 128), _MM_HINT_T1);
      _mm_prefetch(reinterpret_cast<const char*>(z + 128), _MM_HINT_T1);

      // Bit 0 of y: add x itself
      if(y & 0x01)
         {
         z_1 = _mm_xor_si128(z_1, x_1);
         z_2 = _mm_xor_si128(z_2, x_2);
         z_3 = _mm_xor_si128(z_3, x_3);
         z_4 = _mm_xor_si128(z_4, x_4);
         }

      for(size_t j = 1; j != y_bits; ++j)
         {
         /*
         * Each byte of each mask is either 0 or the polynomial 0x1D,
         * depending on if the high bit of x_i is set or not.
         */
         __m128i mask_1 = _mm_setzero_si128();
         __m128i mask_2 = _mm_setzero_si128();
         __m128i mask_3 = _mm_setzero_si128();
         __m128i mask_4 = _mm_setzero_si128();

         // Signed compare 0 > x is true exactly when the top bit is set
         mask_1 = _mm_cmpgt_epi8(mask_1, x_1);
         mask_2 = _mm_cmpgt_epi8(mask_2, x_2);
         mask_3 = _mm_cmpgt_epi8(mask_3, x_3);
         mask_4 = _mm_cmpgt_epi8(mask_4, x_4);

         // Double every byte (x + x is a per-byte left shift by one)
         x_1 = _mm_add_epi8(x_1, x_1);
         x_2 = _mm_add_epi8(x_2, x_2);
         x_3 = _mm_add_epi8(x_3, x_3);
         x_4 = _mm_add_epi8(x_4, x_4);

         mask_1 = _mm_and_si128(mask_1, polynomial);
         mask_2 = _mm_and_si128(mask_2, polynomial);
         mask_3 = _mm_and_si128(mask_3, polynomial);
         mask_4 = _mm_and_si128(mask_4, polynomial);

         // Reduce the bytes whose top bit was shifted out
         x_1 = _mm_xor_si128(x_1, mask_1);
         x_2 = _mm_xor_si128(x_2, mask_2);
         x_3 = _mm_xor_si128(x_3, mask_3);
         x_4 = _mm_xor_si128(x_4, mask_4);

         // x now holds x * 2^j; add it in if bit j of y is set
         if((y >> j) & 1)
            {
            z_1 = _mm_xor_si128(z_1, x_1);
            z_2 = _mm_xor_si128(z_2, x_2);
            z_3 = _mm_xor_si128(z_3, x_3);
            z_4 = _mm_xor_si128(z_4, x_4);
            }
         }

      _mm_store_si128(reinterpret_cast<__m128i*>(z     ), z_1);
      _mm_store_si128(reinterpret_cast<__m128i*>(z + 16), z_2);
      _mm_store_si128(reinterpret_cast<__m128i*>(z + 32), z_3);
      _mm_store_si128(reinterpret_cast<__m128i*>(z + 48), z_4);

      x += 64;
      z += 64;
      size -= 64;
      }

   return size;
   }
99+
100+
}

0 commit comments

Comments
 (0)