Skip to content

Commit a78c91a

Browse files
Hello71 authored and terrelln committed
Use proper unaligned access attributes
Instead of using the packed attribute hack, just use the aligned attribute. This improves code generation on ARMv6 and ARMv7, and slightly improves code generation on AArch64. On ARMv6, GCC generates code identical to regular aligned access for all versions between 4.5 and trunk, except GCC 5, which is buggy and generates the same (bad) code as packed access: https://gcc.godbolt.org/z/hq37rz7sb
1 parent fbff782 commit a78c91a

File tree

10 files changed

+106
-167
lines changed

10 files changed

+106
-167
lines changed

contrib/linux-kernel/Makefile

-1
Original file line number | Diff line number | Diff line change
@@ -34,7 +34,6 @@ libzstd:
3434
-DFSE_STATIC_LINKING_ONLY \
3535
-DHUF_STATIC_LINKING_ONLY \
3636
-DXXH_STATIC_LINKING_ONLY \
37-
-DMEM_FORCE_MEMORY_ACCESS=0 \
3837
-D__GNUC__ \
3938
-D__linux__=1 \
4039
-DSTATIC_BMI2=0 \

lib/common/mem.h

+16-33
Original file line number | Diff line number | Diff line change
@@ -133,21 +133,15 @@ MEM_STATIC size_t MEM_swapST(size_t in);
133133
/*-**************************************************************
134134
* Memory I/O Implementation
135135
*****************************************************************/
136-
/* MEM_FORCE_MEMORY_ACCESS :
137-
* By default, access to unaligned memory is controlled by `memcpy()`, which is safe and portable.
138-
* Unfortunately, on some target/compiler combinations, the generated assembly is sub-optimal.
139-
* The below switch allow to select different access method for improved performance.
140-
* Method 0 (default) : use `memcpy()`. Safe and portable.
141-
* Method 1 : `__packed` statement. It depends on compiler extension (i.e., not portable).
142-
* This method is safe if your compiler supports it, and *generally* as fast or faster than `memcpy`.
136+
/* MEM_FORCE_MEMORY_ACCESS : For accessing unaligned memory:
137+
* Method 0 : always use `memcpy()`. Safe and portable.
138+
* Method 1 : Use compiler extension to set unaligned access.
143139
* Method 2 : direct access. This method is portable but violate C standard.
144140
* It can generate buggy code on targets depending on alignment.
145-
* In some circumstances, it's the only known way to get the most performance (i.e. GCC + ARMv6)
146-
* See http://fastcompression.blogspot.fr/2015/08/accessing-unaligned-memory.html for details.
147-
* Prefer these methods in priority order (0 > 1 > 2)
141+
* Default : method 1 if supported, else method 0
148142
*/
149143
#ifndef MEM_FORCE_MEMORY_ACCESS /* can be defined externally, on command line for example */
150-
# if defined(__INTEL_COMPILER) || defined(__GNUC__) || defined(__ICCARM__)
144+
# ifdef __GNUC__
151145
# define MEM_FORCE_MEMORY_ACCESS 1
152146
# endif
153147
#endif
@@ -190,30 +184,19 @@ MEM_STATIC void MEM_write64(void* memPtr, U64 value) { *(U64*)memPtr = value; }
190184

191185
#elif defined(MEM_FORCE_MEMORY_ACCESS) && (MEM_FORCE_MEMORY_ACCESS==1)
192186

193-
/* __pack instructions are safer, but compiler specific, hence potentially problematic for some compilers */
194-
/* currently only defined for gcc and icc */
195-
#if defined(_MSC_VER) || (defined(__INTEL_COMPILER) && defined(WIN32))
196-
__pragma( pack(push, 1) )
197-
typedef struct { U16 v; } unalign16;
198-
typedef struct { U32 v; } unalign32;
199-
typedef struct { U64 v; } unalign64;
200-
typedef struct { size_t v; } unalignArch;
201-
__pragma( pack(pop) )
202-
#else
203-
typedef struct { U16 v; } __attribute__((packed)) unalign16;
204-
typedef struct { U32 v; } __attribute__((packed)) unalign32;
205-
typedef struct { U64 v; } __attribute__((packed)) unalign64;
206-
typedef struct { size_t v; } __attribute__((packed)) unalignArch;
207-
#endif
187+
typedef __attribute__((aligned(1))) U16 unalign16;
188+
typedef __attribute__((aligned(1))) U32 unalign32;
189+
typedef __attribute__((aligned(1))) U64 unalign64;
190+
typedef __attribute__((aligned(1))) size_t unalignArch;
208191

209-
MEM_STATIC U16 MEM_read16(const void* ptr) { return ((const unalign16*)ptr)->v; }
210-
MEM_STATIC U32 MEM_read32(const void* ptr) { return ((const unalign32*)ptr)->v; }
211-
MEM_STATIC U64 MEM_read64(const void* ptr) { return ((const unalign64*)ptr)->v; }
212-
MEM_STATIC size_t MEM_readST(const void* ptr) { return ((const unalignArch*)ptr)->v; }
192+
MEM_STATIC U16 MEM_read16(const void* ptr) { return *(const unalign16*)ptr; }
193+
MEM_STATIC U32 MEM_read32(const void* ptr) { return *(const unalign32*)ptr; }
194+
MEM_STATIC U64 MEM_read64(const void* ptr) { return *(const unalign64*)ptr; }
195+
MEM_STATIC size_t MEM_readST(const void* ptr) { return *(const unalignArch*)ptr; }
213196

214-
MEM_STATIC void MEM_write16(void* memPtr, U16 value) { ((unalign16*)memPtr)->v = value; }
215-
MEM_STATIC void MEM_write32(void* memPtr, U32 value) { ((unalign32*)memPtr)->v = value; }
216-
MEM_STATIC void MEM_write64(void* memPtr, U64 value) { ((unalign64*)memPtr)->v = value; }
197+
MEM_STATIC void MEM_write16(void* memPtr, U16 value) { *(unalign16*)memPtr = value; }
198+
MEM_STATIC void MEM_write32(void* memPtr, U32 value) { *(unalign32*)memPtr = value; }
199+
MEM_STATIC void MEM_write64(void* memPtr, U64 value) { *(unalign64*)memPtr = value; }
217200

218201
#else
219202

lib/legacy/zstd_v01.c

+12-18
Original file line number | Diff line number | Diff line change
@@ -190,21 +190,15 @@ typedef signed long long S64;
190190
/****************************************************************
191191
* Memory I/O
192192
*****************************************************************/
193-
/* FSE_FORCE_MEMORY_ACCESS
194-
* By default, access to unaligned memory is controlled by `memcpy()`, which is safe and portable.
195-
* Unfortunately, on some target/compiler combinations, the generated assembly is sub-optimal.
196-
* The below switch allow to select different access method for improved performance.
197-
* Method 0 (default) : use `memcpy()`. Safe and portable.
198-
* Method 1 : `__packed` statement. It depends on compiler extension (ie, not portable).
199-
* This method is safe if your compiler supports it, and *generally* as fast or faster than `memcpy`.
193+
/* FSE_FORCE_MEMORY_ACCESS : For accessing unaligned memory:
194+
* Method 0 : always use `memcpy()`. Safe and portable.
195+
* Method 1 : Use compiler extension to set unaligned access.
200196
* Method 2 : direct access. This method is portable but violate C standard.
201-
* It can generate buggy code on targets generating assembly depending on alignment.
202-
* But in some circumstances, it's the only known way to get the most performance (ie GCC + ARMv6)
203-
* See http://fastcompression.blogspot.fr/2015/08/accessing-unaligned-memory.html for details.
204-
* Prefer these methods in priority order (0 > 1 > 2)
197+
* It can generate buggy code on targets depending on alignment.
198+
* Default : method 1 if supported, else method 0
205199
*/
206200
#ifndef FSE_FORCE_MEMORY_ACCESS /* can be defined externally, on command line for example */
207-
# if defined(__INTEL_COMPILER) || defined(__GNUC__) || defined(__ICCARM__)
201+
# ifdef __GNUC__
208202
# define FSE_FORCE_MEMORY_ACCESS 1
209203
# endif
210204
#endif
@@ -229,13 +223,13 @@ static U64 FSE_read64(const void* memPtr) { return *(const U64*) memPtr; }
229223

230224
#elif defined(FSE_FORCE_MEMORY_ACCESS) && (FSE_FORCE_MEMORY_ACCESS==1)
231225

232-
/* __pack instructions are safer, but compiler specific, hence potentially problematic for some compilers */
233-
/* currently only defined for gcc and icc */
234-
typedef union { U16 u16; U32 u32; U64 u64; } __attribute__((packed)) unalign;
226+
typedef __attribute__((aligned(1))) U16 unalign16;
227+
typedef __attribute__((aligned(1))) U32 unalign32;
228+
typedef __attribute__((aligned(1))) U64 unalign64;
235229

236-
static U16 FSE_read16(const void* ptr) { return ((const unalign*)ptr)->u16; }
237-
static U32 FSE_read32(const void* ptr) { return ((const unalign*)ptr)->u32; }
238-
static U64 FSE_read64(const void* ptr) { return ((const unalign*)ptr)->u64; }
230+
static U16 FSE_read16(const void* ptr) { return *(const unalign16*)ptr; }
231+
static U32 FSE_read32(const void* ptr) { return *(const unalign32*)ptr; }
232+
static U64 FSE_read64(const void* ptr) { return *(const unalign64*)ptr; }
239233

240234
#else
241235

lib/legacy/zstd_v02.c

+13-19
Original file line number | Diff line number | Diff line change
@@ -115,21 +115,15 @@ extern "C" {
115115
/****************************************************************
116116
* Memory I/O
117117
*****************************************************************/
118-
/* MEM_FORCE_MEMORY_ACCESS
119-
* By default, access to unaligned memory is controlled by `memcpy()`, which is safe and portable.
120-
* Unfortunately, on some target/compiler combinations, the generated assembly is sub-optimal.
121-
* The below switch allow to select different access method for improved performance.
122-
* Method 0 (default) : use `memcpy()`. Safe and portable.
123-
* Method 1 : `__packed` statement. It depends on compiler extension (ie, not portable).
124-
* This method is safe if your compiler supports it, and *generally* as fast or faster than `memcpy`.
118+
/* MEM_FORCE_MEMORY_ACCESS : For accessing unaligned memory:
119+
* Method 0 : always use `memcpy()`. Safe and portable.
120+
* Method 1 : Use compiler extension to set unaligned access.
125121
* Method 2 : direct access. This method is portable but violate C standard.
126-
* It can generate buggy code on targets generating assembly depending on alignment.
127-
* But in some circumstances, it's the only known way to get the most performance (ie GCC + ARMv6)
128-
* See http://fastcompression.blogspot.fr/2015/08/accessing-unaligned-memory.html for details.
129-
* Prefer these methods in priority order (0 > 1 > 2)
122+
* It can generate buggy code on targets depending on alignment.
123+
* Default : method 1 if supported, else method 0
130124
*/
131125
#ifndef MEM_FORCE_MEMORY_ACCESS /* can be defined externally, on command line for example */
132-
# if defined(__INTEL_COMPILER) || defined(__GNUC__) || defined(__ICCARM__)
126+
# ifdef __GNUC__
133127
# define MEM_FORCE_MEMORY_ACCESS 1
134128
# endif
135129
#endif
@@ -155,15 +149,15 @@ MEM_STATIC void MEM_write16(void* memPtr, U16 value) { *(U16*)memPtr = value; }
155149

156150
#elif defined(MEM_FORCE_MEMORY_ACCESS) && (MEM_FORCE_MEMORY_ACCESS==1)
157151

158-
/* __pack instructions are safer, but compiler specific, hence potentially problematic for some compilers */
159-
/* currently only defined for gcc and icc */
160-
typedef union { U16 u16; U32 u32; U64 u64; } __attribute__((packed)) unalign;
152+
typedef __attribute__((aligned(1))) U16 unalign16;
153+
typedef __attribute__((aligned(1))) U32 unalign32;
154+
typedef __attribute__((aligned(1))) U64 unalign64;
161155

162-
MEM_STATIC U16 MEM_read16(const void* ptr) { return ((const unalign*)ptr)->u16; }
163-
MEM_STATIC U32 MEM_read32(const void* ptr) { return ((const unalign*)ptr)->u32; }
164-
MEM_STATIC U64 MEM_read64(const void* ptr) { return ((const unalign*)ptr)->u64; }
156+
MEM_STATIC U16 MEM_read16(const void* ptr) { return *(const unalign16*)ptr; }
157+
MEM_STATIC U32 MEM_read32(const void* ptr) { return *(const unalign32*)ptr; }
158+
MEM_STATIC U64 MEM_read64(const void* ptr) { return *(const unalign64*)ptr; }
165159

166-
MEM_STATIC void MEM_write16(void* memPtr, U16 value) { ((unalign*)memPtr)->u16 = value; }
160+
MEM_STATIC void MEM_write16(void* memPtr, U16 value) { *(unalign16*)memPtr = value; }
167161

168162
#else
169163

lib/legacy/zstd_v03.c

+13-19
Original file line number | Diff line number | Diff line change
@@ -116,21 +116,15 @@ extern "C" {
116116
/****************************************************************
117117
* Memory I/O
118118
*****************************************************************/
119-
/* MEM_FORCE_MEMORY_ACCESS
120-
* By default, access to unaligned memory is controlled by `memcpy()`, which is safe and portable.
121-
* Unfortunately, on some target/compiler combinations, the generated assembly is sub-optimal.
122-
* The below switch allow to select different access method for improved performance.
123-
* Method 0 (default) : use `memcpy()`. Safe and portable.
124-
* Method 1 : `__packed` statement. It depends on compiler extension (ie, not portable).
125-
* This method is safe if your compiler supports it, and *generally* as fast or faster than `memcpy`.
119+
/* MEM_FORCE_MEMORY_ACCESS : For accessing unaligned memory:
120+
* Method 0 : always use `memcpy()`. Safe and portable.
121+
* Method 1 : Use compiler extension to set unaligned access.
126122
* Method 2 : direct access. This method is portable but violate C standard.
127-
* It can generate buggy code on targets generating assembly depending on alignment.
128-
* But in some circumstances, it's the only known way to get the most performance (ie GCC + ARMv6)
129-
* See http://fastcompression.blogspot.fr/2015/08/accessing-unaligned-memory.html for details.
130-
* Prefer these methods in priority order (0 > 1 > 2)
123+
* It can generate buggy code on targets depending on alignment.
124+
* Default : method 1 if supported, else method 0
131125
*/
132126
#ifndef MEM_FORCE_MEMORY_ACCESS /* can be defined externally, on command line for example */
133-
# if defined(__INTEL_COMPILER) || defined(__GNUC__) || defined(__ICCARM__)
127+
# ifdef __GNUC__
134128
# define MEM_FORCE_MEMORY_ACCESS 1
135129
# endif
136130
#endif
@@ -156,15 +150,15 @@ MEM_STATIC void MEM_write16(void* memPtr, U16 value) { *(U16*)memPtr = value; }
156150

157151
#elif defined(MEM_FORCE_MEMORY_ACCESS) && (MEM_FORCE_MEMORY_ACCESS==1)
158152

159-
/* __pack instructions are safer, but compiler specific, hence potentially problematic for some compilers */
160-
/* currently only defined for gcc and icc */
161-
typedef union { U16 u16; U32 u32; U64 u64; } __attribute__((packed)) unalign;
153+
typedef __attribute__((aligned(1))) U16 unalign16;
154+
typedef __attribute__((aligned(1))) U32 unalign32;
155+
typedef __attribute__((aligned(1))) U64 unalign64;
162156

163-
MEM_STATIC U16 MEM_read16(const void* ptr) { return ((const unalign*)ptr)->u16; }
164-
MEM_STATIC U32 MEM_read32(const void* ptr) { return ((const unalign*)ptr)->u32; }
165-
MEM_STATIC U64 MEM_read64(const void* ptr) { return ((const unalign*)ptr)->u64; }
157+
MEM_STATIC U16 MEM_read16(const void* ptr) { return *(const unalign16*)ptr; }
158+
MEM_STATIC U32 MEM_read32(const void* ptr) { return *(const unalign32*)ptr; }
159+
MEM_STATIC U64 MEM_read64(const void* ptr) { return *(const unalign64*)ptr; }
166160

167-
MEM_STATIC void MEM_write16(void* memPtr, U16 value) { ((unalign*)memPtr)->u16 = value; }
161+
MEM_STATIC void MEM_write16(void* memPtr, U16 value) { *(unalign16*)memPtr = value; }
168162

169163
#else
170164

lib/legacy/zstd_v04.c

+13-19
Original file line number | Diff line number | Diff line change
@@ -87,21 +87,15 @@ extern "C" {
8787
/****************************************************************
8888
* Memory I/O
8989
*****************************************************************/
90-
/* MEM_FORCE_MEMORY_ACCESS
91-
* By default, access to unaligned memory is controlled by `memcpy()`, which is safe and portable.
92-
* Unfortunately, on some target/compiler combinations, the generated assembly is sub-optimal.
93-
* The below switch allow to select different access method for improved performance.
94-
* Method 0 (default) : use `memcpy()`. Safe and portable.
95-
* Method 1 : `__packed` statement. It depends on compiler extension (ie, not portable).
96-
* This method is safe if your compiler supports it, and *generally* as fast or faster than `memcpy`.
90+
/* MEM_FORCE_MEMORY_ACCESS : For accessing unaligned memory:
91+
* Method 0 : always use `memcpy()`. Safe and portable.
92+
* Method 1 : Use compiler extension to set unaligned access.
9793
* Method 2 : direct access. This method is portable but violate C standard.
98-
* It can generate buggy code on targets generating assembly depending on alignment.
99-
* But in some circumstances, it's the only known way to get the most performance (ie GCC + ARMv6)
100-
* See http://fastcompression.blogspot.fr/2015/08/accessing-unaligned-memory.html for details.
101-
* Prefer these methods in priority order (0 > 1 > 2)
94+
* It can generate buggy code on targets depending on alignment.
95+
* Default : method 1 if supported, else method 0
10296
*/
10397
#ifndef MEM_FORCE_MEMORY_ACCESS /* can be defined externally, on command line for example */
104-
# if defined(__INTEL_COMPILER) || defined(__GNUC__) || defined(__ICCARM__)
98+
# ifdef __GNUC__
10599
# define MEM_FORCE_MEMORY_ACCESS 1
106100
# endif
107101
#endif
@@ -127,15 +121,15 @@ MEM_STATIC void MEM_write16(void* memPtr, U16 value) { *(U16*)memPtr = value; }
127121

128122
#elif defined(MEM_FORCE_MEMORY_ACCESS) && (MEM_FORCE_MEMORY_ACCESS==1)
129123

130-
/* __pack instructions are safer, but compiler specific, hence potentially problematic for some compilers */
131-
/* currently only defined for gcc and icc */
132-
typedef union { U16 u16; U32 u32; U64 u64; } __attribute__((packed)) unalign;
124+
typedef __attribute__((aligned(1))) U16 unalign16;
125+
typedef __attribute__((aligned(1))) U32 unalign32;
126+
typedef __attribute__((aligned(1))) U64 unalign64;
133127

134-
MEM_STATIC U16 MEM_read16(const void* ptr) { return ((const unalign*)ptr)->u16; }
135-
MEM_STATIC U32 MEM_read32(const void* ptr) { return ((const unalign*)ptr)->u32; }
136-
MEM_STATIC U64 MEM_read64(const void* ptr) { return ((const unalign*)ptr)->u64; }
128+
MEM_STATIC U16 MEM_read16(const void* ptr) { return *(const unalign16*)ptr; }
129+
MEM_STATIC U32 MEM_read32(const void* ptr) { return *(const unalign32*)ptr; }
130+
MEM_STATIC U64 MEM_read64(const void* ptr) { return *(const unalign64*)ptr; }
137131

138-
MEM_STATIC void MEM_write16(void* memPtr, U16 value) { ((unalign*)memPtr)->u16 = value; }
132+
MEM_STATIC void MEM_write16(void* memPtr, U16 value) { *(unalign16*)memPtr = value; }
139133

140134
#else
141135

lib/legacy/zstd_v05.c

+14-20
Original file line number | Diff line number | Diff line change
@@ -106,21 +106,15 @@ extern "C" {
106106
/*-**************************************************************
107107
* Memory I/O
108108
*****************************************************************/
109-
/* MEM_FORCE_MEMORY_ACCESS :
110-
* By default, access to unaligned memory is controlled by `memcpy()`, which is safe and portable.
111-
* Unfortunately, on some target/compiler combinations, the generated assembly is sub-optimal.
112-
* The below switch allow to select different access method for improved performance.
113-
* Method 0 (default) : use `memcpy()`. Safe and portable.
114-
* Method 1 : `__packed` statement. It depends on compiler extension (ie, not portable).
115-
* This method is safe if your compiler supports it, and *generally* as fast or faster than `memcpy`.
109+
/* MEM_FORCE_MEMORY_ACCESS : For accessing unaligned memory:
110+
* Method 0 : always use `memcpy()`. Safe and portable.
111+
* Method 1 : Use compiler extension to set unaligned access.
116112
* Method 2 : direct access. This method is portable but violate C standard.
117113
* It can generate buggy code on targets depending on alignment.
118-
* In some circumstances, it's the only known way to get the most performance (ie GCC + ARMv6)
119-
* See http://fastcompression.blogspot.fr/2015/08/accessing-unaligned-memory.html for details.
120-
* Prefer these methods in priority order (0 > 1 > 2)
114+
* Default : method 1 if supported, else method 0
121115
*/
122116
#ifndef MEM_FORCE_MEMORY_ACCESS /* can be defined externally, on command line for example */
123-
# if defined(__INTEL_COMPILER) || defined(__GNUC__) || defined(__ICCARM__)
117+
# ifdef __GNUC__
124118
# define MEM_FORCE_MEMORY_ACCESS 1
125119
# endif
126120
#endif
@@ -148,17 +142,17 @@ MEM_STATIC void MEM_write64(void* memPtr, U64 value) { *(U64*)memPtr = value; }
148142

149143
#elif defined(MEM_FORCE_MEMORY_ACCESS) && (MEM_FORCE_MEMORY_ACCESS==1)
150144

151-
/* __pack instructions are safer, but compiler specific, hence potentially problematic for some compilers */
152-
/* currently only defined for gcc and icc */
153-
typedef union { U16 u16; U32 u32; U64 u64; size_t st; } __attribute__((packed)) unalign;
145+
typedef __attribute__((aligned(1))) U16 unalign16;
146+
typedef __attribute__((aligned(1))) U32 unalign32;
147+
typedef __attribute__((aligned(1))) U64 unalign64;
154148

155-
MEM_STATIC U16 MEM_read16(const void* ptr) { return ((const unalign*)ptr)->u16; }
156-
MEM_STATIC U32 MEM_read32(const void* ptr) { return ((const unalign*)ptr)->u32; }
157-
MEM_STATIC U64 MEM_read64(const void* ptr) { return ((const unalign*)ptr)->u64; }
149+
MEM_STATIC U16 MEM_read16(const void* ptr) { return *(const unalign16*)ptr; }
150+
MEM_STATIC U32 MEM_read32(const void* ptr) { return *(const unalign32*)ptr; }
151+
MEM_STATIC U64 MEM_read64(const void* ptr) { return *(const unalign64*)ptr; }
158152

159-
MEM_STATIC void MEM_write16(void* memPtr, U16 value) { ((unalign*)memPtr)->u16 = value; }
160-
MEM_STATIC void MEM_write32(void* memPtr, U32 value) { ((unalign*)memPtr)->u32 = value; }
161-
MEM_STATIC void MEM_write64(void* memPtr, U64 value) { ((unalign*)memPtr)->u64 = value; }
153+
MEM_STATIC void MEM_write16(void* memPtr, U16 value) { *(unalign16*)memPtr = value; }
154+
MEM_STATIC void MEM_write32(void* memPtr, U32 value) { *(unalign32*)memPtr = value; }
155+
MEM_STATIC void MEM_write64(void* memPtr, U64 value) { *(unalign64*)memPtr = value; }
162156

163157
#else
164158

0 commit comments

Comments
 (0)