 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
-// The functions defined in this file give approximate code size. These sizes
-// assume the following configuration options:
-// - LIBC_CONF_KEEP_FRAME_POINTER = false
-// - LIBC_CONF_ENABLE_STRONG_STACK_PROTECTOR = false
-// - LIBC_ADD_NULL_CHECKS = false
 #ifndef LLVM_LIBC_SRC_STRING_MEMORY_UTILS_ARM_INLINE_MEMCPY_H
 #define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_ARM_INLINE_MEMCPY_H
 
 #include "src/__support/macros/attributes.h" // LIBC_INLINE
 #include "src/__support/macros/optimization.h" // LIBC_LOOP_NOUNROLL
-#include "src/string/memory_utils/arm/common.h" // LIBC_ATTR_LIKELY, LIBC_ATTR_UNLIKELY
 #include "src/string/memory_utils/utils.h" // memcpy_inline, distance_to_align
 
 #include <stddef.h> // size_t
 
+// https://libc.llvm.org/compiler_support.html
+// Support for [[likely]] / [[unlikely]]
+//  [X] GCC 12.2
+//  [X] Clang 12
+//  [ ] Clang 11
+#define LIBC_ATTR_LIKELY [[likely]]
+#define LIBC_ATTR_UNLIKELY [[unlikely]]
+
+#if defined(LIBC_COMPILER_IS_CLANG)
+#if LIBC_COMPILER_CLANG_VER < 1200
+#undef LIBC_ATTR_LIKELY
+#undef LIBC_ATTR_UNLIKELY
+#define LIBC_ATTR_LIKELY
+#define LIBC_ATTR_UNLIKELY
+#endif
+#endif
+
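The added block re-creates LIBC_ATTR_LIKELY / LIBC_ATTR_UNLIKELY locally (they previously came from arm/common.h, whose include is dropped above). As a rough standalone illustration of what the macros expand to on a compiler with the C++20 attributes, a branch hint is attached to the statement that follows the condition; the function name below is made up for the example:

    #include <cstddef>

    // Illustration only: [[unlikely]] marks the guarded statement as the cold
    // path; where the macro expands to nothing, the code compiles unchanged.
    int classify(std::size_t size) {
      if (size < 8)
        [[unlikely]] { return 0; } // rarely taken path in this example
      return 1;
    }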
 namespace LIBC_NAMESPACE_DECL {
 
 namespace {
 
-// Performs a copy of `bytes` bytes from `src` to `dst`. This function has the
-// semantics of `memcpy` where `src` and `dst` are `__restrict`. The compiler
-// is free to use whatever instruction is best for the size and assumed access.
-template <size_t bytes, AssumeAccess access>
-LIBC_INLINE void copy(void *dst, const void *src) {
-  if constexpr (access == AssumeAccess::kAligned) {
-    constexpr size_t alignment = bytes > kWordSize ? kWordSize : bytes;
-    memcpy_inline<bytes>(assume_aligned<alignment>(dst),
-                         assume_aligned<alignment>(src));
-  } else if constexpr (access == AssumeAccess::kUnknown) {
-    memcpy_inline<bytes>(dst, src);
-  } else {
-    static_assert(false);
-  }
-}
+LIBC_INLINE_VAR constexpr size_t kWordSize = sizeof(uint32_t);
 
-template <size_t bytes, BlockOp block_op = BlockOp::kFull,
-          AssumeAccess access = AssumeAccess::kUnknown>
-LIBC_INLINE void copy_block_and_bump_pointers(Ptr &dst, CPtr &src) {
-  if constexpr (block_op == BlockOp::kFull) {
-    copy<bytes, access>(dst, src);
-  } else if constexpr (block_op == BlockOp::kByWord) {
+enum Strategy {
+  ForceWordLdStChain,
+  AssumeWordAligned,
+  AssumeUnaligned,
+};
+
+template <size_t bytes, Strategy strategy = AssumeUnaligned>
+LIBC_INLINE void copy_and_bump_pointers(Ptr &dst, CPtr &src) {
+  if constexpr (strategy == AssumeUnaligned) {
+    memcpy_inline<bytes>(assume_aligned<1>(dst), assume_aligned<1>(src));
+  } else if constexpr (strategy == AssumeWordAligned) {
+    static_assert(bytes >= kWordSize);
+    memcpy_inline<bytes>(assume_aligned<kWordSize>(dst),
+                         assume_aligned<kWordSize>(src));
+  } else if constexpr (strategy == ForceWordLdStChain) {
     // We restrict loads/stores to 4 bytes to prevent the use of load/store
-    // multiple (LDM, STM) and load/store double (LDRD, STRD).
+    // multiple (LDM, STM) and load/store double (LDRD, STRD). First, they may
+    // fault (see notes below) and second, they use more registers which in
+    // turn adds push/pop instructions in the hot path.
     static_assert((bytes % kWordSize == 0) && (bytes >= kWordSize));
     LIBC_LOOP_UNROLL
-    for (size_t offset = 0; offset < bytes; offset += kWordSize) {
-      copy<kWordSize, access>(dst + offset, src + offset);
+    for (size_t i = 0; i < bytes / kWordSize; ++i) {
+      const size_t offset = i * kWordSize;
+      memcpy_inline<kWordSize>(dst + offset, src + offset);
     }
-  } else {
-    static_assert(false, "Invalid BlockOp");
   }
   // In the 1, 2, 4 byte copy case, the compiler can fold pointer offsetting
   // into the load/store instructions.
@@ -65,27 +72,39 @@ LIBC_INLINE void copy_block_and_bump_pointers(Ptr &dst, CPtr &src) {
   src += bytes;
 }
 
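The ForceWordLdStChain branch above breaks a block into word-sized memcpy_inline calls so each copy can lower to a plain LDR/STR pair. A rough standalone sketch of the same idea, assuming only GCC/Clang's __builtin_memcpy (a constant 4-byte call typically folds to a single load/store); the helper name is illustrative:

    #include <cstddef>
    #include <cstdint>

    // Standalone sketch, not the libc helper: copy `Bytes` as a chain of
    // 4-byte copies instead of one large block, steering the compiler away
    // from LDM/STM and LDRD/STRD.
    template <std::size_t Bytes>
    inline void copy_word_chain(char *dst, const char *src) {
      static_assert(Bytes % sizeof(std::uint32_t) == 0, "whole words only");
      for (std::size_t offset = 0; offset < Bytes;
           offset += sizeof(std::uint32_t))
        __builtin_memcpy(dst + offset, src + offset, sizeof(std::uint32_t));
    }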
-template <size_t bytes, BlockOp block_op, AssumeAccess access>
-LIBC_INLINE void consume_by_block(Ptr &dst, CPtr &src, size_t &size) {
+LIBC_INLINE void copy_bytes_and_bump_pointers(Ptr &dst, CPtr &src,
+                                              const size_t size) {
   LIBC_LOOP_NOUNROLL
-  for (size_t i = 0; i < size / bytes; ++i)
-    copy_block_and_bump_pointers<bytes, block_op, access>(dst, src);
-  size %= bytes;
+  for (size_t i = 0; i < size; ++i)
+    *dst++ = *src++;
 }
 
-[[maybe_unused]] LIBC_INLINE void
-copy_bytes_and_bump_pointers(Ptr &dst, CPtr &src, size_t size) {
+template <size_t block_size, Strategy strategy>
+LIBC_INLINE void copy_blocks_and_update_args(Ptr &dst, CPtr &src,
+                                             size_t &size) {
   LIBC_LOOP_NOUNROLL
-  for (size_t i = 0; i < size; ++i)
-    *dst++ = *src++;
+  for (size_t i = 0; i < size / block_size; ++i)
+    copy_and_bump_pointers<block_size, strategy>(dst, src);
+  // Update `size` once at the end instead of once per iteration.
+  size %= block_size;
+}
+
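copy_blocks_and_update_args runs whole blocks with a plain counter and reduces `size` only once, through the final modulo, as the comment notes. A rough standalone equivalent for a fixed 16-byte block, again using __builtin_memcpy and an illustrative name:

    #include <cstddef>

    // Standalone sketch of the block loop: trip count computed up front,
    // `size` updated a single time with the remainder.
    inline void copy_blocks_of_16(char *&dst, const char *&src,
                                  std::size_t &size) {
      for (std::size_t i = 0; i < size / 16; ++i) {
        __builtin_memcpy(dst, src, 16);
        dst += 16;
        src += 16;
      }
      size %= 16; // leftover bytes are handled by the caller
    }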
+LIBC_INLINE CPtr bitwise_or(CPtr a, CPtr b) {
+  return cpp::bit_cast<CPtr>(cpp::bit_cast<uintptr_t>(a) |
+                             cpp::bit_cast<uintptr_t>(b));
+}
+
+LIBC_INLINE auto misaligned(CPtr a) {
+  return distance_to_align_down<kWordSize>(a);
 }
 
 } // namespace
 
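bitwise_or and misaligned combine so that a single alignment test covers both pointers: any low bit set in either address survives the OR. A standalone restatement with plain uintptr_t arithmetic (illustrative, not the libc helpers):

    #include <cstdint>

    // Standalone sketch: true when both pointers are word aligned, checked
    // with one mask test on the OR of the two addresses.
    inline bool both_word_aligned(const void *a, const void *b) {
      constexpr std::uintptr_t kWordMask = sizeof(std::uint32_t) - 1;
      return ((reinterpret_cast<std::uintptr_t>(a) |
               reinterpret_cast<std::uintptr_t>(b)) &
              kWordMask) == 0;
    }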
-// Implementation for Cortex-M0, M0+, M1 cores that do not allow for unaligned
-// loads/stores. It compiles down to 208 bytes when used through `memcpy` that
-// also needs to return the `dst` ptr.
-// Note:
+// Implementation for Cortex-M0, M0+, M1.
+// Notes:
+// - It compiles down to 196 bytes, but 220 bytes when used through `memcpy`
+//   that also needs to return the `dst` ptr.
+// - These cores do not allow for unaligned loads/stores.
 // - When `src` and `dst` are coaligned, we start by aligning them and perform
 //   bulk copies. We let the compiler know the pointers are aligned so it can
 //   use load/store multiple (LDM, STM). This significantly increases throughput
@@ -106,18 +125,9 @@ copy_bytes_and_bump_pointers(Ptr &dst, CPtr &src, size_t size) {
     if (src_alignment == 0)
       LIBC_ATTR_LIKELY {
         // Both `src` and `dst` are now word-aligned.
-        // We first copy by blocks of 64 bytes, the compiler will use 4
-        // load/store multiple (LDM, STM), each of 4 words. This requires more
-        // registers so additional push/pop are needed but the speedup is worth
-        // it.
-        consume_by_block<64, BlockOp::kFull, AssumeAccess::kAligned>(dst, src,
-                                                                     size);
-        // Then we use blocks of 4 word load/store.
-        consume_by_block<16, BlockOp::kByWord, AssumeAccess::kAligned>(dst, src,
-                                                                       size);
-        // Then we use word by word copy.
-        consume_by_block<4, BlockOp::kByWord, AssumeAccess::kAligned>(dst, src,
-                                                                      size);
+        copy_blocks_and_update_args<64, AssumeWordAligned>(dst, src, size);
+        copy_blocks_and_update_args<16, AssumeWordAligned>(dst, src, size);
+        copy_blocks_and_update_args<4, AssumeWordAligned>(dst, src, size);
       }
     else {
       // `dst` is aligned but `src` is not.
@@ -128,7 +138,7 @@ copy_bytes_and_bump_pointers(Ptr &dst, CPtr &src, size_t size) {
             src_alignment == 2
                 ? load_aligned<uint32_t, uint16_t, uint16_t>(src)
                 : load_aligned<uint32_t, uint8_t, uint16_t, uint8_t>(src);
-        copy<kWordSize, AssumeAccess::kAligned>(dst, &value);
+        memcpy_inline<kWordSize>(assume_aligned<kWordSize>(dst), &value);
         dst += kWordSize;
         src += kWordSize;
         size -= kWordSize;
@@ -141,69 +151,56 @@ copy_bytes_and_bump_pointers(Ptr &dst, CPtr &src, size_t size) {
 }
 
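The notes above describe the Cortex-M0 path: byte-copy until `dst` is word aligned, then test whether `src` ended up aligned as well. That test succeeds exactly when the two pointers are coaligned, i.e. their distance is a multiple of the word size; a standalone restatement of that condition (an illustration, not the libc code):

    #include <cstdint>

    // Standalone sketch: after both pointers advance by the bytes needed to
    // align `dst`, `src` is word aligned iff (src - dst) is a multiple of 4.
    inline bool coaligned(const void *dst, const void *src) {
      const auto d = reinterpret_cast<std::uintptr_t>(dst);
      const auto s = reinterpret_cast<std::uintptr_t>(src);
      return ((s - d) % sizeof(std::uint32_t)) == 0;
    }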
 // Implementation for Cortex-M3, M4, M7, M23, M33, M35P, M52 with hardware
-// support for unaligned loads and stores. It compiles down to 272 bytes when
-// used through `memcpy` that also needs to return the `dst` ptr.
+// support for unaligned loads and stores.
+// Notes:
+// - It compiles down to 266 bytes.
+// - `dst` and `src` are not `__restrict` to prevent the compiler from
+//   reordering loads/stores.
+// - We keep state variables to a strict minimum to keep everything in the free
+//   registers and prevent costly push / pop.
+// - Even if unaligned single loads/stores to normal memory are supported,
+//   unaligned accesses with load/store multiple (LDM, STM) and load/store
+//   double (LDRD, STRD) instructions are generally not supported and will
+//   still fault, so we make sure to restrict unrolling to word loads/stores.
 [[maybe_unused]] LIBC_INLINE void inline_memcpy_arm_mid_end(Ptr dst, CPtr src,
                                                             size_t size) {
   if (misaligned(bitwise_or(src, dst)))
     LIBC_ATTR_UNLIKELY {
       if (size < 8)
         LIBC_ATTR_UNLIKELY {
           if (size & 1)
-            copy_block_and_bump_pointers<1>(dst, src);
+            copy_and_bump_pointers<1>(dst, src);
           if (size & 2)
-            copy_block_and_bump_pointers<2>(dst, src);
+            copy_and_bump_pointers<2>(dst, src);
           if (size & 4)
-            copy_block_and_bump_pointers<4>(dst, src);
+            copy_and_bump_pointers<4>(dst, src);
           return;
         }
       if (misaligned(src))
         LIBC_ATTR_UNLIKELY {
           const size_t offset = distance_to_align_up<kWordSize>(dst);
           if (offset & 1)
-            copy_block_and_bump_pointers<1>(dst, src);
+            copy_and_bump_pointers<1>(dst, src);
           if (offset & 2)
-            copy_block_and_bump_pointers<2>(dst, src);
+            copy_and_bump_pointers<2>(dst, src);
           size -= offset;
         }
     }
-  // `dst` and `src` are not necessarily both aligned at that point but this
-  // implementation assumes hardware support for unaligned loads and stores so
-  // it is still fast to perform unrolled word by word copy. Note that wider
-  // accesses through the use of load/store multiple (LDM, STM) and load/store
-  // double (LDRD, STRD) instructions are generally not supported and can fault.
-  // By forcing decomposition of a 64-byte copy into word by word copy, the
-  // compiler can use the first load to prefetch memory:
-  //   ldr r3, [r1, #64]!  <- prefetch next cache line
-  //   str r3, [r0]
-  //   ldr r3, [r1, #0x4]
-  //   str r3, [r0, #0x4]
-  //   ...
-  //   ldr r3, [r1, #0x3c]
-  //   str r3, [r0, #0x3c]
-  // This is a bit detrimental for sizes between 64 and 256 (less than 10%
-  // penalty) but the prefetch yields better throughput for larger copies.
-  consume_by_block<64, BlockOp::kByWord, AssumeAccess::kUnknown>(dst, src,
-                                                                 size);
-  consume_by_block<16, BlockOp::kByWord, AssumeAccess::kUnknown>(dst, src,
-                                                                 size);
-  consume_by_block<4, BlockOp::kByWord, AssumeAccess::kUnknown>(dst, src, size);
+  copy_blocks_and_update_args<64, ForceWordLdStChain>(dst, src, size);
+  copy_blocks_and_update_args<16, ForceWordLdStChain>(dst, src, size);
+  copy_blocks_and_update_args<4, AssumeUnaligned>(dst, src, size);
   if (size & 1)
-    copy_block_and_bump_pointers<1>(dst, src);
+    copy_and_bump_pointers<1>(dst, src);
   if (size & 2)
-    copy_block_and_bump_pointers<2>(dst, src);
+    LIBC_ATTR_UNLIKELY
+      copy_and_bump_pointers<2>(dst, src);
 }
 
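After the 64/16/4-byte block loops above, `size` is strictly less than 4, so the tail is finished with two bit tests rather than another loop. A rough standalone sketch of that tail handling using std::memcpy (illustrative name, not the libc helper):

    #include <cstddef>
    #include <cstring>

    // Standalone sketch: with size in [0, 4), bits 0 and 1 of `size` describe
    // the remaining bytes exactly, so two branches cover every case.
    inline void copy_tail_under_4(char *dst, const char *src,
                                  std::size_t size) {
      if (size & 1) {
        std::memcpy(dst, src, 1);
        dst += 1;
        src += 1;
      }
      if (size & 2)
        std::memcpy(dst, src, 2);
    }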
-[[maybe_unused]] LIBC_INLINE void inline_memcpy_arm(Ptr dst, CPtr src,
+[[maybe_unused]] LIBC_INLINE void inline_memcpy_arm(void *__restrict dst_,
+                                                    const void *__restrict src_,
                                                     size_t size) {
-  // The compiler performs alias analysis and is able to prove that `dst` and
-  // `src` do not alias by propagating the `__restrict` keyword from the
-  // `memcpy` prototype. This allows the compiler to merge consecutive
-  // load/store (LDR, STR) instructions generated in
-  // `copy_block_and_bump_pointers` with `BlockOp::kByWord` into load/store
-  // double (LDRD, STRD) instructions, which is undesirable, so we prevent the
-  // compiler from inferring `__restrict` with the following line.
-  asm volatile("" : "+r"(dst), "+r"(src));
+  Ptr dst = cpp::bit_cast<Ptr>(dst_);
+  CPtr src = cpp::bit_cast<CPtr>(src_);
 #ifdef __ARM_FEATURE_UNALIGNED
   return inline_memcpy_arm_mid_end(dst, src, size);
 #else
@@ -213,4 +210,8 @@ copy_bytes_and_bump_pointers(Ptr &dst, CPtr &src, size_t size) {
 
 } // namespace LIBC_NAMESPACE_DECL
 
+// Cleanup local macros
+#undef LIBC_ATTR_LIKELY
+#undef LIBC_ATTR_UNLIKELY
+
 #endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_ARM_INLINE_MEMCPY_H
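The revised entry point takes `void *__restrict` arguments and bit_casts them to the internal `Ptr`/`CPtr` types before dispatching on __ARM_FEATURE_UNALIGNED. A hypothetical caller, for illustration only; it assumes the header is includable on its own and that `LIBC_NAMESPACE` names the namespace opened by `LIBC_NAMESPACE_DECL` (the usual llvm-libc convention):

    #include <stddef.h>

    #include "src/string/memory_utils/arm/inline_memcpy.h"

    // Hypothetical public entry point: forward to the dispatcher and return
    // the destination pointer, as `memcpy` must.
    void *my_memcpy(void *__restrict dst, const void *__restrict src,
                    size_t count) {
      LIBC_NAMESPACE::inline_memcpy_arm(dst, src, count);
      return dst;
    }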