Skip to content

Commit 11173c8

Browse files
authored
Fixes discard_memory compilation failure for pre-Volta (#637)
* moves discard_memory to its own header * fixes initialization order in access property * fixes pointer alignment computation * fixes includes and license
1 parent 445d531 commit 11173c8

File tree

4 files changed

+48
-22
lines changed

4 files changed

+48
-22
lines changed

libcudacxx/docs/extended_api/memory_access_properties/discard_memory.md

+1-1
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ Does **not** generate any HW store operations.
2323
This kernel needs a scratch pad that does not fit in shared memory, so it uses an allocation in global memory instead:
2424

2525
```cuda
26-
#include <cuda/annotated_ptr>
26+
#include <cuda/discard_memory>
2727
__device__ int compute(int* scratch, size_t N);
2828
2929
__global__ void kernel(int const* in, int* out, int* scratch, size_t N) {

libcudacxx/include/cuda/annotated_ptr

+1-19
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,7 @@
5656

5757
#include <cuda/std/cstdint>
5858
#include <cuda/barrier>
59+
#include <cuda/discard_memory>
5960

6061
#include "std/detail/__access_property"
6162

@@ -171,25 +172,6 @@ void apply_access_property(const volatile void* __ptr, const _Shape __shape, acc
171172
))
172173
}
173174

174-
inline
175-
_LIBCUDACXX_HOST_DEVICE
176-
void discard_memory(volatile void* __ptr, std::size_t __nbytes) noexcept {
177-
NV_IF_TARGET(NV_PROVIDES_SM_80,(
178-
if (!__isGlobal((void*)__ptr)) return;
179-
180-
char* __p = reinterpret_cast<char*>(const_cast<void*>(__ptr));
181-
static constexpr std::size_t _LINE_SIZE = 128;
182-
std::size_t __start = (reinterpret_cast<std::uintptr_t>(__p) % _LINE_SIZE) ? 1 : 0;
183-
std::size_t __end = (reinterpret_cast<std::uintptr_t>(__p + __nbytes) % _LINE_SIZE) ? __nbytes - _LINE_SIZE : __nbytes;
184-
__end /= _LINE_SIZE;
185-
186-
//Trim the first block and last block if they're not 128 bytes aligned
187-
for (std::size_t __i = __start; __i < __end; __i += _LINE_SIZE) {
188-
asm volatile ("discard.global.L2 [%0], 128;" ::"l"(__p + (__i * _LINE_SIZE)) :);
189-
}
190-
))
191-
}
192-
193175
template<class _Tp, class _Property>
194176
class annotated_ptr: public __detail_ap::__annotated_ptr_base<_Property> {
195177
public:
+44
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
//===----------------------------------------------------------------------===//
2+
//
3+
// Part of libcu++, the C++ Standard Library for your entire system,
4+
// under the Apache License v2.0 with LLVM Exceptions.
5+
// See https://llvm.org/LICENSE.txt for license information.
6+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7+
// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES.
8+
//
9+
//===----------------------------------------------------------------------===//
10+
11+
#ifndef _CUDA_DISCARD_MEMORY
12+
#define _CUDA_DISCARD_MEMORY
13+
14+
#include <cuda/std/cstdint>
15+
#include <cuda/std/detail/__config>
16+
17+
_LIBCUDACXX_BEGIN_NAMESPACE_CUDA
18+
19+
inline _LIBCUDACXX_HOST_DEVICE void discard_memory(volatile void* __ptr, std::size_t __nbytes) noexcept
20+
{
21+
NV_IF_TARGET(
22+
NV_PROVIDES_SM_80,
23+
(if (!__isGlobal((void*) __ptr)) return;
24+
25+
char* __p = reinterpret_cast<char*>(const_cast<void*>(__ptr));
26+
char* const __end_p = __p + __nbytes;
27+
static constexpr std::size_t _LINE_SIZE = 128;
28+
29+
// Trim the first block and last block if they're not 128 bytes aligned
30+
std::size_t __misalignment = reinterpret_cast<std::uintptr_t>(__p) % _LINE_SIZE;
31+
char* __start_aligned = __misalignment == 0 ? __p : __p + (_LINE_SIZE - __misalignment);
32+
char* const __end_aligned = __end_p - (reinterpret_cast<std::uintptr_t>(__end_p) % _LINE_SIZE);
33+
34+
while (__start_aligned < __end_aligned) {
35+
printf("-> [%p, %p)\n", __start_aligned, __start_aligned + 128);
36+
asm volatile("discard.global.L2 [%0], 128;" ::"l"(__start_aligned) :);
37+
__start_aligned += _LINE_SIZE;
38+
}),
39+
((void) (__ptr); (void) (__nbytes);))
40+
}
41+
42+
_LIBCUDACXX_END_NAMESPACE_CUDA
43+
44+
#endif

libcudacxx/include/cuda/std/detail/__access_property

+2-2
Original file line numberDiff line numberDiff line change
@@ -239,13 +239,13 @@ namespace __detail_ap {
239239
__on::__l2_cop_on_t __hit_prop,
240240
std::uint32_t __hit_ratio,
241241
__off::__l2_cop_off_t __miss_prop) noexcept
242-
: __fraction(__hit_ratio),
242+
: __ap_reserved(0x0),
243+
__fraction(__hit_ratio),
243244
__l2_cop_off(__miss_prop),
244245
__l2_cop_on(__hit_prop),
245246
__l2_descriptor_mode(_DESC_INTERLEAVED),
246247
__l1_inv_dont_allocate(0x0),
247248
__l2_sector_promote_256B(0x0),
248-
__ap_reserved(0x0),
249249
__ap_reserved2(0x0) {}
250250

251251
_LIBCUDACXX_HOST_DEVICE

0 commit comments

Comments
 (0)