Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

jl_gc_calloc_aligned and friends: obtaining aligned zero initialized memory efficiently #42704

Open
wants to merge 4 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
102 changes: 100 additions & 2 deletions src/gc.c
Original file line number Diff line number Diff line change
Expand Up @@ -3461,8 +3461,8 @@ JL_DLLEXPORT void *jl_malloc(size_t sz)
return (void *)(p + 2); // assumes JL_SMALL_BYTE_ALIGNMENT == 16
}

JL_DLLEXPORT void *jl_calloc(size_t nm, size_t sz)
{
//_unchecked_calloc does not check for potential overflow of nm*sz
static inline void *_unchecked_calloc(size_t nm, size_t sz) {
size_t nmsz = nm*sz;
int64_t *p = (int64_t *)jl_gc_counted_calloc(nmsz + JL_SMALL_BYTE_ALIGNMENT, 1);
if (p == NULL)
Expand All @@ -3471,6 +3471,13 @@ JL_DLLEXPORT void *jl_calloc(size_t nm, size_t sz)
return (void *)(p + 2); // assumes JL_SMALL_BYTE_ALIGNMENT == 16
}

// Overflow-checked, zero-initializing allocation of `nm` elements of `sz`
// bytes each. Returns NULL when `nm * sz` would not fit in a size_t;
// otherwise forwards to _unchecked_calloc and returns its result.
JL_DLLEXPORT void *jl_calloc(size_t nm, size_t sz)
{
    // A zero element size can never overflow; test it first so the division
    // below is never a division by zero.
    if (sz != 0 && nm > SIZE_MAX / sz)
        return NULL;
    return _unchecked_calloc(nm, sz);
}

JL_DLLEXPORT void jl_free(void *p)
{
if (p != NULL) {
Expand Down Expand Up @@ -3526,6 +3533,97 @@ JL_DLLEXPORT void *jl_gc_managed_malloc(size_t sz)
return b;
}

// Zero-initialized counterpart of jl_gc_managed_malloc.
//
// The requested size is rounded up to a multiple of JL_CACHE_BYTE_ALIGNMENT
// and the memory is obtained from jl_gc_calloc_aligned with that alignment.
// Throws jl_memory_exception if the rounding wraps around or the allocation
// fails. On success errno (and the Windows last-error value) are left as they
// were on entry.
JL_DLLEXPORT void *jl_gc_managed_calloc(size_t sz)
{
    jl_ptls_t ptls = jl_current_task->ptls;
    maybe_collect(ptls);

    const size_t rounded = LLT_ALIGN(sz, JL_CACHE_BYTE_ALIGNMENT);
    if (rounded < sz) {
        // rounding up wrapped past SIZE_MAX: the request was absurdly large
        jl_throw(jl_memory_exception);
    }

    // Snapshot the thread's error state so the allocator cannot disturb it.
    const int saved_errno = errno;
#ifdef _OS_WINDOWS_
    const DWORD saved_winerr = GetLastError();
#endif

    void *mem = jl_gc_calloc_aligned(rounded, 1, JL_CACHE_BYTE_ALIGNMENT);
    if (!mem) {
        jl_throw(jl_memory_exception);
    }

#ifdef _OS_WINDOWS_
    SetLastError(saved_winerr);
#endif
    errno = saved_errno;
    return mem;
}

// TODO add special casing for macOS, where there is guarantee of alignment
// - DONE, just use jl_malloc, jl_calloc, jl_realloc, jl_free since always aligned
// TODO add special casing for 64 bit systems when 16 byte alignment is requested
// TODO add checks on align?
// - Enforce posix_memalign reqs of power of 2 multiple of sizeof(void *)?
Comment on lines +3558 to +3562
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
// TODO add special casing for macOS, where there is guarantee of alignment
// - DONE, just use jl_malloc, jl_calloc, jl_realloc, jl_free since always aligned
// TODO add special casing for 64 bit systems when 16 byte alignment is requested
// TODO add checks on align?
// - Enforce posix_memalign reqs of power of 2 multiple of sizeof(void *)?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think this may have come from previous rounds of review on prior iterations. If you don't mind, I'll keep this around for a bit longer until we have more eyes on this.

// Allocate `sz` bytes at an address that is a multiple of `align`.
//
// Off macOS the request is over-allocated through jl_malloc by
// `align - 1 + sizeof(void *) + sizeof(size_t)` bytes. The returned pointer p
// is rounded down to an `align` boundary inside that buffer, and two words
// are stored immediately below it:
//   ((void **)p)[-1]  - the raw pointer returned by jl_malloc
//   ((size_t *)p)[-2] - the `align` value used
// The rounding mask assumes `align` is a power of two.
//
// Returns NULL if the allocation fails or if the padded size cannot be
// represented in a size_t. Release the result with jl_gc_free_aligned.
JL_DLLEXPORT void *jl_gc_malloc_aligned(size_t sz, size_t align)
{
#if defined(__APPLE__)
    // NOTE(review): `align` is not consulted on this path; this presumes the
    // pointer from jl_malloc is already aligned enough - confirm for
    // alignments larger than 16.
    return jl_malloc(sz);
#else
    const size_t header = sizeof(void *) + sizeof(size_t);
    size_t offset = align - 1 + header;
    // offset < header means `align - 1 + header` wrapped (align == 0 or huge).
    if (offset < header)
        return NULL;
    // Refuse requests whose padded size would wrap and yield a short buffer.
    if (sz > SIZE_MAX - offset)
        return NULL;
    void *p0 = jl_malloc(sz + offset);
    if (!p0)
        return NULL;
    // Advance past the worst-case padding, then round down to the boundary.
    void *p = (void *) (((uintptr_t) p0 + offset) & (~((uintptr_t) (align - 1))));
    assert((uintptr_t) p >= (uintptr_t) p0 + header);
    *((void **) p - 1) = p0;
    *((size_t *) p - 2) = align;
    return p;
#endif
}

// Allocate zero-initialized storage for `nm` elements of `sz` bytes each at
// an address that is a multiple of `align`.
//
// Off macOS the buffer comes from _unchecked_calloc, so the overflow check on
// `nm * sz + offset` is done here. The layout matches jl_gc_malloc_aligned:
// the raw pointer is stored at ((void **)p)[-1] and `align` at
// ((size_t *)p)[-2]. The rounding mask assumes `align` is a power of two.
//
// Returns NULL if the allocation fails or the total size would overflow a
// size_t. Release the result with jl_gc_free_aligned.
JL_DLLEXPORT void *jl_gc_calloc_aligned(size_t nm, size_t sz, size_t align)
{
#if defined(__APPLE__)
    // NOTE(review): `align` is not consulted on this path; this presumes the
    // pointer from jl_calloc is already aligned enough - confirm for
    // alignments larger than 16.
    return jl_calloc(nm, sz);
#else
    const size_t header = sizeof(void *) + sizeof(size_t);
    size_t offset = align - 1 + header;
    // offset < header means `align - 1 + header` wrapped (align == 0 or huge).
    if (offset < header)
        return NULL;
    // sz == 0 cannot overflow; test it first so we never divide by zero.
    if (sz != 0 && nm > (SIZE_MAX - offset) / sz)
        return NULL;
    void *p0 = _unchecked_calloc(1, nm * sz + offset);
    if (!p0)
        return NULL;
    // Advance past the worst-case padding, then round down to the boundary.
    void *p = (void *) (((uintptr_t) p0 + offset) & (~((uintptr_t) (align - 1))));
    assert((uintptr_t) p >= (uintptr_t) p0 + header);
    *((void **) p - 1) = p0;
    *((size_t *) p - 2) = align;
    return p;
#endif
}

// TODO when resizing an array, you actually only need to memcpy the portion
// that is being used, which can have some savings
// How do we know when align is 16 bytes on P64?
// See older jl_realloc_aligned, perhaps?
// Resize an allocation obtained from jl_gc_malloc_aligned or
// jl_gc_calloc_aligned.
//
// Off macOS this always allocates a fresh aligned buffer of `sz` bytes,
// copies min(oldsz, sz) bytes from `p`, and then frees `p`.
// `align` must equal the alignment recorded when `p` was allocated (checked
// by assert). A NULL `p` behaves like jl_gc_malloc_aligned(sz, align).
//
// Returns the new pointer, or NULL if the new allocation fails; in that case
// `p` is left untouched and still owned by the caller.
JL_DLLEXPORT void *jl_gc_realloc_aligned(void *p, size_t sz, size_t oldsz, size_t align)
{
#if defined(__APPLE__)
    return jl_realloc(p, sz);
#else
    // There is no hidden header to read below a NULL pointer.
    if (p == NULL)
        return jl_gc_malloc_aligned(sz, align);

    void *p0 = *((void **) p - 1);
    size_t alignparam = *((size_t *) p - 2);
    assert(alignparam == align);
    assert(align > 0);
    assert(p0);
    // Only the asserts read these; keep NDEBUG builds free of unused warnings.
    (void) p0;
    (void) alignparam;

    void *pnew = jl_gc_malloc_aligned(sz, align);
    if (pnew != NULL) {
        memcpy(pnew, p, oldsz > sz ? sz : oldsz);
        jl_gc_free_aligned(p);
    }

    return pnew;
#endif
}

// Release memory obtained from jl_gc_malloc_aligned, jl_gc_calloc_aligned or
// jl_gc_realloc_aligned. A NULL `p` is ignored.
//
// Off macOS the pointer handed to jl_free is the raw allocation stored in the
// word just below `p`; on macOS `p` itself is passed to jl_free.
JL_DLLEXPORT void jl_gc_free_aligned(void *p)
{
#if defined(__APPLE__)
    jl_free(p);
#else
    if (p != NULL)
        jl_free(*((void **) p - 1));
#endif
}

static void *gc_managed_realloc_(jl_ptls_t ptls, void *d, size_t sz, size_t oldsz,
int isaligned, jl_value_t *owner, int8_t can_collect)
{
Expand Down
5 changes: 5 additions & 0 deletions src/julia.h
Original file line number Diff line number Diff line change
Expand Up @@ -891,8 +891,13 @@ STATIC_INLINE void jl_gc_multi_wb(const void *parent, const jl_value_t *ptr) JL_
}

// GC-managed buffer allocation.
JL_DLLEXPORT void *jl_gc_managed_malloc(size_t sz);
// Zero-initialized variant: rounds `sz` up to JL_CACHE_BYTE_ALIGNMENT and
// throws jl_memory_exception on overflow or allocation failure.
JL_DLLEXPORT void *jl_gc_managed_calloc(size_t sz);
JL_DLLEXPORT void *jl_gc_managed_realloc(void *d, size_t sz, size_t oldsz,
int isaligned, jl_value_t *owner);
// Aligned allocation helpers. Memory returned by the malloc/calloc/realloc
// variants must be released with jl_gc_free_aligned.
// Returns NULL if the underlying allocation fails.
JL_DLLEXPORT void *jl_gc_malloc_aligned(size_t sz, size_t align);
// Zero-initialized; returns NULL on failure or if the size check on
// `nm * sz` (plus alignment padding) fails.
JL_DLLEXPORT void *jl_gc_calloc_aligned(size_t nm, size_t sz, size_t align);
// Returns NULL if the new allocation fails. Off macOS it copies
// min(oldsz, sz) bytes into a fresh buffer and frees `p` only on success.
JL_DLLEXPORT void *jl_gc_realloc_aligned(void *p, size_t sz, size_t oldsz, size_t align);
// Accepts NULL.
JL_DLLEXPORT void jl_gc_free_aligned(void *p);

// object accessors -----------------------------------------------------------

Expand Down