From 8533a1c40a62e2ae874ae4e3eb6719af24aa34e8 Mon Sep 17 00:00:00 2001
From: Yichao Yu
Date: Sun, 5 Jun 2016 02:09:22 -0400
Subject: [PATCH] Implement custom memory manager for LLVM

Use various ways to reuse the page that has the page protection set in
order to avoid wasting a page of memory for each JIT event.

Fixes #14626
---
 src/Makefile           |   2 +-
 src/cgmemmgr.cpp       | 860 +++++++++++++++++++++++++++++++++++++++++
 src/codegen.cpp        |   3 -
 src/codegen_internal.h |  22 +-
 src/debuginfo.cpp      |  55 ++-
 src/jitlayers.cpp      |  47 +--
 6 files changed, 923 insertions(+), 66 deletions(-)
 create mode 100644 src/cgmemmgr.cpp

diff --git a/src/Makefile b/src/Makefile
index c80c3c4497b44..1617281c73eeb 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -44,7 +44,7 @@ SRCS := \
 LLVMLINK :=
 
 ifeq ($(JULIACODEGEN),LLVM)
-SRCS += codegen disasm debuginfo llvm-simdloop llvm-gcroot
+SRCS += codegen disasm debuginfo llvm-simdloop llvm-gcroot cgmemmgr
 FLAGS += -I$(shell $(LLVM_CONFIG_HOST) --includedir)
 LLVM_LIBS := all
 ifeq ($(USE_POLLY),1)
diff --git a/src/cgmemmgr.cpp b/src/cgmemmgr.cpp
new file mode 100644
index 0000000000000..c1332f87e52a4
--- /dev/null
+++ b/src/cgmemmgr.cpp
@@ -0,0 +1,860 @@
+// This file is a part of Julia. License is MIT: http://julialang.org/license
+
+#include "llvm-version.h"
+#include "platform.h"
+#include "options.h"
+
+#ifdef USE_MCJIT
+#include <llvm/ExecutionEngine/SectionMemoryManager.h>
+#include "julia.h"
+#include "julia_internal.h"
+
+#ifdef LLVM37
+#ifndef LLVM38
+# include <llvm/ExecutionEngine/Orc/ObjectLinkingLayer.h>
+#endif
+#ifdef _OS_LINUX_
+# include <sys/syscall.h>
+# ifdef __NR_memfd_create
+# include <linux/memfd.h>
+# endif
+#endif
+#ifndef _OS_WINDOWS_
+# include <sys/mman.h>
+# include <sys/stat.h>
+# include <fcntl.h>
+# include <unistd.h>
+# if defined(_OS_DARWIN_) && !defined(MAP_ANONYMOUS)
+# define MAP_ANONYMOUS MAP_ANON
+# endif
+#endif
+#ifdef _OS_FREEBSD_
+# include <sys/types.h>
+#endif
+
+namespace {
+
+static size_t get_block_size(size_t size)
+{
+    return (size > jl_page_size * 256 ? LLT_ALIGN(size, jl_page_size) :
+            jl_page_size * 256);
+}
+
+// Wrapper functions to mmap/munmap/mprotect pages...
+static void *map_anon_page(size_t size)
+{
+#ifdef _OS_WINDOWS_
+    char *mem = (char*)VirtualAlloc(NULL, size + jl_page_size,
+                                    MEM_COMMIT, PAGE_READWRITE);
+    assert(mem && "Cannot allocate RW memory");
+    mem = (char*)LLT_ALIGN(uintptr_t(mem), jl_page_size);
+#else // _OS_WINDOWS_
+    void *mem = mmap(nullptr, size, PROT_READ | PROT_WRITE,
+                     MAP_NORESERVE | MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+    assert(mem != MAP_FAILED && "Cannot allocate RW memory");
+#endif // _OS_WINDOWS_
+    return mem;
+}
+
+static void unmap_page(void *ptr, size_t size)
+{
+#ifdef _OS_WINDOWS_
+    VirtualFree(ptr, size, MEM_DECOMMIT);
+#else // _OS_WINDOWS_
+    munmap(ptr, size);
+#endif // _OS_WINDOWS_
+}
+
+#ifdef _OS_WINDOWS_
+enum class Prot : int {
+    RW = PAGE_READWRITE,
+    RX = PAGE_EXECUTE,
+    RO = PAGE_READONLY
+};
+
+static void protect_page(void *ptr, size_t size, Prot flags)
+{
+    DWORD old_prot;
+    if (!VirtualProtect(ptr, size, (DWORD)flags, &old_prot)) {
+        jl_safe_printf("Cannot protect page @%p of size %u to 0x%x (err 0x%x)\n",
+                       ptr, (unsigned)size, (unsigned)flags,
+                       (unsigned)GetLastError());
+        abort();
+    }
+}
+#else // _OS_WINDOWS_
+enum class Prot : int {
+    RW = PROT_READ | PROT_WRITE,
+    RX = PROT_READ | PROT_EXEC,
+    RO = PROT_READ
+};
+
+static void protect_page(void *ptr, size_t size, Prot flags)
+{
+    int ret = mprotect(ptr, size, (int)flags);
+    if (ret != 0) {
+        perror(__func__);
+        abort();
+    }
+}
+
+static bool check_fd_or_close(int fd)
+{
+    if (fd == -1)
+        return false;
+    // This can fail due to the `noexec` mount option....
+    fcntl(fd, F_SETFD, FD_CLOEXEC);
+    fchmod(fd, S_IRWXU);
+    ftruncate(fd, jl_page_size);
+    void *ptr = mmap(nullptr, jl_page_size, PROT_READ | PROT_EXEC,
+                     MAP_SHARED, fd, 0);
+    if (ptr == MAP_FAILED) {
+        close(fd);
+        return false;
+    }
+    munmap(ptr, jl_page_size);
+    return true;
+}
+#endif // _OS_WINDOWS_
+
+static intptr_t anon_hdl = -1;
+
+#ifdef _OS_WINDOWS_
+// As far as I can tell, a `CreateFileMapping` mapping cannot be resized
+// on Windows. Also, creating a big file mapping and then mapping pieces
+// of it seems to consume too many global resources. Therefore, we use one
+// file mapping per block on Windows.
+static void *create_shared_map(size_t size, size_t id)
+{
+    void *addr = MapViewOfFile((HANDLE)id, FILE_MAP_ALL_ACCESS,
+                               0, 0, size);
+    assert(addr && "Cannot map RW view");
+    return addr;
+}
+
+static intptr_t init_shared_map()
+{
+    anon_hdl = 0;
+    return 0;
+}
+
+static void *alloc_shared_page(size_t size, size_t *id, bool exec)
+{
+    assert(size % jl_page_size == 0);
+    auto file_mode = exec ? PAGE_EXECUTE_READWRITE : PAGE_READWRITE;
+    HANDLE hdl = CreateFileMapping(INVALID_HANDLE_VALUE, NULL,
+                                   file_mode, 0, size, NULL);
+    *id = (size_t)hdl;
+    auto map_mode = FILE_MAP_READ | (exec ? FILE_MAP_EXECUTE : 0);
+    void *addr = MapViewOfFile(hdl, map_mode, 0, 0, size);
+    assert(addr && "Cannot map RO view");
+    return addr;
+}
+#else // _OS_WINDOWS_
+// For the shared mapped region
+static intptr_t get_anon_hdl(void)
+{
+    int fd = -1;
+
+    // Linux and FreeBSD can create an anonymous fd without touching the
+    // file system.
+# ifdef __NR_memfd_create
+    fd = syscall(__NR_memfd_create, "julia-codegen", MFD_CLOEXEC);
+    if (check_fd_or_close(fd))
+        return fd;
+# endif
+# ifdef _OS_FREEBSD_
+    fd = shm_open(SHM_ANON, O_RDWR, S_IRWXU);
+    if (check_fd_or_close(fd))
+        return fd;
+# endif
+    char shm_name[] = "julia-codegen-0123456789-0123456789/tmp///";
+    pid_t pid = getpid();
+    // `shm_open` memory can't be mapped exec on macOS
+# ifndef _OS_DARWIN_
+    do {
+        snprintf(shm_name, sizeof(shm_name),
+                 "julia-codegen-%d-%d", (int)pid, rand());
+        fd = shm_open(shm_name, O_RDWR | O_CREAT | O_EXCL, S_IRWXU);
+        if (check_fd_or_close(fd)) {
+            shm_unlink(shm_name);
+            return fd;
+        }
+    } while (errno == EEXIST);
+# endif
+    FILE *tmpf = tmpfile();
+    if (tmpf) {
+        fd = dup(fileno(tmpf));
+        fclose(tmpf);
+        if (check_fd_or_close(fd)) {
+            return fd;
+        }
+    }
+    snprintf(shm_name, sizeof(shm_name),
+             "/tmp/julia-codegen-%d-XXXXXX", (int)pid);
+    fd = mkstemp(shm_name);
+    if (check_fd_or_close(fd)) {
+        unlink(shm_name);
+        return fd;
+    }
+    return -1;
+}
+
+static size_t map_offset = 0;
+// Multiple of 128MB.
+// Hopefully no one will set a ulimit for this to be a problem...
+static constexpr size_t map_size_inc = 128 * 1024 * 1024;
+static size_t map_size = 0;
+static jl_mutex_t shared_map_lock;
+
+static void *create_shared_map(size_t size, size_t id)
+{
+    void *addr = mmap(nullptr, size, PROT_READ | PROT_WRITE, MAP_SHARED,
+                      anon_hdl, id);
+    assert(addr != MAP_FAILED && "Cannot map RW view");
+    return addr;
+}
+
+static intptr_t init_shared_map()
+{
+    anon_hdl = get_anon_hdl();
+    if (anon_hdl == -1)
+        return -1;
+    map_offset = 0;
+    map_size = map_size_inc;
+    int ret = ftruncate(anon_hdl, map_size);
+    if (ret != 0) {
+        perror(__func__);
+        abort();
+    }
+    return anon_hdl;
+}
+
+static void *alloc_shared_page(size_t size, size_t *id, bool exec)
+{
+    assert(size % jl_page_size == 0);
+    size_t off = jl_atomic_fetch_add(&map_offset, size);
+    *id = off;
+    if (__unlikely(off + size > map_size)) {
+        JL_LOCK_NOGC(&shared_map_lock);
+        size_t old_size = map_size;
+        while (off + size > map_size)
+            map_size += map_size_inc;
+        if (old_size != map_size) {
+            int ret = ftruncate(anon_hdl, map_size);
+            if (ret != 0) {
+                perror(__func__);
+                abort();
+            }
+        }
+        JL_UNLOCK_NOGC(&shared_map_lock);
+    }
+    return create_shared_map(size, off);
+}
+#endif // _OS_WINDOWS_
+
+#ifdef _OS_LINUX_
+// Using `/proc/self/mem`, A.K.A. Keno's remote memory manager.
+
+static int self_mem_fd = -1;
+
+static int init_self_mem()
+{
+    int fd = open("/proc/self/mem", O_RDWR | O_SYNC | O_CLOEXEC);
+    if (fd == -1)
+        return -1;
+    // Buffer to check whether the write works.
+    volatile uint64_t buff = 0;
+    uint64_t v = 0x12345678;
+    int ret = pwrite(fd, (void*)&v, sizeof(uint64_t), (uintptr_t)&buff);
+    if (ret != sizeof(uint64_t) || buff != 0x12345678) {
+        close(fd);
+        return -1;
+    }
+    self_mem_fd = fd;
+    return fd;
+}
+
+static void write_self_mem(void *dest, void *ptr, size_t size)
+{
+    while (size > 0) {
+        ssize_t ret = pwrite(self_mem_fd, ptr, size, (uintptr_t)dest);
+        if (ret == size)
+            return;
+        if (ret == -1 && (errno == EAGAIN || errno == EINTR))
+            continue;
+        assert(ret < size);
+        size -= ret;
+        ptr = (char*)ptr + ret;
+        dest = (char*)dest + ret;
+    }
+}
+#endif // _OS_LINUX_
+
+using namespace llvm;
+
+// Allocation strategies
+// * For RW data, no memory protection is needed; use a plain memory pool.
+// * For RO data or code:
+//
+//   The first allocation in a page always has a write address equal to
+//   the runtime address.
+//
+//   1. shared dual map
+//
+//      Map an (unlinked) anonymous file as the memory pool.
+//      After the first allocation, the write address points to the
+//      second map. The second map is made unreadable and unwritable
+//      during finalization.
+//
+//   2. private dual map
+//
+//      Same as above, but use an anonymous memory map as the memory
+//      pool, and use low-level OS APIs to set up the second map.
+//
+//   3. copying data into the RO page, bypassing page protection
+//
+//      After the first allocation, the write address points to a
+//      temporary buffer. Requires copying the data out of the temporary
+//      buffer during finalization.
+
+// Allocate at least 256 pages per block and keep up to 8 blocks in the
+// free list. The block with the least free space is discarded when we
+// need to allocate a new page.
+// Unused full pages are freed from the block before discarding, so at
+// most one page is wasted on each discarded block. There should be at
+// most one block with more than 128 pages available, so the discarded
+// one must have fewer than 128 pages available and therefore at least
+// 128 pages used.
+// (Apart from fragmentation) this guarantees that less than 1% of memory
+// is wasted.
+
+// The `shared` type parameter is for Windows only....
+struct Block {
+    // runtime address
+    char *ptr{nullptr};
+    size_t total{0};
+    size_t avail{0};
+
+    Block(const Block&) = delete;
+    Block &operator=(const Block&) = delete;
+
+    Block() = default;
+
+    void *alloc(size_t size, size_t align)
+    {
+        size_t aligned_avail = avail & (-align);
+        if (aligned_avail < size)
+            return nullptr;
+        char *p = ptr + total - aligned_avail;
+        avail = aligned_avail - size;
+        return p;
+    }
+    void reset(void *addr, size_t size)
+    {
+        if (avail >= jl_page_size) {
+            uintptr_t end = uintptr_t(ptr) + total;
+            uintptr_t first_free = end - avail;
+            first_free = LLT_ALIGN(first_free, jl_page_size);
+            assert(first_free < end);
+            unmap_page((void*)first_free, end - first_free);
+        }
+        ptr = (char*)addr;
+        total = avail = size;
+    }
+};
+
+class RWAllocator {
+    static constexpr int nblocks = 8;
+    Block blocks[nblocks]{};
+public:
+    void *alloc(size_t size, size_t align)
+    {
+        size_t min_size = (size_t)-1;
+        int min_id = 0;
+        for (int i = 0;i < nblocks && blocks[i].ptr;i++) {
+            if (void *ptr = blocks[i].alloc(size, align))
+                return ptr;
+            if (blocks[i].avail < min_size) {
+                min_size = blocks[i].avail;
+                min_id = i;
+            }
+        }
+        size_t block_size = get_block_size(size);
+        blocks[min_id].reset(map_anon_page(block_size), block_size);
+        return blocks[min_id].alloc(size, align);
+    }
+};
+
+struct SplitPtrBlock : public Block {
+    // Possible states
+    // Allocation:
+    // * Initial allocation: `state & InitAlloc`
+    // * Followup allocation: `(state & Alloc) && !(state & InitAlloc)`
+    enum State {
+        // This block has no page protection set yet
+        InitAlloc = (1 << 0),
+        // There is at least one allocation in this page since the last
+        // finalization
+        Alloc = (1 << 1),
+        // `wr_ptr` can be directly used as the write address.
+        WRInit = (1 << 2),
+        // With `WRInit` set, whether `wr_ptr` has write permission enabled.
+        WRReady = (1 << 3),
+    };
+
+    uintptr_t wr_ptr{0};
+    uint32_t state{0};
+    SplitPtrBlock() = default;
+
+    void swap(SplitPtrBlock &other)
+    {
+        std::swap(ptr, other.ptr);
+        std::swap(total, other.total);
+        std::swap(avail, other.avail);
+        std::swap(wr_ptr, other.wr_ptr);
+        std::swap(state, other.state);
+    }
+
+    SplitPtrBlock(SplitPtrBlock &&other)
+        : SplitPtrBlock()
+    {
+        swap(other);
+    }
+};
+
+struct Allocation {
+    // Address to write to (the one returned by the allocation function)
+    void *wr_addr;
+    // Runtime address
+    void *rt_addr;
+    size_t sz;
+    bool relocated;
+};
+
+template<bool exec>
+class ROAllocator {
+protected:
+    static constexpr int nblocks = 8;
+    SplitPtrBlock blocks[nblocks];
+    // Blocks that are done allocating (removed from `blocks`)
+    // but might not have all the permissions set or data copied yet.
+    SmallVector<SplitPtrBlock, 16> completed;
+    virtual void *get_wr_ptr(SplitPtrBlock &block, void *rt_ptr,
+                             size_t size, size_t align) = 0;
+    virtual SplitPtrBlock alloc_block(size_t size) = 0;
+public:
+    virtual ~ROAllocator() {}
+    virtual void finalize()
+    {
+        if (exec) {
+            for (auto &alloc: allocations) {
+                sys::Memory::InvalidateInstructionCache(alloc.rt_addr,
+                                                        alloc.sz);
+            }
+        }
+        completed.clear();
+        allocations.clear();
+    }
+    // Allocations that have not been finalized yet.
+    SmallVector<Allocation, 16> allocations;
+    void *alloc(size_t size, size_t align)
+    {
+        size_t min_size = (size_t)-1;
+        int min_id = 0;
+        for (int i = 0;i < nblocks && blocks[i].ptr;i++) {
+            auto &block = blocks[i];
+            void *ptr = block.alloc(size, align);
+            if (ptr) {
+                void *wr_ptr;
+                if (block.state & SplitPtrBlock::InitAlloc) {
+                    wr_ptr = ptr;
+                }
+                else {
+                    wr_ptr = get_wr_ptr(block, ptr, size, align);
+                }
+                block.state |= SplitPtrBlock::Alloc;
+                allocations.push_back(Allocation{wr_ptr, ptr, size, false});
+                return wr_ptr;
+            }
+            if (block.avail < min_size) {
+                min_size = block.avail;
+                min_id = i;
+            }
+        }
+        size_t block_size = get_block_size(size);
+        auto &block = blocks[min_id];
+        auto new_block = alloc_block(block_size);
+        block.swap(new_block);
+        if (new_block.state) {
+            completed.push_back(std::move(new_block));
+        }
+        else {
+            new_block.reset(nullptr, 0);
+        }
+        void *ptr = block.alloc(size, align);
+#ifdef _OS_WINDOWS_
+        block.state = SplitPtrBlock::Alloc;
+        void *wr_ptr = get_wr_ptr(block, ptr, size, align);
+        allocations.push_back(Allocation{wr_ptr, ptr, size, false});
+        ptr = wr_ptr;
+#else
+        block.state = SplitPtrBlock::Alloc | SplitPtrBlock::InitAlloc;
+        allocations.push_back(Allocation{ptr, ptr, size, false});
+#endif
+        return ptr;
+    }
+};
+
+template<bool exec>
+class DualMapAllocator : public ROAllocator<exec> {
+protected:
+    void *get_wr_ptr(SplitPtrBlock &block, void *rt_ptr, size_t, size_t) override
+    {
+        assert((char*)rt_ptr >= block.ptr &&
+               (char*)rt_ptr < (block.ptr + block.total));
+        if (!(block.state & SplitPtrBlock::WRInit)) {
+            block.wr_ptr = (uintptr_t)create_shared_map(block.total,
+                                                        block.wr_ptr);
+            block.state |= SplitPtrBlock::WRInit;
+        }
+        if (!(block.state & SplitPtrBlock::WRReady)) {
+            protect_page((void*)block.wr_ptr, block.total, Prot::RW);
+            block.state |= SplitPtrBlock::WRReady;
+        }
+        return (char*)rt_ptr + (block.wr_ptr - uintptr_t(block.ptr));
+    }
+    SplitPtrBlock alloc_block(size_t size) override
+    {
+        SplitPtrBlock new_block;
+        // use `wr_ptr` to record the id initially
+        auto ptr = alloc_shared_page(size, (size_t*)&new_block.wr_ptr, exec);
+        new_block.reset(ptr, size);
+        return new_block;
+    }
+    void finalize_block(SplitPtrBlock &block, bool reset)
+    {
+        // This function handles setting the block to the right mode
+        // and freeing maps that are not needed anymore.
+        // If `reset` is `true`, we won't allocate in this block anymore
+        // and we should free up resources that are not needed at runtime.
+        if (!(block.state & SplitPtrBlock::Alloc)) {
+            // For a block that was not used this time, check whether we
+            // need to free it.
+            if ((block.state & SplitPtrBlock::WRInit) && reset)
+                unmap_page((void*)block.wr_ptr, block.total);
+            return;
+        }
+        // For a block we used this time
+        if (block.state & SplitPtrBlock::InitAlloc) {
+            // For an initial block, we have a single RW map.
+            // Need to map it to RO or RX.
+            assert(!(block.state & (SplitPtrBlock::WRReady |
+                                    SplitPtrBlock::WRInit)));
+            protect_page(block.ptr, block.total, exec ? Prot::RX : Prot::RO);
+            block.state = 0;
+        }
+        else {
+            // For other ones, the runtime address has the correct mode.
+            // Need to map the write address to RO.
+            assert(block.state & SplitPtrBlock::WRInit);
+            assert(block.state & SplitPtrBlock::WRReady);
+            if (reset) {
+                unmap_page((void*)block.wr_ptr, block.total);
+            }
+            else {
+                protect_page((void*)block.wr_ptr, block.total, Prot::RO);
+                block.state = SplitPtrBlock::WRInit;
+            }
+        }
+    }
+public:
+    DualMapAllocator()
+    {
+        assert(anon_hdl != -1);
+    }
+    void finalize() override
+    {
+        for (auto &block : this->blocks) {
+            finalize_block(block, false);
+        }
+        for (auto &block : this->completed) {
+            finalize_block(block, true);
+            block.reset(nullptr, 0);
+        }
+        ROAllocator<exec>::finalize();
+    }
+};
+
+#ifdef _OS_LINUX_
+template<bool exec>
+class SelfMemAllocator : public ROAllocator<exec> {
+    SmallVector<Block, 16> temp_buff;
+protected:
+    void *get_wr_ptr(SplitPtrBlock &block, void *rt_ptr,
+                     size_t size, size_t align) override
+    {
+        assert(!(block.state & SplitPtrBlock::InitAlloc));
+        for (auto &wr_block: temp_buff) {
+            if (void *ptr = wr_block.alloc(size, align)) {
+                return ptr;
+            }
+        }
+        temp_buff.emplace_back();
+        Block &new_block = temp_buff.back();
+        size_t block_size = get_block_size(size);
+        new_block.reset(map_anon_page(block_size), block_size);
+        return new_block.alloc(size, align);
+    }
+    SplitPtrBlock alloc_block(size_t size) override
+    {
+        SplitPtrBlock new_block;
+        new_block.reset(map_anon_page(size), size);
+        return new_block;
+    }
+    void finalize_block(SplitPtrBlock &block, bool reset)
+    {
+        if (!(block.state & SplitPtrBlock::Alloc))
+            return;
+        if (block.state & SplitPtrBlock::InitAlloc) {
+            // For an initial block, we need to map it to RO or RX.
+            assert(!(block.state & (SplitPtrBlock::WRReady |
+                                    SplitPtrBlock::WRInit)));
+            protect_page(block.ptr, block.total, exec ? Prot::RX : Prot::RO);
+            block.state = 0;
+        }
+    }
+public:
+    SelfMemAllocator()
+        : ROAllocator<exec>(),
+          temp_buff()
+    {
+        assert(self_mem_fd != -1);
+    }
+    void finalize() override
+    {
+        for (auto &block : this->blocks) {
+            finalize_block(block, false);
+        }
+        for (auto &block : this->completed) {
+            finalize_block(block, true);
+            block.reset(nullptr, 0);
+        }
+        for (auto &alloc : this->allocations) {
+            if (alloc.rt_addr == alloc.wr_addr)
+                continue;
+            write_self_mem(alloc.rt_addr, alloc.wr_addr, alloc.sz);
+        }
+        // Clear all the temp buffers except the first one
+        // (we expect only one).
+        bool cached = false;
+        for (auto &block : temp_buff) {
+            if (cached) {
+                munmap(block.ptr, block.total);
+                block.ptr = nullptr;
+                block.total = block.avail = 0;
+            }
+            else {
+                block.avail = block.total;
+                cached = true;
+            }
+        }
+        if (cached)
+            temp_buff.resize(1);
+        ROAllocator<exec>::finalize();
+    }
+};
+#endif // _OS_LINUX_
+
+class RTDyldMemoryManagerJL : public SectionMemoryManager {
+    struct EHFrame {
+        uint8_t *addr;
+        size_t size;
+    };
+    RTDyldMemoryManagerJL(const RTDyldMemoryManagerJL&) = delete;
+    void operator=(const RTDyldMemoryManagerJL&) = delete;
+    SmallVector<EHFrame, 16> pending_eh;
+    RWAllocator rw_alloc;
+    std::unique_ptr<ROAllocator<false>> ro_alloc;
+    std::unique_ptr<ROAllocator<true>> exe_alloc;
+    bool code_allocated;
+
+public:
+    RTDyldMemoryManagerJL()
+        : SectionMemoryManager(),
+          pending_eh(),
+          rw_alloc(),
+          ro_alloc(),
+          exe_alloc(),
+          code_allocated(false)
+    {
+#ifdef _OS_LINUX_
+        if (!ro_alloc && init_self_mem() != -1) {
+            ro_alloc.reset(new SelfMemAllocator<false>());
+            exe_alloc.reset(new SelfMemAllocator<true>());
+        }
+#endif
+        if (!ro_alloc && init_shared_map() != -1) {
+            ro_alloc.reset(new DualMapAllocator<false>());
+            exe_alloc.reset(new DualMapAllocator<true>());
+        }
+    }
+    ~RTDyldMemoryManagerJL() override
+    {
+    }
+    void registerEHFrames(uint8_t *Addr, uint64_t LoadAddr,
+                          size_t Size) override;
+    void deregisterEHFrames(uint8_t *Addr, uint64_t LoadAddr,
+                            size_t Size) override;
+    uint8_t *allocateCodeSection(uintptr_t Size, unsigned Alignment,
+                                 unsigned SectionID,
+                                 StringRef SectionName) override;
+    uint8_t *allocateDataSection(uintptr_t Size, unsigned Alignment,
+                                 unsigned SectionID, StringRef SectionName,
+                                 bool isReadOnly) override;
+#ifdef LLVM38
+    void notifyObjectLoaded(RuntimeDyld &Dyld,
+                            const object::ObjectFile &Obj) override;
+#endif
+    bool finalizeMemory(std::string *ErrMsg = nullptr) override;
+    template <typename DL, typename Alloc>
+    void mapAddresses(DL &Dyld, Alloc &&allocator)
+    {
+        for (auto &alloc: allocator->allocations) {
+            if (alloc.rt_addr == alloc.wr_addr || alloc.relocated)
+                continue;
+            alloc.relocated = true;
+            Dyld.mapSectionAddress(alloc.wr_addr, (uintptr_t)alloc.rt_addr);
+        }
+    }
+    template <typename DL>
+    void mapAddresses(DL &Dyld)
+    {
+        if (!ro_alloc)
+            return;
+        mapAddresses(Dyld, ro_alloc);
+        mapAddresses(Dyld, exe_alloc);
+    }
+#ifdef _OS_WINDOWS_
+    template <typename Alloc>
+    void *lookupWriteAddressFor(void *rt_addr, Alloc &&allocator)
+    {
+        for (auto &alloc: allocator->allocations) {
+            if (alloc.rt_addr == rt_addr) {
+                return alloc.wr_addr;
+            }
+        }
+        return nullptr;
+    }
+    void *lookupWriteAddressFor(void *rt_addr)
+    {
+        if (!ro_alloc)
+            return rt_addr;
+        if (void *ptr = lookupWriteAddressFor(rt_addr, ro_alloc))
+            return ptr;
+        if (void *ptr = lookupWriteAddressFor(rt_addr, exe_alloc))
+            return ptr;
+        return rt_addr;
+    }
+#endif // _OS_WINDOWS_
+};
+
+uint8_t *RTDyldMemoryManagerJL::allocateCodeSection(uintptr_t Size,
+                                                    unsigned Alignment,
+                                                    unsigned SectionID,
+                                                    StringRef SectionName)
+{
+    // Allocating more than one code section can confuse libunwind.
+    assert(!code_allocated);
+    code_allocated = true;
+    if (exe_alloc)
+        return (uint8_t*)exe_alloc->alloc(Size, Alignment);
+    return SectionMemoryManager::allocateCodeSection(Size, Alignment, SectionID,
+                                                     SectionName);
+}
+
+uint8_t *RTDyldMemoryManagerJL::allocateDataSection(uintptr_t Size,
+                                                    unsigned Alignment,
+                                                    unsigned SectionID,
+                                                    StringRef SectionName,
+                                                    bool isReadOnly)
+{
+    if (!isReadOnly)
+        return (uint8_t*)rw_alloc.alloc(Size, Alignment);
+    if (ro_alloc)
+        return (uint8_t*)ro_alloc->alloc(Size, Alignment);
+    return SectionMemoryManager::allocateDataSection(Size, Alignment, SectionID,
+                                                     SectionName, isReadOnly);
+}
+
+#ifdef LLVM38
+void RTDyldMemoryManagerJL::notifyObjectLoaded(RuntimeDyld &Dyld,
+                                               const object::ObjectFile &Obj)
+{
+    if (!ro_alloc) {
+        assert(!exe_alloc);
+        SectionMemoryManager::notifyObjectLoaded(Dyld, Obj);
+        return;
+    }
+    assert(exe_alloc);
+    mapAddresses(Dyld);
+}
+#endif
+
+bool RTDyldMemoryManagerJL::finalizeMemory(std::string *ErrMsg)
+{
+    code_allocated = false;
+    if (ro_alloc) {
+        ro_alloc->finalize();
+        assert(exe_alloc);
+        exe_alloc->finalize();
+        for (auto &frame: pending_eh)
+            register_eh_frames(frame.addr, frame.size);
+        pending_eh.clear();
+        return false;
+    }
+    else {
+        assert(!exe_alloc);
+        return SectionMemoryManager::finalizeMemory(ErrMsg);
+    }
+}
+
+void RTDyldMemoryManagerJL::registerEHFrames(uint8_t *Addr,
+                                             uint64_t LoadAddr,
+                                             size_t Size)
+{
+    if (uintptr_t(Addr) == LoadAddr) {
+        register_eh_frames(Addr, Size);
+    }
+    else {
+        pending_eh.push_back(EHFrame{(uint8_t*)(uintptr_t)LoadAddr, Size});
+    }
+}
+
+void RTDyldMemoryManagerJL::deregisterEHFrames(uint8_t *Addr,
+                                               uint64_t LoadAddr,
+                                               size_t Size)
+{
+    deregister_eh_frames((uint8_t*)LoadAddr, Size);
+}
+
+}
+
+#ifndef LLVM38
+void notifyObjectLoaded(RTDyldMemoryManager *memmgr,
+                        llvm::orc::ObjectLinkingLayerBase::ObjSetHandleT H)
+{
+    ((RTDyldMemoryManagerJL*)memmgr)->mapAddresses(**H);
+}
+#endif
+
+#ifdef _OS_WINDOWS_
+void *lookupWriteAddressFor(RTDyldMemoryManager *memmgr, void *rt_addr)
+{
+    return ((RTDyldMemoryManagerJL*)memmgr)->lookupWriteAddressFor(rt_addr);
+}
+#endif
+
+#else // LLVM37
+typedef SectionMemoryManager RTDyldMemoryManagerJL;
+#endif // LLVM37
+
+RTDyldMemoryManager* createRTDyldMemoryManager()
+{
+    return new RTDyldMemoryManagerJL();
+}
+#endif // USE_MCJIT
diff --git a/src/codegen.cpp b/src/codegen.cpp
index 30aaa00312cc3..869fc5e21ee97 100644
--- a/src/codegen.cpp
+++ b/src/codegen.cpp
@@ -340,9 +340,6 @@ static GlobalVariable *jldll_var;
 JITMemoryManager *createJITMemoryManagerWin();
 #endif
 #endif //_OS_WINDOWS_
-#ifdef USE_MCJIT
-RTDyldMemoryManager *createRTDyldMemoryManager();
-#endif
 
 static Function *jltls_states_func;
 #ifndef JULIA_ENABLE_THREADING
diff --git a/src/codegen_internal.h b/src/codegen_internal.h
index f939edc118807..07388aa5dcb31 100644
--- a/src/codegen_internal.h
+++ b/src/codegen_internal.h
@@ -1,5 +1,11 @@
 // This file is a part of Julia. License is MIT: http://julialang.org/license
 
+#if defined(LLVM37) && !defined(LLVM38)
+# include <llvm/ExecutionEngine/Orc/ObjectLinkingLayer.h>
+void notifyObjectLoaded(RTDyldMemoryManager *memmgr,
+                        llvm::orc::ObjectLinkingLayerBase::ObjSetHandleT H);
+#endif
+
 // Declarations for disasm.cpp
 extern "C"
 void jl_dump_asm_internal(uintptr_t Fptr, size_t Fsize, int64_t slide,
@@ -28,8 +34,16 @@ extern bool jl_dylib_DI_for_fptr(size_t pointer, const object::ObjectFile **obje
                                  bool onlySysImg, bool *isSysImg, void **saddr, char **name, char **filename);
 
 #ifdef USE_ORCJIT
-extern JL_DLLEXPORT void ORCNotifyObjectEmitted(JITEventListener *Listener,
-                                                const object::ObjectFile &obj,
-                                                const object::ObjectFile &debugObj,
-                                                const RuntimeDyld::LoadedObjectInfo &L);
+JL_DLLEXPORT void ORCNotifyObjectEmitted(JITEventListener *Listener,
+                                         const object::ObjectFile &obj,
+                                         const object::ObjectFile &debugObj,
+                                         const RuntimeDyld::LoadedObjectInfo &L,
+                                         RTDyldMemoryManager *memmgr);
+#ifdef _OS_WINDOWS_
+void *lookupWriteAddressFor(RTDyldMemoryManager *memmgr, void *rt_addr);
+#endif
+#endif
+
+#ifdef USE_MCJIT
+RTDyldMemoryManager* createRTDyldMemoryManager(void);
 #endif
diff --git a/src/debuginfo.cpp b/src/debuginfo.cpp
index a62e6a81a8721..10184e317c85e 100644
--- a/src/debuginfo.cpp
+++ b/src/debuginfo.cpp
@@ -298,12 +298,13 @@ class JuliaJITEventListener: public JITEventListener
     virtual void NotifyObjectEmitted(const object::ObjectFile &obj,
                                      const RuntimeDyld::LoadedObjectInfo &L)
     {
-        return _NotifyObjectEmitted(obj,obj,L);
+        return _NotifyObjectEmitted(obj,obj,L,nullptr);
     }
 
     virtual void _NotifyObjectEmitted(const object::ObjectFile &obj,
-                                      const object::ObjectFile &debugObj,
-                                      const RuntimeDyld::LoadedObjectInfo &L)
+                                      const object::ObjectFile &debugObj,
+                                      const RuntimeDyld::LoadedObjectInfo &L,
+                                      RTDyldMemoryManager *memmgr)
 #else
     virtual void NotifyObjectEmitted(const ObjectImage &obj)
 #endif
@@ -340,6 +341,7 @@ class JuliaJITEventListener: public JITEventListener
         uint64_t SectionAddrCheck = 0; // assert that all of the Sections are at the same location
         uint8_t *UnwindData = NULL;
 #if defined(_CPU_X86_64_)
+        uint64_t SectionLoadOffset = 1; // The real offset shouldn't be 1.
         uint8_t *catchjmp = NULL;
         for (const object::SymbolRef &sym_iter : debugObj.symbols()) {
             StringRef sName;
@@ -400,25 +402,37 @@ class JuliaJITEventListener: public JITEventListener
                     assert(SectionAddrCheck == SectionLoadAddr);
                 else
                     SectionAddrCheck = SectionLoadAddr;
+#ifdef USE_ORCJIT
+                if (memmgr)
+                    SectionAddr =
+                        (uintptr_t)lookupWriteAddressFor(memmgr,
+                                                         (void*)SectionLoadAddr);
+#endif
+                if (SectionLoadOffset != 1)
+                    assert(SectionLoadOffset == SectionAddr - SectionLoadAddr);
+                else
+                    SectionLoadOffset = SectionAddr - SectionLoadAddr;
             }
         }
         assert(catchjmp);
         assert(UnwindData);
         assert(SectionAddrCheck);
-        catchjmp[0] = 0x48;
-        catchjmp[1] = 0xb8; // mov RAX, QWORD PTR [&_seh_exception_handle]
-        *(uint64_t*)(&catchjmp[2]) = (uint64_t)&_seh_exception_handler;
-        catchjmp[10] = 0xff;
-        catchjmp[11] = 0xe0; // jmp RAX
-        UnwindData[0] = 0x09; // version info, UNW_FLAG_EHANDLER
-        UnwindData[1] = 4;    // size of prolog (bytes)
-        UnwindData[2] = 2;    // count of unwind codes (slots)
-        UnwindData[3] = 0x05; // frame register (rbp) = rsp
-        UnwindData[4] = 4;    // second instruction
-        UnwindData[5] = 0x03; // mov RBP, RSP
-        UnwindData[6] = 1;    // first instruction
-        UnwindData[7] = 0x50; // push RBP
-        *(DWORD*)&UnwindData[8] = (DWORD)(catchjmp - (uint8_t*)SectionAddrCheck); // relative location of catchjmp
+        assert(SectionLoadOffset != 1);
+        catchjmp[SectionLoadOffset] = 0x48;
+        catchjmp[SectionLoadOffset + 1] = 0xb8; // mov RAX, QWORD PTR [&_seh_exception_handle]
+        *(uint64_t*)(&catchjmp[SectionLoadOffset + 2]) =
+            (uint64_t)&_seh_exception_handler;
+        catchjmp[SectionLoadOffset + 10] = 0xff;
+        catchjmp[SectionLoadOffset + 11] = 0xe0; // jmp RAX
+        UnwindData[SectionLoadOffset] = 0x09; // version info, UNW_FLAG_EHANDLER
+        UnwindData[SectionLoadOffset + 1] = 4;    // size of prolog (bytes)
+        UnwindData[SectionLoadOffset + 2] = 2;    // count of unwind codes (slots)
+        UnwindData[SectionLoadOffset + 3] = 0x05; // frame register (rbp) = rsp
+        UnwindData[SectionLoadOffset + 4] = 4;    // second instruction
+        UnwindData[SectionLoadOffset + 5] = 0x03; // mov RBP, RSP
+        UnwindData[SectionLoadOffset + 6] = 1;    // first instruction
+        UnwindData[SectionLoadOffset + 7] = 0x50; // push RBP
+        *(DWORD*)&UnwindData[SectionLoadOffset + 8] = (DWORD)(catchjmp - (uint8_t*)SectionAddrCheck); // relative location of catchjmp
 #endif // defined(_OS_X86_64_)
 #endif // defined(_OS_WINDOWS_)
@@ -606,9 +620,10 @@ class JuliaJITEventListener: public JITEventListener
 JL_DLLEXPORT void ORCNotifyObjectEmitted(JITEventListener *Listener,
                                          const object::ObjectFile &obj,
                                          const object::ObjectFile &debugObj,
-                                         const RuntimeDyld::LoadedObjectInfo &L)
+                                         const RuntimeDyld::LoadedObjectInfo &L,
+                                         RTDyldMemoryManager *memmgr)
 {
-    ((JuliaJITEventListener*)Listener)->_NotifyObjectEmitted(obj,debugObj,L);
+    ((JuliaJITEventListener*)Listener)->_NotifyObjectEmitted(obj,debugObj,L,memmgr);
 }
 #endif
@@ -1665,7 +1680,7 @@ void register_eh_frames(uint8_t *Addr, size_t Size)
             if (start < start_ip)
                 start_ip = start;
             if (end_ip < (start + size))
-                end_ip = start+size;
+                end_ip = start + size;
             table[cur_entry].fde_offset = safe_trunc((intptr_t)Entry - (intptr_t)Addr);
             start_ips[cur_entry] = start;
diff --git a/src/jitlayers.cpp b/src/jitlayers.cpp
index 69bab17e9f9a2..cd1203b2586de 100644
--- a/src/jitlayers.cpp
+++ b/src/jitlayers.cpp
@@ -131,44 +131,13 @@ static void addOptimizationPasses(T *PM)
     //PM->add(createCFGSimplificationPass()); // Merge & remove BBs
 }
 
-#ifdef USE_MCJIT
-#ifdef LLVM37
-class RTDyldMemoryManagerJL : public SectionMemoryManager {
-    RTDyldMemoryManagerJL(const RTDyldMemoryManagerJL&) = delete;
-    void operator=(const RTDyldMemoryManagerJL&) = delete;
-
-public:
-    RTDyldMemoryManagerJL() {};
-    ~RTDyldMemoryManagerJL() override {};
-    void registerEHFrames(uint8_t *Addr, uint64_t LoadAddr, size_t Size) override;
-    void deregisterEHFrames(uint8_t *Addr, uint64_t LoadAddr, size_t Size) override;
-};
-
-void RTDyldMemoryManagerJL::registerEHFrames(uint8_t *Addr,
-                                             uint64_t LoadAddr,
-                                             size_t Size)
-{
-    register_eh_frames(Addr, Size);
-}
-
-void RTDyldMemoryManagerJL::deregisterEHFrames(uint8_t *Addr,
-                                               uint64_t LoadAddr,
-                                               size_t Size)
-{
-    deregister_eh_frames(Addr, Size);
-}
-#else
-typedef SectionMemoryManager RTDyldMemoryManagerJL;
-#endif
+#ifdef USE_ORCJIT
 
-RTDyldMemoryManager* createRTDyldMemoryManager()
-{
-    return new RTDyldMemoryManagerJL();
-}
+#ifndef LLVM38
+void notifyObjectLoaded(RTDyldMemoryManager *memmgr,
+                        llvm::orc::ObjectLinkingLayerBase::ObjSetHandleT H);
 #endif
 
-#ifdef USE_ORCJIT
-
 // ------------------------ TEMPORARILY COPIED FROM LLVM -----------------
 // This must be kept in sync with gdb/gdb/jit.h .
 extern "C" {
@@ -259,6 +228,9 @@ class JuliaOJIT {
         void operator()(ObjectLinkingLayerBase::ObjSetHandleT H,
                         const ObjSetT &Objects, const LoadResult &LOS)
         {
+#ifndef LLVM38
+            notifyObjectLoaded(JIT.MemMgr, H);
+#endif
             auto oit = Objects.begin();
             auto lit = LOS.begin();
             for (; oit != Objects.end(); ++oit, ++lit) {
@@ -290,7 +262,7 @@ class JuliaOJIT {
             ORCNotifyObjectEmitted(JuliaListener.get(),
                                    *Object,
                                    *SavedObjects.back().getBinary(),
-                                   *LO);
+                                   *LO, JIT.MemMgr);
 
             // record all of the exported symbols defined in this object
             // in the primary hash table for the enclosing JIT
@@ -416,7 +388,7 @@ class JuliaOJIT {
     }
 
-    ModuleHandleT addModule(std::unique_ptr<Module> M)
+    void addModule(std::unique_ptr<Module> M)
     {
 #ifndef NDEBUG
         // validate the relocations for M
@@ -461,7 +433,6 @@ class JuliaOJIT {
         // Force LLVM to emit the module so that we can register the symbols
        // in our lookup table.
         CompileLayer.emitAndFinalize(modset);
-        return modset;
     }
 
     void removeModule(ModuleHandleT H)
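
The sketches below are illustrative notes appended by the editor; they are not part of the patch. This first one shows the dual-mapping idea behind `DualMapAllocator` in isolation: one anonymous file is mapped twice, once writable and once executable, so JIT-ed code can be patched through the first view while its runtime address is never writable. It assumes Linux with `memfd_create` and x86-64; the names (`dualmap_demo`, `dualmap-demo`) are made up for the demo.

    // dualmap_demo.cpp -- illustrative sketch only, not part of the patch.
    // Build: c++ dualmap_demo.cpp -o dualmap_demo   (Linux, x86-64)
    #include <sys/mman.h>
    #include <sys/syscall.h>
    #include <unistd.h>
    #include <cstring>
    #include <cstdio>

    int main()
    {
        size_t pgsz = (size_t)sysconf(_SC_PAGESIZE);
        // One anonymous in-memory file backs both views (cf. get_anon_hdl).
        int fd = (int)syscall(SYS_memfd_create, "dualmap-demo", 0);
        if (fd == -1 || ftruncate(fd, pgsz) == -1)
            return 1;
        // Write view (RW) and run view (RX) of the same physical page.
        char *wr = (char*)mmap(nullptr, pgsz, PROT_READ | PROT_WRITE,
                               MAP_SHARED, fd, 0);
        char *rx = (char*)mmap(nullptr, pgsz, PROT_READ | PROT_EXEC,
                               MAP_SHARED, fd, 0);
        if (wr == MAP_FAILED || rx == MAP_FAILED)
            return 1;
        // x86-64 machine code for `int f() { return 42; }`:
        //     mov eax, 42 ; ret
        static const unsigned char code[] = {0xb8, 0x2a, 0x00, 0x00, 0x00, 0xc3};
        memcpy(wr, code, sizeof(code));     // patch through the RW view
        int (*f)() = (int (*)())(void*)rx;  // execute from the RX view
        printf("%d\n", f());                // prints 42
        return 0;
    }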
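The `SelfMemAllocator` path relies on the fact that `pwrite` on `/proc/self/mem` is performed by the kernel and can bypass the target mapping's protection, which is why `init_self_mem` probes a test write before trusting it. A minimal sketch of the same probe-then-write pattern (again illustrative, Linux only; hardened kernels may refuse the write, exactly the case the patch's probe catches):

    // procmem_demo.cpp -- illustrative sketch only, not part of the patch.
    #include <sys/mman.h>
    #include <fcntl.h>
    #include <unistd.h>
    #include <cstdint>
    #include <cstdio>

    int main()
    {
        size_t pgsz = (size_t)sysconf(_SC_PAGESIZE);
        // A page that is deliberately not writable through normal stores.
        uint8_t *page = (uint8_t*)mmap(nullptr, pgsz, PROT_READ,
                                       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        if (page == MAP_FAILED)
            return 1;
        int fd = open("/proc/self/mem", O_RDWR);
        if (fd == -1)
            return 1;
        uint8_t byte = 0x2a;
        // The offset into /proc/self/mem is the target virtual address;
        // the kernel performs the store even though the page is PROT_READ.
        if (pwrite(fd, &byte, 1, (off_t)(uintptr_t)page) != 1)
            return 1; // e.g. a hardened kernel refused the write
        printf("0x%02x\n", page[0]); // prints 0x2a
        close(fd);
        return 0;
    }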
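Finally, the `avail & (-align)` expression in `Block::alloc` rounds the remaining free space down to a multiple of `align`; because allocations are carved from the end of the block (`ptr + total - aligned_avail`) and the block's end is page-aligned, the returned pointer stays aligned. A small self-checking sketch of that arithmetic (assumes `align` is a power of two, as the allocator does; the helper name `bump_from_end` is invented for the demo):

    // bump_align_demo.cpp -- illustrative sketch only, not part of the patch.
    #include <cassert>
    #include <cstdint>
    #include <cstdio>

    // Mirrors Block::alloc's arithmetic: carve `size` bytes, aligned to
    // `align`, from the end of the remaining space of a block whose end
    // address is a multiple of `align`.
    static uintptr_t bump_from_end(uintptr_t base, size_t total,
                                   size_t *avail, size_t size, size_t align)
    {
        size_t aligned_avail = *avail & (-align); // round free space down
        if (aligned_avail < size)
            return 0; // not enough room
        uintptr_t p = base + total - aligned_avail;
        *avail = aligned_avail - size;
        return p;
    }

    int main()
    {
        uintptr_t base = 0x10000; // pretend page-aligned block
        size_t avail = 4096;
        uintptr_t a = bump_from_end(base, 4096, &avail, 100, 16);
        uintptr_t b = bump_from_end(base, 4096, &avail, 100, 16);
        assert(a && b && a % 16 == 0 && b % 16 == 0 && b > a);
        printf("a=%#lx b=%#lx avail=%zu\n",
               (unsigned long)a, (unsigned long)b, avail);
        return 0;
    }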