Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Olli/api extension #66

Merged
merged 60 commits into from
May 4, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
60 commits
Select commit Hold shift + click to select a range
0bc3c3e
Core API teasing out WIP
olsaarik Apr 22, 2023
35ade68
IB in cpp style WIP
chhwang Apr 23, 2023
e4ee2eb
WIP Connection in C++
olsaarik Apr 25, 2023
90a8860
Registered memory (de)serialization and Connection work
olsaarik Apr 26, 2023
d746201
WIP builds, but doesn't link
olsaarik Apr 26, 2023
5443ed1
ConnectionSetup stuff
olsaarik Apr 26, 2023
7a865d9
merged with saemal/api-extension
Apr 26, 2023
9c6e685
connectionSetup() for IBConnection
olsaarik Apr 26, 2023
7c87ca3
Missing functions and TODOs
olsaarik Apr 27, 2023
d096874
TODO updates
olsaarik Apr 27, 2023
0e9f6fa
TODOs
olsaarik Apr 27, 2023
47d4606
Add registerMemory
olsaarik Apr 27, 2023
08e80f1
IB: completely replaced with C++ interfaces
chhwang Apr 27, 2023
7641038
wip
Apr 27, 2023
7913d90
Merge branch 'olli/api-extension' of https://github.com/microsoft/msc…
Apr 27, 2023
c24896b
bootstrap to the communicator
Apr 27, 2023
b0c7e86
Communicator owns IB contexts
chhwang Apr 27, 2023
df80d88
connect test
Apr 27, 2023
4d7a4a2
Merge branch 'olli/api-extension' of https://github.com/microsoft/msc…
Apr 27, 2023
8eda636
testing connection setup
Apr 27, 2023
06c6df2
Separate out Transport and TransportFlags
olsaarik Apr 27, 2023
aaa3f0e
host hashes in communicator
Apr 27, 2023
e18e26d
tests for host hash
Apr 27, 2023
afc5887
moving the debug info into other levels
Apr 27, 2023
82c2762
ipc uses a base ptr now
Apr 27, 2023
2ead25d
INFO for IPC handle opened
Apr 27, 2023
cbfc218
registered buffer test
Apr 27, 2023
962e63b
deserializing registered memory is failing -- commented out
Apr 27, 2023
fa0fcb4
Lazy CUDA IPC handle opening
olsaarik Apr 28, 2023
821ba7a
Fix compilation
olsaarik Apr 28, 2023
cbefe38
aad conn write test
Binyang2014 Apr 28, 2023
750c40b
Fix
Binyang2014 Apr 28, 2023
04e8784
Work on a channel service
olsaarik Apr 28, 2023
7d1f038
fixes for ib send/recv tests
Apr 29, 2023
1c9dacd
fix ib test -- ib polling is running into issues
Apr 29, 2023
88426ad
bug fix for ib memory registeration
May 1, 2023
8a5a787
test bug fix
May 1, 2023
5b7e76c
all tests are passing with memory registeration
May 1, 2023
961f5b3
more debbuging info + testing 1000 memory registerations
May 2, 2023
6aa023e
moving serializer outside
May 2, 2023
fe2b778
flushing the full cq
May 2, 2023
358c3d6
Generalize connectionSetup() into setup()
olsaarik May 2, 2023
c7b7d20
Export epoch header
olsaarik May 2, 2023
66ce01b
Make NonblockingFuture copyable
olsaarik May 2, 2023
c44b48b
Epoch non-copyable
olsaarik May 2, 2023
a4e6ffe
epoch creation
May 2, 2023
fc12947
fixing flush for IB
May 2, 2023
4ba8516
allgather_test_cpp functional again
olsaarik May 2, 2023
54d1e18
testing writes with signal is passing
May 2, 2023
6002a52
solved merge conflict
May 2, 2023
81e7d1b
Channels work
olsaarik May 3, 2023
39666f9
Quick fix
olsaarik May 3, 2023
4a41c19
Fix performance bug and base pointer offset
olsaarik May 3, 2023
7af6879
removing old mscclppComm_t comm from communicator
May 3, 2023
518f325
kernel 2 is also performant
May 3, 2023
503cdd5
CMake build system transition WIP
olsaarik Apr 13, 2023
09d5f7c
Fixes for cmake
olsaarik May 4, 2023
bd2121a
CMake improvement
olsaarik May 4, 2023
d710360
Only build C++ tests in CMake
olsaarik May 4, 2023
ddc9e68
Add ib_test to CMake
olsaarik May 4, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 32 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
# Top-level build script for the mscclpp shared library and its tests.
cmake_minimum_required(VERSION 3.26)

project(mscclpp LANGUAGES CUDA CXX)

# Build both host (C++) and device (CUDA) code as C++17.
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CUDA_STANDARD 17)

# Make the project's own Find<Package>.cmake modules visible to find_package().
list(APPEND CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake/modules)

# CUDA, IB Verbs and NUMA are mandatory; GDRCopy is optional and is only
# linked further below when it was found.
find_package(CUDAToolkit REQUIRED)
find_package(IBVerbs REQUIRED)
find_package(NUMA REQUIRED)
find_package(GDRCopy)

# MPI is required only when the tests are configured to use it (default ON).
option(USE_MPI_FOR_TESTS "Use MPI for tests" ON)
if(USE_MPI_FOR_TESTS)
find_package(MPI REQUIRED)
add_definitions(-DMSCCLPP_USE_MPI_FOR_TESTS)
endif()

include_directories(${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES})

# The library target is declared empty here; src/CMakeLists.txt attaches the
# source files to it via target_sources().
add_library(mscclpp SHARED)
add_subdirectory(src) # This adds the sources to the mscclpp target
target_include_directories(mscclpp PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/src/include)
set_target_properties(mscclpp PROPERTIES LINKER_LANGUAGE CXX)
target_link_libraries(mscclpp PRIVATE MSCCLPP::ibverbs MSCCLPP::numa CUDA::cudart CUDA::cuda_driver)
# MSCCLPP::gdrcopy only exists when find_package(GDRCopy) succeeded.
if(GDRCOPY_FOUND)
target_link_libraries(mscclpp PRIVATE MSCCLPP::gdrcopy)
endif()

add_subdirectory(tests)
9 changes: 5 additions & 4 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ endif

NVCUFLAGS := -ccbin $(CXX) $(NVCC_GENCODE) -std=c++11 --expt-extended-lambda -Xfatbin -compress-all
# Use addprefix so that we can specify more than one path
NVLDFLAGS := -L$(CUDA_LIB) -lcudart -lrt
NVLDFLAGS := -L$(CUDA_LIB) -lcudart -lrt -lcuda

ifeq ($(DEBUG), 0)
NVCUFLAGS += -O3
Expand Down Expand Up @@ -120,7 +120,8 @@ LDFLAGS := $(NVLDFLAGS) $(GDRCOPY_LDFLAGS) -libverbs -lnuma

LIBSRCS := $(addprefix src/,debug.cc utils.cc init.cc proxy.cc ib.cc config.cc)
LIBSRCS += $(addprefix src/bootstrap/,bootstrap.cc socket.cc)
LIBSRCS += $(addprefix src/,communicator.cc fifo.cc host_connection.cc proxy_cpp.cc basic_proxy_handler.cc)
LIBSRCS += $(addprefix src/,communicator.cc connection.cc registered_memory.cc)
LIBSRCS += $(addprefix src/,epoch.cc proxy_cpp.cc fifo.cc channel.cc)
ifneq ($(NPKIT), 0)
LIBSRCS += $(addprefix src/misc/,npkit.cc)
endif
Expand All @@ -134,7 +135,7 @@ HEADERS := $(wildcard src/include/*.h)
CPPSOURCES := $(shell find ./ -regextype posix-extended -regex '.*\.(c|cpp|h|hpp|cc|cxx|cu)' -not -path "./build/*" -not -path "./python/*")
PYTHONCPPSOURCES := $(shell find ./python/src/ -regextype posix-extended -regex '.*\.(c|cpp|h|hpp|cc|cxx|cu)')

INCEXPORTS := mscclpp.h mscclppfifo.h mscclpp.hpp mscclppfifo.hpp
INCEXPORTS := mscclpp.h mscclppfifo.h mscclpp.hpp mscclppfifo.hpp epoch.hpp
INCTARGETS := $(INCEXPORTS:%=$(BUILDDIR)/$(INCDIR)/%)

LIBNAME := libmscclpp.so
Expand All @@ -148,7 +149,7 @@ UTOBJTARGETS := $(UTOBJS:%=$(BUILDDIR)/$(OBJDIR)/%)
UTBINS := $(patsubst %.o,$(BUILDDIR)/$(BINDIR)/%,$(UTOBJS))

TESTSDIR := tests
TESTSSRCS := $(addprefix $(TESTSDIR)/,bootstrap_test.cc allgather_test_standalone.cu allgather_test_cpp.cu bootstrap_test_cpp.cc)
TESTSSRCS := $(addprefix $(TESTSDIR)/,bootstrap_test.cc allgather_test_standalone.cu communicator_test_cpp.cu bootstrap_test_cpp.cc allgather_test_cpp.cu)
TESTSOBJS := $(patsubst %.cc,%.o,$(TESTSSRCS)) $(patsubst %.cu,%.o,$(TESTSSRCS))
TESTSOBJTARGETS := $(TESTSOBJS:%=$(BUILDDIR)/$(OBJDIR)/%)
TESTSBINS := $(patsubst %.o,$(BUILDDIR)/$(BINDIR)/%,$(TESTSOBJS))
Expand Down
8 changes: 8 additions & 0 deletions TODO.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
# Core API extraction

- Add a test for host side Communicator/RegisteredMemory/Connection use.
- Implement a standalone "epoch" synchronization construct that can be used as a component in custom proxies. epoch.hpp/cc has the beginnings of this.
- Reimplement the "standard" proxy service + DeviceConnection on top of the new Communicator/RegisteredMemory/Connection core API. Remnants of the old code are in channel.hpp, basic_proxy_handler.hpp/cc and host_connection.hpp/cc. Probably need a manager class to wrap all of this.
- Change the new IBConnection and Communicator to use the new C++ IbCtx and IbQp classes.
- Implement IbQp::~IbQp()
- Fix RegisteredMemory::Impl::Impl to get the IPC handle from the base pointer, not the derived pointer.
41 changes: 41 additions & 0 deletions cmake/modules/FindGDRCopy.cmake
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
# Find the GDRCopy libraries
#
# The following variables are optionally searched for defaults
# GDRCOPY_ROOT_DIR: Base directory where all GDRCopy components are found
# GDRCOPY_INCLUDE_DIR: Directory where GDRCopy headers are found
# GDRCOPY_LIB_DIR: Directory where GDRCopy libraries are found

# The following are set after configuration is done:
# GDRCOPY_FOUND
# GDRCOPY_INCLUDE_DIRS
# GDRCOPY_LIBRARIES

# An imported target MSCCLPP::gdrcopy is created if the library is found.

# Locate the directory that contains gdrapi.h. The result is stored in
# GDRCOPY_INCLUDE_DIRS; GDRCOPY_INCLUDE_DIR / GDRCOPY_ROOT_DIR are optional
# user-supplied hints only.
find_path(GDRCOPY_INCLUDE_DIRS
  NAMES gdrapi.h
  HINTS
  ${GDRCOPY_INCLUDE_DIR}
  ${GDRCOPY_ROOT_DIR}
  ${GDRCOPY_ROOT_DIR}/include)

# Locate the gdrapi library. The result is stored in GDRCOPY_LIBRARIES.
find_library(GDRCOPY_LIBRARIES
  NAMES gdrapi
  HINTS
  ${GDRCOPY_LIB_DIR}
  ${GDRCOPY_ROOT_DIR}
  ${GDRCOPY_ROOT_DIR}/lib)

# Sets GDRCOPY_FOUND when both the header directory and the library were found.
include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(GDRCopy DEFAULT_MSG GDRCOPY_INCLUDE_DIRS GDRCOPY_LIBRARIES)
# Hide the hint as well as the actual find_path()/find_library() results.
mark_as_advanced(GDRCOPY_INCLUDE_DIR GDRCOPY_INCLUDE_DIRS GDRCOPY_LIBRARIES)

if(GDRCOPY_FOUND)
  if(NOT TARGET MSCCLPP::gdrcopy)
    add_library(MSCCLPP::gdrcopy UNKNOWN IMPORTED)
  endif()
  # Use the directory found by find_path() (GDRCOPY_INCLUDE_DIRS). The hint
  # variable GDRCOPY_INCLUDE_DIR is empty unless the user set it, which would
  # leave the imported target without an include path.
  set_target_properties(MSCCLPP::gdrcopy PROPERTIES
    INTERFACE_INCLUDE_DIRECTORIES "${GDRCOPY_INCLUDE_DIRS}"
    IMPORTED_LINK_INTERFACE_LANGUAGES "C"
    IMPORTED_LOCATION "${GDRCOPY_LIBRARIES}")
endif()
41 changes: 41 additions & 0 deletions cmake/modules/FindIBVerbs.cmake
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
# Find the IB Verbs libraries
#
# The following variables are optionally searched for defaults
# IBVERBS_ROOT_DIR: Base directory where all ibverbs components are found
# IBVERBS_INCLUDE_DIR: Directory where ibverbs headers are found
# IBVERBS_LIB_DIR: Directory where ibverbs libraries are found

# The following are set after configuration is done:
# IBVERBS_FOUND
# IBVERBS_INCLUDE_DIRS
# IBVERBS_LIBRARIES

# An imported target MSCCLPP::ibverbs is created if the library is found.

# Locate the directory that contains infiniband/verbs.h. The result is stored
# in IBVERBS_INCLUDE_DIRS; IBVERBS_INCLUDE_DIR / IBVERBS_ROOT_DIR are optional
# user-supplied hints only.
find_path(IBVERBS_INCLUDE_DIRS
  NAMES infiniband/verbs.h
  HINTS
  ${IBVERBS_INCLUDE_DIR}
  ${IBVERBS_ROOT_DIR}
  ${IBVERBS_ROOT_DIR}/include)

# Locate the ibverbs library. The result is stored in IBVERBS_LIBRARIES.
find_library(IBVERBS_LIBRARIES
  NAMES ibverbs
  HINTS
  ${IBVERBS_LIB_DIR}
  ${IBVERBS_ROOT_DIR}
  ${IBVERBS_ROOT_DIR}/lib)

# Sets IBVERBS_FOUND when both the header directory and the library were found.
include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(IBVerbs DEFAULT_MSG IBVERBS_INCLUDE_DIRS IBVERBS_LIBRARIES)
# Hide the hint as well as the actual find_path()/find_library() results.
mark_as_advanced(IBVERBS_INCLUDE_DIR IBVERBS_INCLUDE_DIRS IBVERBS_LIBRARIES)

if(IBVERBS_FOUND)
  if(NOT TARGET MSCCLPP::ibverbs)
    add_library(MSCCLPP::ibverbs UNKNOWN IMPORTED)
  endif()
  # Use the directory found by find_path() (IBVERBS_INCLUDE_DIRS). The hint
  # variable IBVERBS_INCLUDE_DIR is empty unless the user set it, which would
  # leave the imported target without an include path.
  set_target_properties(MSCCLPP::ibverbs PROPERTIES
    INTERFACE_INCLUDE_DIRECTORIES "${IBVERBS_INCLUDE_DIRS}"
    IMPORTED_LINK_INTERFACE_LANGUAGES "C"
    IMPORTED_LOCATION "${IBVERBS_LIBRARIES}")
endif()
41 changes: 41 additions & 0 deletions cmake/modules/FindNUMA.cmake
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
# Find the numa libraries
#
# The following variables are optionally searched for defaults
# NUMA_ROOT_DIR: Base directory where all numa components are found
# NUMA_INCLUDE_DIR: Directory where numa headers are found
# NUMA_LIB_DIR: Directory where numa libraries are found

# The following are set after configuration is done:
# NUMA_FOUND
# NUMA_INCLUDE_DIRS
# NUMA_LIBRARIES

# An imported target MSCCLPP::numa is created if the library is found.

# Locate the directory that contains numa.h. The result is stored in
# NUMA_INCLUDE_DIRS; NUMA_INCLUDE_DIR / NUMA_ROOT_DIR are optional
# user-supplied hints only.
find_path(NUMA_INCLUDE_DIRS
  NAMES numa.h
  HINTS
  ${NUMA_INCLUDE_DIR}
  ${NUMA_ROOT_DIR}
  ${NUMA_ROOT_DIR}/include)

# Locate the numa library. The result is stored in NUMA_LIBRARIES.
find_library(NUMA_LIBRARIES
  NAMES numa
  HINTS
  ${NUMA_LIB_DIR}
  ${NUMA_ROOT_DIR}
  ${NUMA_ROOT_DIR}/lib)

# Sets NUMA_FOUND when both the header directory and the library were found.
include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(NUMA DEFAULT_MSG NUMA_INCLUDE_DIRS NUMA_LIBRARIES)
# Hide the hint as well as the actual find_path()/find_library() results.
mark_as_advanced(NUMA_INCLUDE_DIR NUMA_INCLUDE_DIRS NUMA_LIBRARIES)

if(NUMA_FOUND)
  if(NOT TARGET MSCCLPP::numa)
    add_library(MSCCLPP::numa UNKNOWN IMPORTED)
  endif()
  # Use the directory found by find_path() (NUMA_INCLUDE_DIRS). The hint
  # variable NUMA_INCLUDE_DIR is empty unless the user set it, which would
  # leave the imported target without an include path.
  set_target_properties(MSCCLPP::numa PROPERTIES
    INTERFACE_INCLUDE_DIRECTORIES "${NUMA_INCLUDE_DIRS}"
    IMPORTED_LINK_INTERFACE_LANGUAGES "C"
    IMPORTED_LOCATION "${NUMA_LIBRARIES}")
endif()
5 changes: 5 additions & 0 deletions src/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# Gather every .cc and .h file below this directory. CONFIGURE_DEPENDS asks
# the build system to re-check the glob on each build.
file(GLOB_RECURSE mscclpp_sources CONFIGURE_DEPENDS *.cc *.h)

# gdr.cc is excluded from the library; the glob resolves it to the full path
# form used in the list above so that REMOVE_ITEM can match it.
file(GLOB mscclpp_excluded_sources gdr.cc)
list(REMOVE_ITEM mscclpp_sources ${mscclpp_excluded_sources})

# Attach the collected files to the mscclpp target declared by the parent
# CMakeLists.txt.
target_sources(mscclpp PRIVATE ${mscclpp_sources})
29 changes: 0 additions & 29 deletions src/basic_proxy_handler.cc

This file was deleted.

10 changes: 5 additions & 5 deletions src/bootstrap/bootstrap.cc
Original file line number Diff line number Diff line change
Expand Up @@ -180,9 +180,8 @@ Bootstrap::Impl::~Impl()
}
}

void Bootstrap::Impl::getRemoteAddresses(mscclppSocket* listenSock,
std::vector<mscclppSocketAddress>& rankAddresses,
std::vector<mscclppSocketAddress>& rankAddressesRoot, int& rank)
void Bootstrap::Impl::getRemoteAddresses(mscclppSocket* listenSock, std::vector<mscclppSocketAddress>& rankAddresses,
std::vector<mscclppSocketAddress>& rankAddressesRoot, int& rank)
{
mscclppSocket sock;
ExtInfo info;
Expand Down Expand Up @@ -211,7 +210,7 @@ void Bootstrap::Impl::getRemoteAddresses(mscclppSocket* listenSock,
}

void Bootstrap::Impl::sendHandleToPeer(int peer, const std::vector<mscclppSocketAddress>& rankAddresses,
const std::vector<mscclppSocketAddress>& rankAddressesRoot)
const std::vector<mscclppSocketAddress>& rankAddressesRoot)
{
mscclppSocket sock;
int next = (peer + 1) % this->nRanks_;
Expand All @@ -226,7 +225,8 @@ void Bootstrap::Impl::bootstrapCreateRoot()
mscclppSocket listenSock;

// mscclppSocket* listenSock = new mscclppSocket(); // TODO(saemal) make this a shared ptr
MSCCLPPTHROW(mscclppSocketInit(&listenSock, &uniqueId_.addr, uniqueId_.magic, mscclppSocketTypeBootstrap, nullptr, 0));
MSCCLPPTHROW(
mscclppSocketInit(&listenSock, &uniqueId_.addr, uniqueId_.magic, mscclppSocketTypeBootstrap, nullptr, 0));
MSCCLPPTHROW(mscclppSocketListen(&listenSock));
MSCCLPPTHROW(mscclppSocketGetAddr(&listenSock, &uniqueId_.addr));
auto lambda = [this, listenSock]() { this->bootstrapRoot(listenSock); };
Expand Down
26 changes: 26 additions & 0 deletions src/channel.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
#include "channel.hpp"
#include "utils.h"
#include "checks.hpp"
#include "api.h"
#include "debug.h"

namespace mscclpp {
namespace channel {

/// Constructs the service for the given communicator.
///
/// The proxy is handed two callbacks that forward to this object: one that
/// passes each trigger to handleTrigger() and one that calls bindThread().
/// The NUMA node of the current CUDA device is then looked up and stored in
/// deviceNumaNode for later use by bindThread().
///
/// @param communicator Communicator this service keeps a reference to.
MSCCLPP_API_CPP DeviceChannelService::DeviceChannelService(Communicator& communicator)
  : communicator_(communicator),
    proxy_([this](ProxyTrigger triggerRaw) { return handleTrigger(triggerRaw); }, [this]() { bindThread(); })
{
  int currentDevice;
  CUDATHROW(cudaGetDevice(&currentDevice));
  MSCCLPPTHROW(getDeviceNumaNode(currentDevice, &deviceNumaNode));
}

/// Calls numaBind() with the NUMA node recorded in the constructor and logs
/// the chosen node. Does nothing when deviceNumaNode is negative.
MSCCLPP_API_CPP void DeviceChannelService::bindThread()
{
  // A negative node means there is nothing to bind to.
  if (deviceNumaNode < 0) {
    return;
  }
  MSCCLPPTHROW(numaBind(deviceNumaNode));
  INFO(MSCCLPP_INIT, "NUMA node of DeviceChannelService proxy thread is set to %d", deviceNumaNode);
}

} // namespace channel
} // namespace mscclpp
Loading