Error
*** Aborted at 1752285219 (Unix time, try 'date -d @1752285219') ***
*** Signal 11 (SIGSEGV) (0x3f800000) received by PID 2085122 (pthread TID 0x7f83c5ffe640) (linux TID 2086542) (code: address not mapped to object), stack trace: ***
@ 000000000b285025 folly::symbolizer::(anonymous namespace)::signalHandler(int, siginfo_t*, void*)
./fbcode/folly/debugging/symbolizer/SignalHandler.cpp:486
@ 000000000003ebef (unknown)
@ 0000000000059809 c10::cuda::CUDAKernelLaunchRegistry::has_failed() const
@ 000000000005a83c c10::cuda::c10_cuda_check_implementation(int, char const*, char const*, int, bool)
@ 00000000011f1555 c10d::ProcessGroupNCCL::WorkNCCL::finishedGPUExecutionInternal() const
@ 00000000011fe8bf c10d::ProcessGroupNCCL::WorkNCCL::isCompleted()
@ 0000000001200556 c10d::ProcessGroupNCCL::watchdogHandler()
@ 00000000012016ec c10d::ProcessGroupNCCL::ncclCommWatchdog()
@ 00000000000145bf execute_native_thread_routine
@ 000000000008a4d9 start_thread
@ 000000000010f54f __clone3
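The segfault happens on the ProcessGroupNCCL watchdog thread: ncclCommWatchdog polls WorkNCCL::isCompleted(), which lands in c10_cuda_check_implementation / CUDAKernelLaunchRegistry::has_failed() and faults on an unmapped address (0x3f800000). That pattern is consistent with a shutdown race where the watchdog is still polling CUDA while the process is being torn down by pm.stop() in the repro below, though that diagnosis is only a guess from the trace.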
Reproduction:
from monarch.actor import Actor, endpoint, MonarchContext, proc_mesh
import os
import asyncio
import socket
import torch


class TestAllGatherActor(Actor):
    def __init__(self, size) -> None:
        super().__init__()
        self.size = size

    @endpoint
    def init(self):
        torch.cuda.set_device(int(os.environ["LOCAL_RANK"]))
        torch.distributed.init_process_group()
        self.tensor = torch.zeros(size=(self.size,), dtype=torch.int64, device="cuda")

    @endpoint
    def all_gather(self):
        output = torch.zeros(
            size=(self.tensor.shape[0] * self.size,),
            dtype=self.tensor.dtype,
            device=self.tensor.device,
        )
        torch.distributed.all_gather_into_tensor(output, self.tensor, async_op=False)
        return output


class SetupActor(Actor):
    @endpoint  # type: ignore
    def setup_env(self, master_addr: str, master_port: int):
        ctx = MonarchContext.get()
        rank = ctx.point.rank
        world_size = len(ctx.point)
        local_world_size = 8
        local_rank = rank % local_world_size
        env_vars = {
            "WORLD_SIZE": str(world_size),
            "RANK": str(rank),
            "MASTER_ADDR": master_addr,
            "MASTER_PORT": str(master_port),
            "LOCAL_WORLD_SIZE": str(local_world_size),
            "LOCAL_RANK": str(local_rank),
        }
        os.environ.update(env_vars)

    @endpoint  # type: ignore
    def get_hostname(self) -> str:
        return socket.gethostname()


async def _main():
    pm = await proc_mesh(gpus=4)
    setup_actor = pm.spawn("setup", SetupActor).get()
    master_addr = (
        await setup_actor.flatten("anon").slice(anon=0).get_hostname.call_one()
    )
    master_port = 20122
    await setup_actor.setup_env.call(master_addr, master_port)
    am = await pm.spawn("foo", TestAllGatherActor, size=4)
    await am.init.call()
    await am.all_gather.call()
    await pm.stop()


asyncio.run(_main())
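A possible mitigation, assuming the crash really is a teardown race (untested sketch; the shutdown endpoint below is hypothetical and not part of the original repro): destroy the NCCL process group explicitly before stopping the mesh, so the watchdog thread exits before the process goes down.

    # Hypothetical cleanup endpoint on TestAllGatherActor (assumption: tearing down
    # the process group stops the NCCL watchdog before pm.stop() kills the process).
    @endpoint
    def shutdown(self):
        torch.distributed.destroy_process_group()

    # ...and in _main(), before stopping the mesh:
    #     await am.shutdown.call()
    #     await pm.stop()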
Labels: bug