INFO 11-20 12:36:49 custom_all_reduce_utils.py:204] generating GPU P2P access cache in /root/.cache/vllm/gpu_p2p_access_cache_for_0,1,2,3.json
[rank0]: Traceback (most recent call last):
[rank0]: File "/vllm/vllm/distributed/device_communicators/custom_all_reduce_utils.py", line 227, in gpu_p2p_access_check
[rank0]: returned.check_returncode()
[rank0]: File "/usr/lib/python3.10/subprocess.py", line 457, in check_returncode
[rank0]: raise CalledProcessError(self.returncode, self.args, self.stdout,
[rank0]: subprocess.CalledProcessError: Command '['/usr/bin/python', '/lpai/volumes/data-yyj/gzkp/vllm/vllm/distributed/device_communicators/custom_all_reduce_utils.py']' returned non-zero exit status 1.
[rank0]: The above exception was the direct cause of the following exception:
[rank0]: Traceback (most recent call last):
[rank0]: File "/usr/lib/python3.10/runpy.py", line 196, in _run_module_as_main
[rank0]: return _run_code(code, main_globals, None,
[rank0]: File "/usr/lib/python3.10/runpy.py", line 86, in _run_code
[rank0]: exec(code, run_globals)
[rank0]: File "/vllm/vllm/entrypoints/openai/api_server.py", line 676, in <module>
[rank0]: uvloop.run(run_server(args))
[rank0]: File "/usr/local/lib/python3.10/dist-packages/uvloop/__init__.py", line 82, in run
[rank0]: return loop.run_until_complete(wrapper())
[rank0]: File "uvloop/loop.pyx", line 1517, in uvloop.loop.Loop.run_until_complete
[rank0]: File "/usr/local/lib/python3.10/dist-packages/uvloop/__init__.py", line 61, in wrapper
[rank0]: return await main
[rank0]: File "/vllm/vllm/entrypoints/openai/api_server.py", line 643, in run_server
[rank0]: async with build_async_engine_client(args) as engine_client:
[rank0]: File "/usr/lib/python3.10/contextlib.py", line 199, in __aenter__
[rank0]: return await anext(self.gen)
[rank0]: File "/vllm/vllm/entrypoints/openai/api_server.py", line 107, in build_async_engine_client
[rank0]: async with build_async_engine_client_from_engine_args(
[rank0]: File "/usr/lib/python3.10/contextlib.py", line 199, in __aenter__
[rank0]: return await anext(self.gen)
[rank0]: File "/vllm/vllm/entrypoints/openai/api_server.py", line 141, in build_async_engine_client_from_engine_args
[rank0]: engine_client = await asyncio.get_running_loop().run_in_executor(
[rank0]: File "/usr/lib/python3.10/concurrent/futures/thread.py", line 58, in run
[rank0]: result = self.fn(*self.args, **self.kwargs)
[rank0]: File "/vllm/vllm/engine/async_llm_engine.py", line 682, in from_engine_args
[rank0]: engine = cls(
[rank0]: File "vllm/vllm/engine/async_llm_engine.py", line 577, in __init__
[rank0]: self.engine = self._engine_class(*args, **kwargs)
[rank0]: File "/vllm/vllm/engine/async_llm_engine.py", line 263, in __init__
[rank0]: super().__init__(*args, **kwargs)
[rank0]: File "/vllm/vllm/engine/llm_engine.py", line 347, in __init__
[rank0]: self.model_executor = executor_class(vllm_config=vllm_config, )
[rank0]: File "/vllm/vllm/executor/multiproc_gpu_executor.py", line 215, in __init__
[rank0]: super().__init__(*args, **kwargs)
[rank0]: File "/vllm/vllm/executor/distributed_gpu_executor.py", line 26, in __init__
[rank0]: super().__init__(*args, **kwargs)
[rank0]: File "/vllm/vllm/executor/executor_base.py", line 36, in __init__
[rank0]: self._init_executor()
[rank0]: File "/vllm/vllm/executor/multiproc_gpu_executor.py", line 110, in _init_executor
[rank0]: self._run_workers("init_device")
[rank0]: File "/vllm/vllm/executor/multiproc_gpu_executor.py", line 192, in _run_workers
[rank0]: driver_worker_output = driver_worker_method(*args, **kwargs)
[rank0]: File "/vllm/vllm/worker/worker.py", line 148, in init_device
[rank0]: init_worker_distributed_environment(self.parallel_config, self.rank,
[rank0]: File "/vllm/vllm/worker/worker.py", line 465, in init_worker_distributed_environment
[rank0]: ensure_model_parallel_initialized(parallel_config.tensor_parallel_size,
[rank0]: File "/vllm/vllm/distributed/parallel_state.py", line 1091, in ensure_model_parallel_initialized
[rank0]: initialize_model_parallel(tensor_model_parallel_size,
[rank0]: File "/vllm/vllm/distributed/parallel_state.py", line 1055, in initialize_model_parallel
[rank0]: _TP = init_model_parallel_group(group_ranks,
[rank0]: File "/vllm/vllm/distributed/parallel_state.py", line 896, in init_model_parallel_group
[rank0]: return GroupCoordinator(
[rank0]: File "/vllm/vllm/distributed/parallel_state.py", line 233, in __init__
[rank0]: self.ca_comm = CustomAllreduce(
[rank0]: File "/vllm/vllm/distributed/device_communicators/custom_all_reduce.py", line 140, in __init__
[rank0]: if not _can_p2p(rank, world_size):
[rank0]: File "/vllm/vllm/distributed/device_communicators/custom_all_reduce.py", line 35, in _can_p2p
[rank0]: if not gpu_p2p_access_check(rank, i):
[rank0]: File "/vllm/vllm/distributed/device_communicators/custom_all_reduce_utils.py", line 230, in gpu_p2p_access_check
[rank0]: raise RuntimeError(
[rank0]: RuntimeError: Error happened when batch testing peer-to-peer access from (0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3) to (0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3):
[rank0]: Traceback (most recent call last):
[rank0]: File "/vllm/vllm/distributed/device_communicators/custom_all_reduce_utils.py", line 14, in <module>
[rank0]: import vllm.envs as envs
[rank0]: ModuleNotFoundError: No module named 'vllm'
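The last frames show that the peer-to-peer check spawns custom_all_reduce_utils.py as a separate process under /usr/bin/python, and that interpreter cannot import vllm. A minimal diagnostic sketch for narrowing this down (the interpreter list and the direct torch P2P query are illustrative, not part of the original log) is:

import subprocess
import sys

import torch

# The traceback shows the P2P probe being spawned with /usr/bin/python.
# Check whether that interpreter, and the one running this script, can import vllm.
for interpreter in ("/usr/bin/python", sys.executable):
    result = subprocess.run(
        [interpreter, "-c", "import vllm; print(vllm.__file__)"],
        capture_output=True,
        text=True,
    )
    output = result.stdout.strip() or result.stderr.strip()
    print(f"{interpreter}: {output}")

# Query CUDA peer-to-peer capability between every pair of visible GPUs,
# independently of vLLM's cached check.
count = torch.cuda.device_count()
for src in range(count):
    for dst in range(count):
        if src != dst:
            print(f"P2P {src} -> {dst}: {torch.cuda.can_device_access_peer(src, dst)}")

If the interpreter running vLLM can import vllm but /usr/bin/python cannot, the probe is simply executing in an environment where the package is not installed.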
Your current environment
How would you like to use vllm
I want to run inference with Meta-Llama-3.1-70B-Instruct-quantized.w8a16. The shell command is:

and I hit the error shown in the traceback above.
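For reference, the launch this corresponds to, given the api_server entrypoint and the four GPUs in the P2P check, would be along these lines; the model path and flags below are illustrative assumptions, not the exact command used:

python -m vllm.entrypoints.openai.api_server --model /path/to/Meta-Llama-3.1-70B-Instruct-quantized.w8a16 --tensor-parallel-size 4  # illustrative, not the original command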