Skip to content

[Bug] CUDA cudaErrorSymbolNotFound during inference with Qwen3-30B-A3B-q4f16_1-MLC model on Tesla P100 #3231

@rankaiyx

Description

@rankaiyx

Description:
When attempting to run inference with the Qwen3-30B-A3B-q4f16_1-MLC model using the command:

mlc_llm serve --overrides "tensor_parallel_shards=4" /home/abc/model/Qwen3-30B-A3B-q4f16_1-MLC  

the process fails immediately with the error:

TVMError: after determining tmp storage requirements for inclusive_scan: cudaErrorSymbolNotFound: named symbol not found  

This issue occurs consistently across multiple runs. However, inference works fine for the Qwen3-32B-q4f16_1-MLC (dense) model using the same configuration.

Environment:

  • CUDA Version: 12.4
  • GPU: 4 x Tesla P100
  • MLC-LLM Version:
    mlc_llm_nightly_cu123-0.20.dev31-cp311-cp311-manylinux_2_28_x86_64.whl
    mlc_ai_nightly_cu123-0.20.dev147-cp311-cp311-manylinux_2_28_x86_64.whl
  • Command:
    mlc_llm serve --overrides "tensor_parallel_shards=4;gpu_memory_utilization=1" /home/abc/model/Qwen3-30B-A3B-q4f16_1-MLC  

Error Logs:
Full error stack trace:

Traceback (most recent call last):
  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "/home/abc/mlc-llm/lib/python3.11/site-packages/mlc_llm/cli/worker.py", line 58, in <module>
    main()
  File "/home/abc/mlc-llm/lib/python3.11/site-packages/mlc_llm/cli/worker.py", line 53, in main
    worker_func(worker_id, num_workers, num_groups, reader, writer)
  File "tvm/_ffi/_cython/./packed_func.pxi", line 339, in tvm._ffi._cy3.core.PackedFuncBase.__call__
  File "tvm/_ffi/_cython/./packed_func.pxi", line 284, in tvm._ffi._cy3.core.FuncCall
  File "tvm/_ffi/_cython/./base.pxi", line 185, in tvm._ffi._cy3.core.CHECK_CALL
  File "/home/abc/mlc-llm/lib/python3.11/site-packages/tvm/_ffi/base.py", line 468, in raise_last_ffi_error
    raise py_err
tvm._ffi.base.TVMError: Traceback (most recent call last):
  11: tvm::runtime::PackedFuncObj::Extractor<tvm::runtime::PackedFuncSubObj<tvm::runtime::TypedPackedFunc<void (int, int, int, long, long)>::AssignTypedLambda<void (*)(int, int, int, long, long)>(void (*)(int, int, int, long, long), std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >)::{lambda(tvm::runtime::TVMArgs const&, tvm::runtime::TVMRetValue*)#1}> >::Call(tvm::runtime::PackedFuncObj const*, tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*)
  10: tvm::runtime::WorkerProcess(int, int, int, long, long)
  9: tvm::runtime::DiscoWorker::Impl::MainLoop(tvm::runtime::DiscoWorker*)
  8: tvm::runtime::DiscoWorker::Impl::CallPacked(tvm::runtime::DiscoWorker*, long, tvm::runtime::PackedFunc, tvm::runtime::TVMArgs const&)
  7: tvm::runtime::relax_vm::VirtualMachineImpl::InvokeClosurePacked(tvm::runtime::ObjectRef const&, tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*)
  6: tvm::runtime::PackedFuncObj::Extractor<tvm::runtime::PackedFuncSubObj<tvm::runtime::relax_vm::VirtualMachineImpl::GetClosureInternal(tvm::runtime::String const&, bool)::{lambda(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*)#1}> >::Call(tvm::runtime::PackedFuncObj const*, tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*)
  5: tvm::runtime::relax_vm::VirtualMachineImpl::InvokeBytecode(long, std::vector<tvm::runtime::TVMRetValue, std::allocator<tvm::runtime::TVMRetValue> > const&)
  4: tvm::runtime::relax_vm::VirtualMachineImpl::RunLoop()
  3: tvm::runtime::relax_vm::VirtualMachineImpl::RunInstrCall(tvm::runtime::relax_vm::VMFrame*, tvm::runtime::relax_vm::Instruction)
  2: tvm::runtime::PackedFuncObj::Extractor<tvm::runtime::PackedFuncSubObj<tvm::runtime::relax_vm::__mk_TVM14::{lambda(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*)#1}> >::Call(tvm::runtime::PackedFuncObj const*, tvm::runtime::relax_vm::__mk_TVM14, tvm::runtime::TVMRetValue)
  1: tvm::runtime::PackedFuncObj::Extractor<tvm::runtime::PackedFuncSubObj<tvm::runtime::WrapPackedFunc(int (*)(TVMValue*, int*, int, TVMValue*, int*, void*), tvm::runtime::ObjectPtr<tvm::runtime::Object> const&)::{lambda(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*)#1}> >::Call(tvm::runtime::PackedFuncObj const*, tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*)
  0: TVMThrowLastError.cold
TVMError: after determining tmp storage requirements for inclusive_scan: cudaErrorSymbolNotFound: named symbol not found
Traceback (most recent call last):
  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "/home/abc/mlc-llm/lib/python3.11/site-packages/mlc_llm/cli/worker.py", line 58, in <module>
    main()
  File "/home/abc/mlc-llm/lib/python3.11/site-packages/mlc_llm/cli/worker.py", line 53, in main
    worker_func(worker_id, num_workers, num_groups, reader, writer)
  File "tvm/_ffi/_cython/./packed_func.pxi", line 339, in tvm._ffi._cy3.core.PackedFuncBase.__call__
  File "tvm/_ffi/_cython/./packed_func.pxi", line 284, in tvm._ffi._cy3.core.FuncCall
Traceback (most recent call last):
  File "tvm/_ffi/_cython/./base.pxi", line 185, in tvm._ffi._cy3.core.CHECK_CALL
  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "/home/abc/mlc-llm/lib/python3.11/site-packages/mlc_llm/cli/worker.py", line 58, in <module>
  File "/home/abc/mlc-llm/lib/python3.11/site-packages/tvm/_ffi/base.py", line 468, in raise_last_ffi_error
    main()
  File "/home/abc/mlc-llm/lib/python3.11/site-packages/mlc_llm/cli/worker.py", line 53, in main
    worker_func(worker_id, num_workers, num_groups, reader, writer)
    raise py_err
tvm._ffi.base.TVMError: Traceback (most recent call last):
  11: tvm::runtime::PackedFuncObj::Extractor<tvm::runtime::PackedFuncSubObj<tvm::runtime::TypedPackedFunc<void (int, int, int, long, long)>::AssignTypedLambda<void (*)(int, int, int, long, long)>(void (*)(int, int, int, long, long), std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >)::{lambda(tvm::runtime::TVMArgs const&, tvm::runtime::TVMRetValue*)#1}> >::Call(tvm::runtime::PackedFuncObj const*, tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*)
  10: tvm::runtime::WorkerProcess(int, int, int, long, long)
  9: tvm::runtime::DiscoWorker::Impl::MainLoop(tvm::runtime::DiscoWorker*)
  8: tvm::runtime::DiscoWorker::Impl::CallPacked(tvm::runtime::DiscoWorker*, long, tvm::runtime::PackedFunc, tvm::runtime::TVMArgs const&)
  7: tvm::runtime::relax_vm::VirtualMachineImpl::InvokeClosurePacked(tvm::runtime::ObjectRef const&, tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*)
  6: tvm::runtime::PackedFuncObj::Extractor<tvm::runtime::PackedFuncSubObj<tvm::runtime::relax_vm::VirtualMachineImpl::GetClosureInternal(tvm::runtime::String const&, bool)::{lambda(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*)#1}> >::Call(tvm::runtime::PackedFuncObj const*, tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*)
  5: tvm::runtime::relax_vm::VirtualMachineImpl::InvokeBytecode(long, std::vector<tvm::runtime::TVMRetValue, std::allocator<tvm::runtime::TVMRetValue> > const&)
  4: tvm::runtime::relax_vm::VirtualMachineImpl::RunLoop()
  3: tvm::runtime::relax_vm::VirtualMachineImpl::RunInstrCall(tvm::runtime::relax_vm::VMFrame*, tvm::runtime::relax_vm::Instruction)
  2: tvm::runtime::PackedFuncObj::Extractor<tvm::runtime::PackedFuncSubObj<tvm::runtime::relax_vm::__mk_TVM14::{lambda(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*)#1}> >::Call(tvm::runtime::PackedFuncObj const*, tvm::runtime::relax_vm::__mk_TVM14, tvm::runtime::TVMRetValue)
  1: tvm::runtime::PackedFuncObj::Extractor<tvm::runtime::PackedFuncSubObj<tvm::runtime::WrapPackedFunc(int (*)(TVMValue*, int*, int, TVMValue*, int*, void*), tvm::runtime::ObjectPtr<tvm::runtime::Object> const&)::{lambda(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*)#1}> >::Call(tvm::runtime::PackedFuncObj const*, tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*)
  0: TVMThrowLastError.cold
TVMError: after determining tmp storage requirements for inclusive_scan: cudaErrorSymbolNotFound: named symbol not found
  File "tvm/_ffi/_cython/./packed_func.pxi", line 339, in tvm._ffi._cy3.core.PackedFuncBase.__call__
  File "tvm/_ffi/_cython/./packed_func.pxi", line 284, in tvm._ffi._cy3.core.FuncCall
  File "tvm/_ffi/_cython/./base.pxi", line 185, in tvm._ffi._cy3.core.CHECK_CALL
  File "/home/abc/mlc-llm/lib/python3.11/site-packages/tvm/_ffi/base.py", line 468, in raise_last_ffi_error
    raise py_err
tvm._ffi.base.TVMError: Traceback (most recent call last):
  11: tvm::runtime::PackedFuncObj::Extractor<tvm::runtime::PackedFuncSubObj<tvm::runtime::TypedPackedFunc<void (int, int, int, long, long)>::AssignTypedLambda<void (*)(int, int, int, long, long)>(void (*)(int, int, int, long, long), std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >)::{lambda(tvm::runtime::TVMArgs const&, tvm::runtime::TVMRetValue*)#1}> >::Call(tvm::runtime::PackedFuncObj const*, tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*)
  10: tvm::runtime::WorkerProcess(int, int, int, long, long)
  9: tvm::runtime::DiscoWorker::Impl::MainLoop(tvm::runtime::DiscoWorker*)
  8: tvm::runtime::DiscoWorker::Impl::CallPacked(tvm::runtime::DiscoWorker*, long, tvm::runtime::PackedFunc, tvm::runtime::TVMArgs const&)
  7: tvm::runtime::relax_vm::VirtualMachineImpl::InvokeClosurePacked(tvm::runtime::ObjectRef const&, tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*)
  6: tvm::runtime::PackedFuncObj::Extractor<tvm::runtime::PackedFuncSubObj<tvm::runtime::relax_vm::VirtualMachineImpl::GetClosureInternal(tvm::runtime::String const&, bool)::{lambda(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*)#1}> >::Call(tvm::runtime::PackedFuncObj const*, tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*)
  5: tvm::runtime::relax_vm::VirtualMachineImpl::InvokeBytecode(long, std::vector<tvm::runtime::TVMRetValue, std::allocator<tvm::runtime::TVMRetValue> > const&)
  4: tvm::runtime::relax_vm::VirtualMachineImpl::RunLoop()
  3: tvm::runtime::relax_vm::VirtualMachineImpl::RunInstrCall(tvm::runtime::relax_vm::VMFrame*, tvm::runtime::relax_vm::Instruction)
  2: tvm::runtime::PackedFuncObj::Extractor<tvm::runtime::PackedFuncSubObj<tvm::runtime::relax_vm::__mk_TVM14::{lambda(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*)#1}> >::Call(tvm::runtime::PackedFuncObj const*, tvm::runtime::relax_vm::__mk_TVM14, tvm::runtime::TVMRetValue)
  1: tvm::runtime::PackedFuncObj::Extractor<tvm::runtime::PackedFuncSubObj<tvm::runtime::WrapPackedFunc(int (*)(TVMValue*, int*, int, TVMValue*, int*, void*), tvm::runtime::ObjectPtr<tvm::runtime::Object> const&)::{lambda(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*)#1}> >::Call(tvm::runtime::PackedFuncObj const*, tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*)
  0: TVMThrowLastError.cold
TVMError: after determining tmp storage requirements for inclusive_scan: cudaErrorSymbolNotFound: named symbol not found
[1]    76422 IOT instruction (core dumped)  mlc_llm chat --overrides "tensor_parallel_shards=4"

Metadata

Metadata

Assignees

No one assigned

    Labels

    bug: Confirmed bugs

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions