
[Question] Running this command produces the error below: python3 -m mlc_llm.build --model llamahf --target android --quantization q4f16_1 #2635


Description

@tombang

❓ General Questions

(venv_llama) tangben@tb:~/MLC/mlc-llm$ python3 -m mlc_llm.build --model llamahf --target android --quantization q4f16_1
Using path "dist/models/llamahf" for model "llamahf"
Target configured: opencl -keys=opencl,gpu -max_function_args=128 -max_num_threads=256 -max_shared_memory_per_block=16384 -max_threads_per_block=256 -texture_spatial_limit=16384 -thread_warp_size=1
Automatically using target for weight quantization: vulkan -keys=vulkan,gpu -max_num_threads=256 -max_shared_memory_per_block=32768 -max_threads_per_block=1024 -supports_16bit_buffer=1 -supports_float16=1 -supports_float32=1 -supports_int16=1 -supports_int32=1 -supports_int8=1 -thread_warp_size=1
Get old param: 0%| | 0/197 [00:00<?, ?tensors/s] | 0/327 [00:00<?, ?tensors/s]
Traceback (most recent call last):
File "/usr/lib/python3.10/runpy.py", line 196, in _run_module_as_main
return _run_code(code, main_globals, None,
File "/usr/lib/python3.10/runpy.py", line 86, in _run_code
exec(code, run_globals)
File "/home/tangben/MLC/mlc-llm/mlc_llm/build.py", line 47, in
main()
File "/home/tangben/MLC/mlc-llm/mlc_llm/build.py", line 43, in main
core.build_model_from_args(parsed_args)
File "/home/tangben/MLC/mlc-llm/mlc_llm/core.py", line 866, in build_model_from_args
params = utils.convert_weights(mod_transform, param_manager, params, args)
File "/home/tangben/MLC/mlc-llm/mlc_llm/utils.py", line 282, in convert_weights
ex = relax.build(mod_transform, target=target)
File "/home/tangben/MLC/mlc-llm/venv_llama/lib/python3.10/site-packages/tvm/relax/vm_build.py", line 341, in build
return _vmlink(
File "/home/tangben/MLC/mlc-llm/venv_llama/lib/python3.10/site-packages/tvm/relax/vm_build.py", line 247, in _vmlink
lib = tvm.build(
File "/home/tangben/MLC/mlc-llm/venv_llama/lib/python3.10/site-packages/tvm/driver/build_module.py", line 297, in build
rt_mod_host = _driver_ffi.tir_to_runtime(annotated_mods, target_host)
File "tvm/_ffi/_cython/./packed_func.pxi", line 332, in tvm._ffi._cy3.core.PackedFuncBase.call
File "tvm/_ffi/_cython/./packed_func.pxi", line 263, in tvm._ffi._cy3.core.FuncCall
File "tvm/_ffi/_cython/./packed_func.pxi", line 252, in tvm._ffi._cy3.core.FuncCall3
File "tvm/_ffi/_cython/./base.pxi", line 182, in tvm._ffi._cy3.core.CHECK_CALL
File "/home/tangben/MLC/mlc-llm/venv_llama/lib/python3.10/site-packages/tvm/_ffi/base.py", line 481, in raise_last_ffi_error
raise py_err
tvm._ffi.base.TVMError: Traceback (most recent call last):
  11: tvm::runtime::PackedFuncObj::Extractor<tvm::runtime::PackedFuncSubObj<tvm::runtime::TypedPackedFunc<tvm::runtime::Module (tvm::runtime::Map<tvm::Target, tvm::IRModule, void, void> const&, tvm::Target)>::AssignTypedLambda<tvm::__mk_TVM24::{lambda(tvm::runtime::Map<tvm::Target, tvm::IRModule, void, void> const&, tvm::Target)#1}>(tvm::__mk_TVM24::{lambda(tvm::runtime::Map<tvm::Target, tvm::IRModule, void, void> const&, tvm::Target)#1}, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >)::{lambda(tvm::runtime::TVMArgs const&, tvm::runtime::TVMRetValue*)#1}> >::Call(tvm::runtime::PackedFuncObj const*, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, tvm::runtime::TVMRetValue)
  10: tvm::TIRToRuntime(tvm::runtime::Map<tvm::Target, tvm::IRModule, void, void> const&, tvm::Target const&)
  9: tvm::SplitMixedModule(tvm::IRModule, tvm::Target const&, tvm::Target const&)
  8: tvm::ApplyPasses(tvm::IRModule, tvm::transform::Sequential)
  7: tvm::transform::Pass::operator()(tvm::IRModule) const
  6: tvm::transform::Pass::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
  5: tvm::transform::SequentialNode::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
  4: tvm::transform::Pass::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
  3: tvm::transform::ModulePassNode::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
  2: _ZN3tvm7runtime13PackedFun
  1: tvm::runtime::TypedPackedFunc<tvm::IRModule (tvm::IRModule, tvm::transform::PassContext)>::AssignTypedLambda<tvm::tir::transform::VerifyMemory()::{lambda(tvm::IRModule, tvm::transform::PassContext)#1}>(tvm::tir::transform::VerifyMemory()::{lambda(tvm::IRModule, tvm::transform::PassContext)#1})::{lambda(tvm::runtime::TVMArgs const&, tvm::runtime::TVMRetValue*)#1}::operator()(tvm::runtime::TVMArgs const&, tvm::runtime::TVMRetValue*) const
  0: _ZN3tvm7runtime6deta
Did you forget to bind?
    Variable `scale` is directly accessed by host memory (it is not contained in a thread environment or in the function arguments.
    Variable `lv` is directly accessed by host memory (it is not contained in a thread environment or in the function arguments.
    Variable `w_gathered` is directly accessed by host memory (it is not contained in a thread environment or in the function arguments.
    Variable `w_gathered` is directly accessed by host memory (it is not contained in a thread environment or in the function arguments.
    Variable `w_gathered` is directly accessed by host memory (it is not contained in a thread environment or in the function arguments.
    Variable `scale` is directly accessed by host memory (it is not contained in a thread environment or in the function arguments.
    Variable `lv` is directly accessed by host memory (it is not contained in a thread environment or in the function arguments.
File "/workspace/tvm/src/tir/analysis/verify_memory.cc", line 205
RuntimeError: Memory verification failed with the following errors:

# from tvm.script import tir as T

@T.prim_func
def encode(var_lv: T.handle, var_w_gathered: T.handle, var_scale: T.handle):
    T.func_attr({"target": T.target({"host": {"keys": ["cpu"], "kind": "llvm", "mtriple": "x86_64-redhat-linux-gnu", "tag": ""}, "keys": ["vulkan", "gpu"], "kind": "vulkan", "max_num_threads": 256, "max_shared_memory_per_block": 32768, "max_threads_per_block": 1024, "supports_16bit_buffer": 1, "supports_float16": 1, "supports_float32": T.bool(True), "supports_int16": 1, "supports_int32": T.bool(True), "supports_int8": 1, "tag": "", "thread_warp_size": 1}), "tir.is_scheduled": T.bool(True), "tir.noalias": T.bool(True)})
    vocab_size = T.int32()
    lv = T.match_buffer(var_lv, (vocab_size, 4096), "float16")
    w_gathered = T.match_buffer(var_w_gathered, (vocab_size, 512), "uint32")
    scale = T.match_buffer(var_scale, (vocab_size, 128), "float16")
    max_abs_value = T.allocate([vocab_size * 128], "float16", "global")
    max_abs_value_1 = T.Buffer((vocab_size * 128,), "float16", data=max_abs_value)
    lv_1 = T.Buffer((vocab_size * 4096,), "float16", data=lv.data)
    for i, j, k in T.grid(vocab_size, 128, 32):
        cse_var_1: T.int32 = i * 128 + j
        if k == 0:
            max_abs_value_1[cse_var_1] = T.float16(-65504)
        max_abs_value_1[cse_var_1] = T.max(max_abs_value_1[cse_var_1], T.fabs(lv_1[i * 4096 + j * 32 + k]))
    scale_1 = T.Buffer((vocab_size * 128,), "float16", data=scale.data)
    for i, j in T.grid(vocab_size, 128):
        cse_var_2: T.int32 = i * 128 + j
        scale_1[cse_var_2] = T.max(max_abs_value_1[cse_var_2], T.float16(0.0001)) * T.float16(0.14285714285714285)
    for i, j, k in T.grid(vocab_size, 512, 8):
        cse_var_3: T.int32 = i * 512 + j
        w_gathered_1 = T.Buffer((vocab_size * 512,), "uint32", data=w_gathered.data)
        if k == 0:
            w_gathered_1[cse_var_3] = T.uint32(0)
        w_gathered_1[cse_var_3] = T.bitwise_or(w_gathered_1[cse_var_3], T.shift_left(T.Cast("uint32", T.min(T.max(T.round(lv_1[i * 4096 + j * 8 + k] / scale_1[i * 128 + j // 4] + T.float16(7)), T.float16(0)), T.float16(14))), T.Cast("uint32", k) * T.uint32(4)))
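
For context, the "Did you forget to bind?" message comes from TVM's VerifyMemory pass: when a PrimFunc is compiled for a GPU target (here Vulkan, which was picked automatically for weight quantization), every buffer it touches must either be a function argument or be accessed inside a thread environment (blockIdx/threadIdx bindings). The dumped encode function above is plain serial loops with no thread bindings, which is exactly what the check rejects. The following is a minimal sketch of the same failure mode using TVM's legacy te schedule API; it is not the mlc_llm code path, and the names (A, B, n) and sizes are made up for illustration.

import tvm
from tvm import te

# Toy compute, not the mlc_llm `encode` kernel; assumes TVM was built with
# Vulkan codegen support (USE_VULKAN=ON).
n = 1024
A = te.placeholder((n,), dtype="float32", name="A")
B = te.compute((n,), lambda i: A[i] * 2.0, name="B")

s = te.create_schedule(B.op)
# With no thread bindings, building for a GPU target fails VerifyMemory with
# the same "Did you forget to bind?" error seen above:
#   tvm.build(s, [A, B], target="vulkan")

# Binding the loop to GPU thread axes satisfies the pass.
bx, tx = s[B].split(B.op.axis[0], factor=64)
s[B].bind(bx, te.thread_axis("blockIdx.x"))
s[B].bind(tx, te.thread_axis("threadIdx.x"))
mod = tvm.build(s, [A, B], target="vulkan")  # passes once the loop is bound

In the log above, the weight-quantization target was chosen automatically as Vulkan, and the encode function shown has no such thread bindings, which is why the same check fires during convert_weights.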
