❓ General Questions

Building llamahf for the android target with q4f16_1 quantization fails during weight conversion with a TVM memory verification error ("Did you forget to bind?"). Command and full output:
(venv_llama) tangben@tb:~/MLC/mlc-llm$ python3 -m mlc_llm.build --model llamahf --target android --quantization q4f16_1
Using path "dist/models/llamahf" for model "llamahf"
Target configured: opencl -keys=opencl,gpu -max_function_args=128 -max_num_threads=256 -max_shared_memory_per_block=16384 -max_threads_per_block=256 -texture_spatial_limit=16384 -thread_warp_size=1
Automatically using target for weight quantization: vulkan -keys=vulkan,gpu -max_num_threads=256 -max_shared_memory_per_block=32768 -max_threads_per_block=1024 -supports_16bit_buffer=1 -supports_float16=1 -supports_float32=1 -supports_int16=1 -supports_int32=1 -supports_int8=1 -thread_warp_size=1
Get old param: 0%| | 0/197 [00:00<?, ?tensors/s] | 0/327 [00:00<?, ?tensors/s]
Traceback (most recent call last):
File "/usr/lib/python3.10/runpy.py", line 196, in _run_module_as_main
return _run_code(code, main_globals, None,
File "/usr/lib/python3.10/runpy.py", line 86, in _run_code
exec(code, run_globals)
File "/home/tangben/MLC/mlc-llm/mlc_llm/build.py", line 47, in
main()
File "/home/tangben/MLC/mlc-llm/mlc_llm/build.py", line 43, in main
core.build_model_from_args(parsed_args)
File "/home/tangben/MLC/mlc-llm/mlc_llm/core.py", line 866, in build_model_from_args
params = utils.convert_weights(mod_transform, param_manager, params, args)
File "/home/tangben/MLC/mlc-llm/mlc_llm/utils.py", line 282, in convert_weights
ex = relax.build(mod_transform, target=target)
File "/home/tangben/MLC/mlc-llm/venv_llama/lib/python3.10/site-packages/tvm/relax/vm_build.py", line 341, in build
return _vmlink(
File "/home/tangben/MLC/mlc-llm/venv_llama/lib/python3.10/site-packages/tvm/relax/vm_build.py", line 247, in _vmlink
lib = tvm.build(
File "/home/tangben/MLC/mlc-llm/venv_llama/lib/python3.10/site-packages/tvm/driver/build_module.py", line 297, in build
rt_mod_host = _driver_ffi.tir_to_runtime(annotated_mods, target_host)
File "tvm/_ffi/_cython/./packed_func.pxi", line 332, in tvm._ffi._cy3.core.PackedFuncBase.call
File "tvm/_ffi/_cython/./packed_func.pxi", line 263, in tvm._ffi._cy3.core.FuncCall
File "tvm/_ffi/_cython/./packed_func.pxi", line 252, in tvm._ffi._cy3.core.FuncCall3
File "tvm/_ffi/_cython/./base.pxi", line 182, in tvm._ffi._cy3.core.CHECK_CALL
File "/home/tangben/MLC/mlc-llm/venv_llama/lib/python3.10/site-packages/tvm/_ffi/base.py", line 481, in raise_last_ffi_error
raise py_err
tvm._ffi.base.TVMError: Traceback (most recent call last):
11: tvm::runtime::PackedFuncObj::Extractor<tvm::runtime::PackedFuncSubObj<tvm::runtime::TypedPackedFunc<tvm::runtime::Module (tvm::runtime::Map<tvm::Target, tvm::IRModule, void, void> const&, tvm::Target)>::AssignTypedLambda<tvm::__mk_TVM24::{lambda(tvm::runtime::Map<tvm::Target, tvm::IRModule, void, void> const&, tvm::Target)#1}>(tvm::__mk_TVM24::{lambda(tvm::runtime::Map<tvm::Target, tvm::IRModule, void, void> const&, tvm::Target)#1}, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >)::{lambda(tvm::runtime::TVMArgs const&, tvm::runtime::TVMRetValue*)#1}> >::Call(tvm::runtime::PackedFuncObj const*, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, tvm::runtime::TVMRetValue)
10: tvm::TIRToRuntime(tvm::runtime::Map<tvm::Target, tvm::IRModule, void, void> const&, tvm::Target const&)
9: tvm::SplitMixedModule(tvm::IRModule, tvm::Target const&, tvm::Target const&)
8: tvm::ApplyPasses(tvm::IRModule, tvm::transform::Sequential)
7: tvm::transform::Pass::operator()(tvm::IRModule) const
6: tvm::transform::Pass::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
5: tvm::transform::SequentialNode::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
4: tvm::transform::Pass::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
3: tvm::transform::ModulePassNode::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
2: _ZN3tvm7runtime13PackedFun
1: tvm::runtime::TypedPackedFunc<tvm::IRModule (tvm::IRModule, tvm::transform::PassContext)>::AssignTypedLambda<tvm::tir::transform::VerifyMemory()::{lambda(tvm::IRModule, tvm::transform::PassContext)#1}>(tvm::tir::transform::VerifyMemory()::{lambda(tvm::IRModule, tvm::transform::PassContext)#1})::{lambda(tvm::runtime::TVMArgs const&, tvm::runtime::TVMRetValue*)#1}::operator()(tvm::runtime::TVMArgs const&, tvm::runtime::TVMRetValue*) const
0: _ZN3tvm7runtime6deta
Did you forget to bind?
Variable scale is directly accessed by host memory (it is not contained in a thread environment or in the function arguments.
Variable lv is directly accessed by host memory (it is not contained in a thread environment or in the function arguments.
Variable w_gathered is directly accessed by host memory (it is not contained in a thread environment or in the function arguments.
Variable w_gathered is directly accessed by host memory (it is not contained in a thread environment or in the function arguments.
Variable w_gathered is directly accessed by host memory (it is not contained in a thread environment or in the function arguments.
Variable scale is directly accessed by host memory (it is not contained in a thread environment or in the function arguments.
Variable lv is directly accessed by host memory (it is not contained in a thread environment or in the function arguments.
File "/workspace/tvm/src/tir/analysis/verify_memory.cc", line 205
RuntimeError: Memory verification failed with the following errors:
from tvm.script import tir as T
@T.prim_func
def encode(var_lv: T.handle, var_w_gathered: T.handle, var_scale: T.handle):
T.func_attr({"target": T.target({"host": {"keys": ["cpu"], "kind": "llvm", "mtriple": "x86_64-redhat-linux-gnu", "tag": ""}, "keys": ["vulkan", "gpu"], "kind": "vulkan", "max_num_threads": 256, "max_shared_memory_per_block": 32768, "max_threads_per_block": 1024, "supports_16bit_buffer": 1, "supports_float16": 1, "supports_float32": T.bool(True), "supports_int16": 1, "supports_int32": T.bool(True), "supports_int8": 1, "tag": "", "thread_warp_size": 1}), "tir.is_scheduled": T.bool(True), "tir.noalias": T.bool(True)})
vocab_size = T.int32()
lv = T.match_buffer(var_lv, (vocab_size, 4096), "float16")
w_gathered = T.match_buffer(var_w_gathered, (vocab_size, 512), "uint32")
scale = T.match_buffer(var_scale, (vocab_size, 128), "float16")
max_abs_value = T.allocate([vocab_size * 128], "float16", "global")
max_abs_value_1 = T.Buffer((vocab_size * 128,), "float16", data=max_abs_value)
lv_1 = T.Buffer((vocab_size * 4096,), "float16", data=lv.data)
for i, j, k in T.grid(vocab_size, 128, 32):
cse_var_1: T.int32 = i * 128 + j
if k == 0:
max_abs_value_1[cse_var_1] = T.float16(-65504)
max_abs_value_1[cse_var_1] = T.max(max_abs_value_1[cse_var_1], T.fabs(lv_1[i * 4096 + j * 32 + k]))
scale_1 = T.Buffer((vocab_size * 128,), "float16", data=scale.data)
for i, j in T.grid(vocab_size, 128):
cse_var_2: T.int32 = i * 128 + j
scale_1[cse_var_2] = T.max(max_abs_value_1[cse_var_2], T.float16(0.0001)) * T.float16(0.14285714285714285)
for i, j, k in T.grid(vocab_size, 512, 8):
cse_var_3: T.int32 = i * 512 + j
w_gathered_1 = T.Buffer((vocab_size * 512,), "uint32", data=w_gathered.data)
if k == 0:
w_gathered_1[cse_var_3] = T.uint32(0)
w_gathered_1[cse_var_3] = T.bitwise_or(w_gathered_1[cse_var_3], T.shift_left(T.Cast("uint32", T.min(T.max(T.round(lv_1[i * 4096 + j * 8 + k] / scale_1[i * 128 + j // 4] + T.float16(7)), T.float16(0)), T.float16(14))), T.Cast("uint32", k) * T.uint32(4)))
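For anyone reading the dump: the printed encode PrimFunc is the q4f16_1 weight-quantization kernel. Per row of the (vocab_size, 4096) float16 matrix it takes a max-abs over groups of 32 values, turns that into a scale (floored at 1e-4 and divided by 7), quantizes each value to an integer in [0, 14] around the zero point 7, and packs 8 such 4-bit codes into every uint32 of w_gathered. A rough NumPy sketch of the same math, written here only for illustration (it is not mlc-llm code; the function name and shapes are my own):

```python
import numpy as np

def encode_q4f16_1(lv: np.ndarray):
    """Illustrative sketch of the encode() PrimFunc printed above.

    lv: (vocab_size, 4096) float16 weights.
    Returns (w_gathered, scale): (vocab_size, 512) uint32, (vocab_size, 128) float16.
    """
    vocab_size, width = lv.shape
    groups = width // 32                                   # 128 scale groups per row

    # Per-group max-abs, floored at 1e-4 and divided by 7 -> scale.
    grouped = np.abs(lv).reshape(vocab_size, groups, 32)
    scale = np.maximum(grouped.max(axis=-1), np.float16(1e-4)) * np.float16(1.0 / 7.0)

    # Quantize to integers in [0, 14] around the zero point 7.
    q = np.clip(np.round(lv.reshape(vocab_size, groups, 32) / scale[..., None] + 7), 0, 14)
    q = q.astype(np.uint32).reshape(vocab_size, width // 8, 8)

    # Pack 8 four-bit codes into each uint32 (code k goes to bits 4k..4k+3).
    shifts = np.arange(8, dtype=np.uint32) * np.uint32(4)
    w_gathered = np.bitwise_or.reduce(q << shifts, axis=-1)
    return w_gathered, scale


if __name__ == "__main__":
    w, s = encode_q4f16_1(np.random.randn(4, 4096).astype(np.float16))
    print(w.shape, w.dtype, s.shape, s.dtype)              # (4, 512) uint32 (4, 128) float16
```

The build failure itself is not in this arithmetic: VerifyMemory is reporting that the loops in encode are not bound to GPU block/thread indices ("Did you forget to bind?"), so when the function is lowered for the vulkan weight-quantization target the accesses to lv, w_gathered, and scale look like host-side accesses to device memory.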