❓ General Questions

Building llamahf for the android target with q4f16_1 quantization fails during weight conversion with a TVM memory verification error ("Did you forget to bind?"). Command and full output:
(venv_llama) tangben@tb:~/MLC/mlc-llm$ python3 -m mlc_llm.build --model llamahf --target android --quantization q4f16_1
Using path "dist/models/llamahf" for model "llamahf"
Target configured: opencl -keys=opencl,gpu -max_function_args=128 -max_num_threads=256 -max_shared_memory_per_block=16384 -max_threads_per_block=256 -texture_spatial_limit=16384 -thread_warp_size=1
Automatically using target for weight quantization: vulkan -keys=vulkan,gpu -max_num_threads=256 -max_shared_memory_per_block=32768 -max_threads_per_block=1024 -supports_16bit_buffer=1 -supports_float16=1 -supports_float32=1 -supports_int16=1 -supports_int32=1 -supports_int8=1 -thread_warp_size=1
Get old param: 0%| | 0/197 [00:00<?, ?tensors/s] | 0/327 [00:00<?, ?tensors/s]
Traceback (most recent call last):
File "/usr/lib/python3.10/runpy.py", line 196, in _run_module_as_main
return _run_code(code, main_globals, None,
File "/usr/lib/python3.10/runpy.py", line 86, in _run_code
exec(code, run_globals)
File "/home/tangben/MLC/mlc-llm/mlc_llm/build.py", line 47, in
main()
File "/home/tangben/MLC/mlc-llm/mlc_llm/build.py", line 43, in main
core.build_model_from_args(parsed_args)
File "/home/tangben/MLC/mlc-llm/mlc_llm/core.py", line 866, in build_model_from_args
params = utils.convert_weights(mod_transform, param_manager, params, args)
File "/home/tangben/MLC/mlc-llm/mlc_llm/utils.py", line 282, in convert_weights
ex = relax.build(mod_transform, target=target)
File "/home/tangben/MLC/mlc-llm/venv_llama/lib/python3.10/site-packages/tvm/relax/vm_build.py", line 341, in build
return _vmlink(
File "/home/tangben/MLC/mlc-llm/venv_llama/lib/python3.10/site-packages/tvm/relax/vm_build.py", line 247, in _vmlink
lib = tvm.build(
File "/home/tangben/MLC/mlc-llm/venv_llama/lib/python3.10/site-packages/tvm/driver/build_module.py", line 297, in build
rt_mod_host = _driver_ffi.tir_to_runtime(annotated_mods, target_host)
File "tvm/_ffi/_cython/./packed_func.pxi", line 332, in tvm._ffi._cy3.core.PackedFuncBase.call
File "tvm/_ffi/_cython/./packed_func.pxi", line 263, in tvm._ffi._cy3.core.FuncCall
File "tvm/_ffi/_cython/./packed_func.pxi", line 252, in tvm._ffi._cy3.core.FuncCall3
File "tvm/_ffi/_cython/./base.pxi", line 182, in tvm._ffi._cy3.core.CHECK_CALL
File "/home/tangben/MLC/mlc-llm/venv_llama/lib/python3.10/site-packages/tvm/_ffi/base.py", line 481, in raise_last_ffi_error
raise py_err
tvm._ffi.base.TVMError: Traceback (most recent call last):
11: tvm::runtime::PackedFuncObj::Extractor<tvm::runtime::PackedFuncSubObj<tvm::runtime::TypedPackedFunc<tvm::runtime::Module (tvm::runtime::Map<tvm::Target, tvm::IRModule, void, void> const&, tvm::Target)>::AssignTypedLambda<tvm::__mk_TVM24::{lambda(tvm::runtime::Map<tvm::Target, tvm::IRModule, void, void> const&, tvm::Target)#1}>(tvm::__mk_TVM24::{lambda(tvm::runtime::Map<tvm::Target, tvm::IRModule, void, void> const&, tvm::Target)#1}, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >)::{lambda(tvm::runtime::TVMArgs const&, tvm::runtime::TVMRetValue*)#1}> >::Call(tvm::runtime::PackedFuncObj const*, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, tvm::runtime::TVMRetValue)
10: tvm::TIRToRuntime(tvm::runtime::Map<tvm::Target, tvm::IRModule, void, void> const&, tvm::Target const&)
9: tvm::SplitMixedModule(tvm::IRModule, tvm::Target const&, tvm::Target const&)
8: tvm::ApplyPasses(tvm::IRModule, tvm::transform::Sequential)
7: tvm::transform::Pass::operator()(tvm::IRModule) const
6: tvm::transform::Pass::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
5: tvm::transform::SequentialNode::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
4: tvm::transform::Pass::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
3: tvm::transform::ModulePassNode::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
2: _ZN3tvm7runtime13PackedFun
1: tvm::runtime::TypedPackedFunc<tvm::IRModule (tvm::IRModule, tvm::transform::PassContext)>::AssignTypedLambda<tvm::tir::transform::VerifyMemory()::{lambda(tvm::IRModule, tvm::transform::PassContext)#1}>(tvm::tir::transform::VerifyMemory()::{lambda(tvm::IRModule, tvm::transform::PassContext)#1})::{lambda(tvm::runtime::TVMArgs const&, tvm::runtime::TVMRetValue*)#1}::operator()(tvm::runtime::TVMArgs const&, tvm::runtime::TVMRetValue*) const
0: _ZN3tvm7runtime6deta
Did you forget to bind?
Variable scale is directly accessed by host memory (it is not contained in a thread environment or in the function arguments.
Variable lv is directly accessed by host memory (it is not contained in a thread environment or in the function arguments.
Variable w_gathered is directly accessed by host memory (it is not contained in a thread environment or in the function arguments.
Variable w_gathered is directly accessed by host memory (it is not contained in a thread environment or in the function arguments.
Variable w_gathered is directly accessed by host memory (it is not contained in a thread environment or in the function arguments.
Variable scale is directly accessed by host memory (it is not contained in a thread environment or in the function arguments.
Variable lv is directly accessed by host memory (it is not contained in a thread environment or in the function arguments.
File "/workspace/tvm/src/tir/analysis/verify_memory.cc", line 205
RuntimeError: Memory verification failed with the following errors:
from tvm.script import tir as T
@T.prim_func
def encode(var_lv: T.handle, var_w_gathered: T.handle, var_scale: T.handle):
T.func_attr({"target": T.target({"host": {"keys": ["cpu"], "kind": "llvm", "mtriple": "x86_64-redhat-linux-gnu", "tag": ""}, "keys": ["vulkan", "gpu"], "kind": "vulkan", "max_num_threads": 256, "max_shared_memory_per_block": 32768, "max_threads_per_block": 1024, "supports_16bit_buffer": 1, "supports_float16": 1, "supports_float32": T.bool(True), "supports_int16": 1, "supports_int32": T.bool(True), "supports_int8": 1, "tag": "", "thread_warp_size": 1}), "tir.is_scheduled": T.bool(True), "tir.noalias": T.bool(True)})
vocab_size = T.int32()
lv = T.match_buffer(var_lv, (vocab_size, 4096), "float16")
w_gathered = T.match_buffer(var_w_gathered, (vocab_size, 512), "uint32")
scale = T.match_buffer(var_scale, (vocab_size, 128), "float16")
max_abs_value = T.allocate([vocab_size * 128], "float16", "global")
max_abs_value_1 = T.Buffer((vocab_size * 128,), "float16", data=max_abs_value)
lv_1 = T.Buffer((vocab_size * 4096,), "float16", data=lv.data)
for i, j, k in T.grid(vocab_size, 128, 32):
cse_var_1: T.int32 = i * 128 + j
if k == 0:
max_abs_value_1[cse_var_1] = T.float16(-65504)
max_abs_value_1[cse_var_1] = T.max(max_abs_value_1[cse_var_1], T.fabs(lv_1[i * 4096 + j * 32 + k]))
scale_1 = T.Buffer((vocab_size * 128,), "float16", data=scale.data)
for i, j in T.grid(vocab_size, 128):
cse_var_2: T.int32 = i * 128 + j
scale_1[cse_var_2] = T.max(max_abs_value_1[cse_var_2], T.float16(0.0001)) * T.float16(0.14285714285714285)
for i, j, k in T.grid(vocab_size, 512, 8):
cse_var_3: T.int32 = i * 512 + j
w_gathered_1 = T.Buffer((vocab_size * 512,), "uint32", data=w_gathered.data)
if k == 0:
w_gathered_1[cse_var_3] = T.uint32(0)
w_gathered_1[cse_var_3] = T.bitwise_or(w_gathered_1[cse_var_3], T.shift_left(T.Cast("uint32", T.min(T.max(T.round(lv_1[i * 4096 + j * 8 + k] / scale_1[i * 128 + j // 4] + T.float16(7)), T.float16(0)), T.float16(14))), T.Cast("uint32", k) * T.uint32(4)))
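For anyone reading the dump: the printed encode PrimFunc is the q4f16_1 weight-quantization kernel. Per row of the (vocab_size, 4096) float16 matrix it takes a max-abs over groups of 32 values, turns that into a scale (floored at 1e-4 and divided by 7), quantizes each value to an integer in [0, 14] around the zero point 7, and packs 8 such 4-bit codes into every uint32 of w_gathered. A rough NumPy sketch of the same math, written here only for illustration (it is not mlc-llm code; the function name and shapes are my own):

```python
import numpy as np

def encode_q4f16_1(lv: np.ndarray):
    """Illustrative sketch of the encode() PrimFunc printed above.

    lv: (vocab_size, 4096) float16 weights.
    Returns (w_gathered, scale): (vocab_size, 512) uint32, (vocab_size, 128) float16.
    """
    vocab_size, width = lv.shape
    groups = width // 32                                   # 128 scale groups per row

    # Per-group max-abs, floored at 1e-4 and divided by 7 -> scale.
    grouped = np.abs(lv).reshape(vocab_size, groups, 32)
    scale = np.maximum(grouped.max(axis=-1), np.float16(1e-4)) * np.float16(1.0 / 7.0)

    # Quantize to integers in [0, 14] around the zero point 7.
    q = np.clip(np.round(lv.reshape(vocab_size, groups, 32) / scale[..., None] + 7), 0, 14)
    q = q.astype(np.uint32).reshape(vocab_size, width // 8, 8)

    # Pack 8 four-bit codes into each uint32 (code k goes to bits 4k..4k+3).
    shifts = np.arange(8, dtype=np.uint32) * np.uint32(4)
    w_gathered = np.bitwise_or.reduce(q << shifts, axis=-1)
    return w_gathered, scale


if __name__ == "__main__":
    w, s = encode_q4f16_1(np.random.randn(4, 4096).astype(np.float16))
    print(w.shape, w.dtype, s.shape, s.dtype)              # (4, 512) uint32 (4, 128) float16
```

The build failure itself is not in this arithmetic: VerifyMemory is reporting that the loops in encode are not bound to GPU block/thread indices ("Did you forget to bind?"), so when the function is lowered for the vulkan weight-quantization target the accesses to lv, w_gathered, and scale look like host-side accesses to device memory.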