Skip to content
Merged
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions conversion/qwen.py
Original file line number Diff line number Diff line change
Expand Up @@ -600,6 +600,12 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
if name.find("layers.") != -1:
assert bid is not None
name = name.replace(f"mtp.layers.{bid}", f"model.layers.{bid + n_layer}")
# Keep bid coherent with the remapped name so downstream MoE
# expert-stacking (Qwen2MoeModel) caches and reads at the same
# layer slot. Without this, self._experts[0] gets populated with
# layer-48 names, then the stacker builds layer-0 lookup keys
# and raises a KeyError.
Comment thread
bit-incarnas marked this conversation as resolved.
Outdated
bid = bid + n_layer
else:
remapper = {
"mtp.fc": "model.layers.{bid}.eh_proj",
Expand Down