From 6f3d2edda90031b3e02ae96964873b596d16bf67 Mon Sep 17 00:00:00 2001 From: Daniel Hanchen Date: Mon, 9 Feb 2026 14:12:32 +0000 Subject: [PATCH] Fix sft_prepare_dataset not removing original columns in non-packing path The non-packing tokenization path in sft_prepare_dataset keeps all original dataset columns (e.g. messages, text) after the .map(_tokenize) call. When downstream code like train_on_responses_only replaces the data collator, Trainer._remove_unused_columns() can be bypassed, causing the collator to choke on non-tensor columns like raw JSON strings. The packing path already handles this correctly via dataset.select_columns(used_column_names) at line 693. This fix makes the non-packing path consistent by passing remove_columns to .map(), which strips original columns from the output while still providing the full row to the _tokenize function. Fixes the FunctionGemma (270M) notebook crash: ValueError: Unable to create tensor... Perhaps your features ('messages' in this case) have excessive nesting --- unsloth_zoo/dataset_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsloth_zoo/dataset_utils.py b/unsloth_zoo/dataset_utils.py index 1a6d7ddc3..98ad53e3e 100644 --- a/unsloth_zoo/dataset_utils.py +++ b/unsloth_zoo/dataset_utils.py @@ -669,7 +669,7 @@ def _tokenize(example): map_kwargs["batch_size"] = dataset._ex_iterable.batch_size if use_desc: map_kwargs["desc"] = f'Unsloth: Tokenizing ["{dataset_text_field}"]' - dataset = dataset.map(_tokenize, batched = True, **map_kwargs) + dataset = dataset.map(_tokenize, batched = True, remove_columns = list(column_names), **map_kwargs) # If VLM, switch data collator since .pad is needed! if is_vlm and not hasattr(processing_class, "pad"):