From 6f3d2edda90031b3e02ae96964873b596d16bf67 Mon Sep 17 00:00:00 2001
From: Daniel Hanchen <danielhanchen@users.noreply.github.com>
Date: Mon, 9 Feb 2026 14:12:32 +0000
Subject: [PATCH] Fix sft_prepare_dataset not removing original columns in
 non-packing path

The non-packing tokenization path in sft_prepare_dataset keeps all
original dataset columns (e.g. messages, text) after the .map(_tokenize)
call. When downstream code such as train_on_responses_only replaces the
data collator, Trainer._remove_unused_columns() can be bypassed, so the
collator receives non-tensor columns (such as raw JSON strings) that it
cannot convert to tensors, and it fails.

The packing path already handles this correctly via
dataset.select_columns(used_column_names) at line 693. This fix makes
the non-packing path consistent by passing remove_columns to .map(),
which strips original columns from the output while still providing
the full row to the _tokenize function.

Fixes the FunctionGemma (270M) notebook crash:
  ValueError: Unable to create tensor... Perhaps your features
  ('messages' in this case) have excessive nesting
---
 unsloth_zoo/dataset_utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/unsloth_zoo/dataset_utils.py b/unsloth_zoo/dataset_utils.py
index 1a6d7ddc3..98ad53e3e 100644
--- a/unsloth_zoo/dataset_utils.py
+++ b/unsloth_zoo/dataset_utils.py
@@ -669,7 +669,7 @@ def _tokenize(example):
             map_kwargs["batch_size"] = dataset._ex_iterable.batch_size
             
         if use_desc: map_kwargs["desc"] = f'Unsloth: Tokenizing ["{dataset_text_field}"]'
-        dataset = dataset.map(_tokenize, batched = True, **map_kwargs)
+        dataset = dataset.map(_tokenize, batched = True, remove_columns = list(column_names), **map_kwargs)
 
         # If VLM, switch data collator since .pad is needed!
         if is_vlm and not hasattr(processing_class, "pad"):