diff --git a/nb/Advanced_Llama3_2_(3B)_GRPO_LoRA.ipynb b/nb/Advanced_Llama3_2_(3B)_GRPO_LoRA.ipynb index 5a0469e4..8b259223 100644 --- a/nb/Advanced_Llama3_2_(3B)_GRPO_LoRA.ipynb +++ b/nb/Advanced_Llama3_2_(3B)_GRPO_LoRA.ipynb @@ -1270,7 +1270,7 @@ "from trl import GRPOConfig, GRPOTrainer\n", "training_args = GRPOConfig(\n", " learning_rate = 5e-6,\n", - " weight_decay = 0.1,\n", + " weight_decay = 0.001,\n", " warmup_ratio = 0.1,\n", " lr_scheduler_type = \"cosine\",\n", " optim = \"adamw_8bit\",\n", diff --git a/nb/Falcon_H1_(0.5B)-Alpaca.ipynb b/nb/Falcon_H1_(0.5B)-Alpaca.ipynb index c74d1b66..86a999b3 100644 --- a/nb/Falcon_H1_(0.5B)-Alpaca.ipynb +++ b/nb/Falcon_H1_(0.5B)-Alpaca.ipynb @@ -419,7 +419,7 @@ " learning_rate = 2e-4,\n", " logging_steps = 1,\n", " optim = \"adamw_8bit\",\n", - " weight_decay = 0.01,\n", + " weight_decay = 0.001,\n", " lr_scheduler_type = \"linear\",\n", " seed = 3407,\n", " output_dir = \"outputs\",\n", diff --git a/nb/Gemma2_(2B)-Alpaca.ipynb b/nb/Gemma2_(2B)-Alpaca.ipynb index 5a60d92a..a139add1 100644 --- a/nb/Gemma2_(2B)-Alpaca.ipynb +++ b/nb/Gemma2_(2B)-Alpaca.ipynb @@ -563,7 +563,7 @@ " learning_rate = 2e-4,\n", " logging_steps = 1,\n", " optim = \"adamw_8bit\",\n", - " weight_decay = 0.01,\n", + " weight_decay = 0.001,\n", " lr_scheduler_type = \"linear\",\n", " seed = 3407,\n", " output_dir = \"outputs\",\n", diff --git a/nb/Gemma3_(1B)-GRPO.ipynb b/nb/Gemma3_(1B)-GRPO.ipynb index 802fc0b9..84cbea0d 100644 --- a/nb/Gemma3_(1B)-GRPO.ipynb +++ b/nb/Gemma3_(1B)-GRPO.ipynb @@ -1076,7 +1076,7 @@ " learning_rate = 5e-6,\n", " adam_beta1 = 0.9,\n", " adam_beta2 = 0.99,\n", - " weight_decay = 0.1,\n", + " weight_decay = 0.001,\n", " warmup_ratio = 0.1,\n", " lr_scheduler_type = \"cosine\",\n", " optim = \"adamw_torch_fused\",\n", diff --git a/nb/Gemma3_(4B)-Vision-GRPO.ipynb b/nb/Gemma3_(4B)-Vision-GRPO.ipynb index 53802c27..4604e72e 100644 --- a/nb/Gemma3_(4B)-Vision-GRPO.ipynb +++ b/nb/Gemma3_(4B)-Vision-GRPO.ipynb @@ -1149,7 +1149,7 @@ " learning_rate = 5e-6,\n", " adam_beta1 = 0.9,\n", " adam_beta2 = 0.99,\n", - " weight_decay = 0.1,\n", + " weight_decay = 0.001,\n", " warmup_ratio = 0.1,\n", " lr_scheduler_type = \"cosine\",\n", " optim = \"adamw_8bit\",\n", diff --git a/nb/HuggingFace Course-Advanced_Llama3_1_(3B)_GRPO_LoRA.ipynb b/nb/HuggingFace Course-Advanced_Llama3_1_(3B)_GRPO_LoRA.ipynb index efa2cb8d..65c87883 100644 --- a/nb/HuggingFace Course-Advanced_Llama3_1_(3B)_GRPO_LoRA.ipynb +++ b/nb/HuggingFace Course-Advanced_Llama3_1_(3B)_GRPO_LoRA.ipynb @@ -832,7 +832,7 @@ "from trl import GRPOConfig, GRPOTrainer\n", "training_args = GRPOConfig(\n", " learning_rate = 5e-6,\n", - " weight_decay = 0.1,\n", + " weight_decay = 0.001,\n", " warmup_ratio = 0.1,\n", " lr_scheduler_type = \"cosine\",\n", " optim = \"adamw_torch_fused\",\n", diff --git a/nb/HuggingFace Course-Advanced_Llama3_2_(3B)_GRPO_LoRA.ipynb b/nb/HuggingFace Course-Advanced_Llama3_2_(3B)_GRPO_LoRA.ipynb index 76235ec7..139cf1bb 100644 --- a/nb/HuggingFace Course-Advanced_Llama3_2_(3B)_GRPO_LoRA.ipynb +++ b/nb/HuggingFace Course-Advanced_Llama3_2_(3B)_GRPO_LoRA.ipynb @@ -1270,7 +1270,7 @@ "from trl import GRPOConfig, GRPOTrainer\n", "training_args = GRPOConfig(\n", " learning_rate = 5e-6,\n", - " weight_decay = 0.1,\n", + " weight_decay = 0.001,\n", " warmup_ratio = 0.1,\n", " lr_scheduler_type = \"cosine\",\n", " optim = \"adamw_8bit\",\n", diff --git a/nb/HuggingFace Course-Gemma3_(1B)-GRPO.ipynb b/nb/HuggingFace Course-Gemma3_(1B)-GRPO.ipynb index 5e45472d..412b9fbc 100644 --- a/nb/HuggingFace Course-Gemma3_(1B)-GRPO.ipynb +++ b/nb/HuggingFace Course-Gemma3_(1B)-GRPO.ipynb @@ -1078,7 +1078,7 @@ " learning_rate = 5e-6,\n", " adam_beta1 = 0.9,\n", " adam_beta2 = 0.99,\n", - " weight_decay = 0.1,\n", + " weight_decay = 0.001,\n", " warmup_ratio = 0.1,\n", " lr_scheduler_type = \"cosine\",\n", " optim = \"adamw_torch_fused\",\n", diff --git a/nb/HuggingFace Course-Gemma3_(4B)-Vision-GRPO.ipynb b/nb/HuggingFace Course-Gemma3_(4B)-Vision-GRPO.ipynb index 21587611..708390f4 100644 --- a/nb/HuggingFace Course-Gemma3_(4B)-Vision-GRPO.ipynb +++ b/nb/HuggingFace Course-Gemma3_(4B)-Vision-GRPO.ipynb @@ -1151,7 +1151,7 @@ " learning_rate = 5e-6,\n", " adam_beta1 = 0.9,\n", " adam_beta2 = 0.99,\n", - " weight_decay = 0.1,\n", + " weight_decay = 0.001,\n", " warmup_ratio = 0.1,\n", " lr_scheduler_type = \"cosine\",\n", " optim = \"adamw_8bit\",\n", diff --git a/nb/HuggingFace Course-Llama3.1_(8B)-GRPO.ipynb b/nb/HuggingFace Course-Llama3.1_(8B)-GRPO.ipynb index 29549985..7d34be62 100644 --- a/nb/HuggingFace Course-Llama3.1_(8B)-GRPO.ipynb +++ b/nb/HuggingFace Course-Llama3.1_(8B)-GRPO.ipynb @@ -769,7 +769,7 @@ " learning_rate = 5e-6,\n", " adam_beta1 = 0.9,\n", " adam_beta2 = 0.99,\n", - " weight_decay = 0.1,\n", + " weight_decay = 0.001,\n", " warmup_ratio = 0.1,\n", " lr_scheduler_type = \"cosine\",\n", " optim = \"paged_adamw_8bit\",\n", diff --git a/nb/HuggingFace Course-Llama_FP8_GRPO.ipynb b/nb/HuggingFace Course-Llama_FP8_GRPO.ipynb index d38a7060..9b9f30d5 100644 --- a/nb/HuggingFace Course-Llama_FP8_GRPO.ipynb +++ b/nb/HuggingFace Course-Llama_FP8_GRPO.ipynb @@ -1307,7 +1307,7 @@ " learning_rate = 2e-4, # Reduce to 2e-5 for long training runs\n", " logging_steps = 5,\n", " optim = \"adamw_8bit\",\n", - " weight_decay = 0.01,\n", + " weight_decay = 0.001,\n", " lr_scheduler_type = \"linear\",\n", " seed = 3407,\n", " report_to = \"none\", # Use this for WandB etc\n", @@ -2520,7 +2520,7 @@ " vllm_sampling_params = vllm_sampling_params,\n", " temperature = 1.0,\n", " learning_rate = 5e-6,\n", - " weight_decay = 0.01,\n", + " weight_decay = 0.001,\n", " warmup_ratio = 0.1,\n", " lr_scheduler_type = \"linear\",\n", " optim = \"adamw_8bit\",\n", diff --git a/nb/HuggingFace Course-Mistral_v0.3_(7B)-GRPO.ipynb b/nb/HuggingFace Course-Mistral_v0.3_(7B)-GRPO.ipynb index e8d53937..adaff0e3 100644 --- a/nb/HuggingFace Course-Mistral_v0.3_(7B)-GRPO.ipynb +++ b/nb/HuggingFace Course-Mistral_v0.3_(7B)-GRPO.ipynb @@ -613,7 +613,7 @@ " learning_rate = 5e-6,\n", " adam_beta1 = 0.9,\n", " adam_beta2 = 0.99,\n", - " weight_decay = 0.1,\n", + " weight_decay = 0.001,\n", " warmup_ratio = 0.1,\n", " lr_scheduler_type = \"cosine\",\n", " optim = \"paged_adamw_8bit\",\n", diff --git a/nb/HuggingFace Course-Phi_4_(14B)-GRPO.ipynb b/nb/HuggingFace Course-Phi_4_(14B)-GRPO.ipynb index 9693df93..93b9becb 100644 --- a/nb/HuggingFace Course-Phi_4_(14B)-GRPO.ipynb +++ b/nb/HuggingFace Course-Phi_4_(14B)-GRPO.ipynb @@ -391,7 +391,7 @@ " learning_rate = 5e-6,\n", " adam_beta1 = 0.9,\n", " adam_beta2 = 0.99,\n", - " weight_decay = 0.1,\n", + " weight_decay = 0.001,\n", " warmup_ratio = 0.1,\n", " lr_scheduler_type = \"cosine\",\n", " optim = \"paged_adamw_8bit\",\n", diff --git a/nb/HuggingFace Course-Qwen2.5_(3B)-GRPO.ipynb b/nb/HuggingFace Course-Qwen2.5_(3B)-GRPO.ipynb index eae7b9b5..003742e8 100644 --- a/nb/HuggingFace Course-Qwen2.5_(3B)-GRPO.ipynb +++ b/nb/HuggingFace Course-Qwen2.5_(3B)-GRPO.ipynb @@ -908,7 +908,7 @@ " learning_rate = 5e-6,\n", " adam_beta1 = 0.9,\n", " adam_beta2 = 0.99,\n", - " weight_decay = 0.1,\n", + " weight_decay = 0.001,\n", " warmup_ratio = 0.1,\n", " lr_scheduler_type = \"cosine\",\n", " optim = \"adamw_8bit\",\n", diff --git a/nb/HuggingFace Course-Qwen2_5_7B_VL_GRPO.ipynb b/nb/HuggingFace Course-Qwen2_5_7B_VL_GRPO.ipynb index 85cfbed9..b7f50bcc 100644 --- a/nb/HuggingFace Course-Qwen2_5_7B_VL_GRPO.ipynb +++ b/nb/HuggingFace Course-Qwen2_5_7B_VL_GRPO.ipynb @@ -1402,7 +1402,7 @@ " learning_rate = 5e-6,\n", " adam_beta1 = 0.9,\n", " adam_beta2 = 0.99,\n", - " weight_decay = 0.1,\n", + " weight_decay = 0.001,\n", " warmup_ratio = 0.1,\n", " lr_scheduler_type = \"cosine\",\n", " optim = \"adamw_8bit\",\n", diff --git a/nb/HuggingFace Course-Qwen3_8B_FP8_GRPO.ipynb b/nb/HuggingFace Course-Qwen3_8B_FP8_GRPO.ipynb index 4492acd0..24fb30f9 100644 --- a/nb/HuggingFace Course-Qwen3_8B_FP8_GRPO.ipynb +++ b/nb/HuggingFace Course-Qwen3_8B_FP8_GRPO.ipynb @@ -1412,7 +1412,7 @@ " learning_rate = 2e-4, # Reduce to 2e-5 for long training runs\n", " logging_steps = 5,\n", " optim = \"adamw_8bit\",\n", - " weight_decay = 0.01,\n", + " weight_decay = 0.001,\n", " lr_scheduler_type = \"linear\",\n", " seed = 3407,\n", " report_to = \"none\", # Use this for WandB etc\n", @@ -2617,7 +2617,7 @@ " vllm_sampling_params = vllm_sampling_params,\n", " temperature = 1.0,\n", " learning_rate = 5e-6,\n", - " weight_decay = 0.01,\n", + " weight_decay = 0.001,\n", " warmup_ratio = 0.1,\n", " lr_scheduler_type = \"linear\",\n", " optim = \"adamw_8bit\",\n", diff --git a/nb/HuggingFace Course-Qwen3_VL_(8B)-Vision-GRPO.ipynb b/nb/HuggingFace Course-Qwen3_VL_(8B)-Vision-GRPO.ipynb index 752d4f96..cf795aff 100644 --- a/nb/HuggingFace Course-Qwen3_VL_(8B)-Vision-GRPO.ipynb +++ b/nb/HuggingFace Course-Qwen3_VL_(8B)-Vision-GRPO.ipynb @@ -1351,7 +1351,7 @@ " learning_rate = 5e-6,\n", " adam_beta1 = 0.9,\n", " adam_beta2 = 0.99,\n", - " weight_decay = 0.1,\n", + " weight_decay = 0.001,\n", " warmup_ratio = 0.1,\n", " lr_scheduler_type = \"cosine\",\n", " optim = \"adamw_8bit\",\n", diff --git a/nb/HuggingFace Course-gpt_oss_(20B)_Reinforcement_Learning_GRPO_Minesweeper_Game_BF16.ipynb b/nb/HuggingFace Course-gpt_oss_(20B)_Reinforcement_Learning_GRPO_Minesweeper_Game_BF16.ipynb index f9cdb301..4bc81645 100644 --- a/nb/HuggingFace Course-gpt_oss_(20B)_Reinforcement_Learning_GRPO_Minesweeper_Game_BF16.ipynb +++ b/nb/HuggingFace Course-gpt_oss_(20B)_Reinforcement_Learning_GRPO_Minesweeper_Game_BF16.ipynb @@ -838,7 +838,7 @@ "training_args = GRPOConfig(\n", " temperature = 1.0,\n", " learning_rate = 5e-5,\n", - " weight_decay = 0.01,\n", + " weight_decay = 0.001,\n", " warmup_ratio = 0.1,\n", " lr_scheduler_type = \"linear\",\n", " optim = \"adamw_8bit\",\n", diff --git a/nb/Kaggle-Advanced_Llama3_1_(3B)_GRPO_LoRA.ipynb b/nb/Kaggle-Advanced_Llama3_1_(3B)_GRPO_LoRA.ipynb index e98c9a30..ed56a653 100644 --- a/nb/Kaggle-Advanced_Llama3_1_(3B)_GRPO_LoRA.ipynb +++ b/nb/Kaggle-Advanced_Llama3_1_(3B)_GRPO_LoRA.ipynb @@ -821,7 +821,7 @@ "from trl import GRPOConfig, GRPOTrainer\n", "training_args = GRPOConfig(\n", " learning_rate = 5e-6,\n", - " weight_decay = 0.1,\n", + " weight_decay = 0.001,\n", " warmup_ratio = 0.1,\n", " lr_scheduler_type = \"cosine\",\n", " optim = \"adamw_torch_fused\",\n", diff --git a/nb/Kaggle-Advanced_Llama3_2_(3B)_GRPO_LoRA.ipynb b/nb/Kaggle-Advanced_Llama3_2_(3B)_GRPO_LoRA.ipynb index a9d7067a..87a88444 100644 --- a/nb/Kaggle-Advanced_Llama3_2_(3B)_GRPO_LoRA.ipynb +++ b/nb/Kaggle-Advanced_Llama3_2_(3B)_GRPO_LoRA.ipynb @@ -1261,7 +1261,7 @@ "from trl import GRPOConfig, GRPOTrainer\n", "training_args = GRPOConfig(\n", " learning_rate = 5e-6,\n", - " weight_decay = 0.1,\n", + " weight_decay = 0.001,\n", " warmup_ratio = 0.1,\n", " lr_scheduler_type = \"cosine\",\n", " optim = \"adamw_8bit\",\n", diff --git a/nb/Kaggle-Falcon_H1_(0.5B)-Alpaca.ipynb b/nb/Kaggle-Falcon_H1_(0.5B)-Alpaca.ipynb index 272f2ab0..5e4c995b 100644 --- a/nb/Kaggle-Falcon_H1_(0.5B)-Alpaca.ipynb +++ b/nb/Kaggle-Falcon_H1_(0.5B)-Alpaca.ipynb @@ -419,7 +419,7 @@ " learning_rate = 2e-4,\n", " logging_steps = 1,\n", " optim = \"adamw_8bit\",\n", - " weight_decay = 0.01,\n", + " weight_decay = 0.001,\n", " lr_scheduler_type = \"linear\",\n", " seed = 3407,\n", " output_dir = \"outputs\",\n", diff --git a/nb/Kaggle-Gemma2_(2B)-Alpaca.ipynb b/nb/Kaggle-Gemma2_(2B)-Alpaca.ipynb index ee681e07..31281cc6 100644 --- a/nb/Kaggle-Gemma2_(2B)-Alpaca.ipynb +++ b/nb/Kaggle-Gemma2_(2B)-Alpaca.ipynb @@ -563,7 +563,7 @@ " learning_rate = 2e-4,\n", " logging_steps = 1,\n", " optim = \"adamw_8bit\",\n", - " weight_decay = 0.01,\n", + " weight_decay = 0.001,\n", " lr_scheduler_type = \"linear\",\n", " seed = 3407,\n", " output_dir = \"outputs\",\n", diff --git a/nb/Kaggle-Gemma3_(1B)-GRPO.ipynb b/nb/Kaggle-Gemma3_(1B)-GRPO.ipynb index bef4b60d..92e723bb 100644 --- a/nb/Kaggle-Gemma3_(1B)-GRPO.ipynb +++ b/nb/Kaggle-Gemma3_(1B)-GRPO.ipynb @@ -1069,7 +1069,7 @@ " learning_rate = 5e-6,\n", " adam_beta1 = 0.9,\n", " adam_beta2 = 0.99,\n", - " weight_decay = 0.1,\n", + " weight_decay = 0.001,\n", " warmup_ratio = 0.1,\n", " lr_scheduler_type = \"cosine\",\n", " optim = \"adamw_torch_fused\",\n", diff --git a/nb/Kaggle-Gemma3_(4B)-Vision-GRPO.ipynb b/nb/Kaggle-Gemma3_(4B)-Vision-GRPO.ipynb index 45f73345..134d66e9 100644 --- a/nb/Kaggle-Gemma3_(4B)-Vision-GRPO.ipynb +++ b/nb/Kaggle-Gemma3_(4B)-Vision-GRPO.ipynb @@ -1142,7 +1142,7 @@ " learning_rate = 5e-6,\n", " adam_beta1 = 0.9,\n", " adam_beta2 = 0.99,\n", - " weight_decay = 0.1,\n", + " weight_decay = 0.001,\n", " warmup_ratio = 0.1,\n", " lr_scheduler_type = \"cosine\",\n", " optim = \"adamw_8bit\",\n", diff --git a/nb/Kaggle-Llama3.1_(8B)-GRPO.ipynb b/nb/Kaggle-Llama3.1_(8B)-GRPO.ipynb index ea796dd7..6ebffb9f 100644 --- a/nb/Kaggle-Llama3.1_(8B)-GRPO.ipynb +++ b/nb/Kaggle-Llama3.1_(8B)-GRPO.ipynb @@ -760,7 +760,7 @@ " learning_rate = 5e-6,\n", " adam_beta1 = 0.9,\n", " adam_beta2 = 0.99,\n", - " weight_decay = 0.1,\n", + " weight_decay = 0.001,\n", " warmup_ratio = 0.1,\n", " lr_scheduler_type = \"cosine\",\n", " optim = \"paged_adamw_8bit\",\n", diff --git a/nb/Kaggle-Llama_FP8_GRPO.ipynb b/nb/Kaggle-Llama_FP8_GRPO.ipynb index 6d3b324f..657e09c4 100644 --- a/nb/Kaggle-Llama_FP8_GRPO.ipynb +++ b/nb/Kaggle-Llama_FP8_GRPO.ipynb @@ -1298,7 +1298,7 @@ " learning_rate = 2e-4, # Reduce to 2e-5 for long training runs\n", " logging_steps = 5,\n", " optim = \"adamw_8bit\",\n", - " weight_decay = 0.01,\n", + " weight_decay = 0.001,\n", " lr_scheduler_type = \"linear\",\n", " seed = 3407,\n", " report_to = \"none\", # Use this for WandB etc\n", @@ -2513,7 +2513,7 @@ " vllm_sampling_params = vllm_sampling_params,\n", " temperature = 1.0,\n", " learning_rate = 5e-6,\n", - " weight_decay = 0.01,\n", + " weight_decay = 0.001,\n", " warmup_ratio = 0.1,\n", " lr_scheduler_type = \"linear\",\n", " optim = \"adamw_8bit\",\n", diff --git a/nb/Kaggle-Mistral_v0.3_(7B)-GRPO.ipynb b/nb/Kaggle-Mistral_v0.3_(7B)-GRPO.ipynb index 13225662..db9f1a9b 100644 --- a/nb/Kaggle-Mistral_v0.3_(7B)-GRPO.ipynb +++ b/nb/Kaggle-Mistral_v0.3_(7B)-GRPO.ipynb @@ -604,7 +604,7 @@ " learning_rate = 5e-6,\n", " adam_beta1 = 0.9,\n", " adam_beta2 = 0.99,\n", - " weight_decay = 0.1,\n", + " weight_decay = 0.001,\n", " warmup_ratio = 0.1,\n", " lr_scheduler_type = \"cosine\",\n", " optim = \"paged_adamw_8bit\",\n", diff --git a/nb/Kaggle-Phi_4_(14B)-GRPO.ipynb b/nb/Kaggle-Phi_4_(14B)-GRPO.ipynb index 333f2efe..dd82a2fd 100644 --- a/nb/Kaggle-Phi_4_(14B)-GRPO.ipynb +++ b/nb/Kaggle-Phi_4_(14B)-GRPO.ipynb @@ -382,7 +382,7 @@ " learning_rate = 5e-6,\n", " adam_beta1 = 0.9,\n", " adam_beta2 = 0.99,\n", - " weight_decay = 0.1,\n", + " weight_decay = 0.001,\n", " warmup_ratio = 0.1,\n", " lr_scheduler_type = \"cosine\",\n", " optim = \"paged_adamw_8bit\",\n", diff --git a/nb/Kaggle-Qwen2.5_(3B)-GRPO.ipynb b/nb/Kaggle-Qwen2.5_(3B)-GRPO.ipynb index 95a7b129..39357ea8 100644 --- a/nb/Kaggle-Qwen2.5_(3B)-GRPO.ipynb +++ b/nb/Kaggle-Qwen2.5_(3B)-GRPO.ipynb @@ -899,7 +899,7 @@ " learning_rate = 5e-6,\n", " adam_beta1 = 0.9,\n", " adam_beta2 = 0.99,\n", - " weight_decay = 0.1,\n", + " weight_decay = 0.001,\n", " warmup_ratio = 0.1,\n", " lr_scheduler_type = \"cosine\",\n", " optim = \"adamw_8bit\",\n", diff --git a/nb/Kaggle-Qwen2_5_7B_VL_GRPO.ipynb b/nb/Kaggle-Qwen2_5_7B_VL_GRPO.ipynb index b1a8d904..66f05958 100644 --- a/nb/Kaggle-Qwen2_5_7B_VL_GRPO.ipynb +++ b/nb/Kaggle-Qwen2_5_7B_VL_GRPO.ipynb @@ -1393,7 +1393,7 @@ " learning_rate = 5e-6,\n", " adam_beta1 = 0.9,\n", " adam_beta2 = 0.99,\n", - " weight_decay = 0.1,\n", + " weight_decay = 0.001,\n", " warmup_ratio = 0.1,\n", " lr_scheduler_type = \"cosine\",\n", " optim = \"adamw_8bit\",\n", diff --git a/nb/Kaggle-Qwen3_8B_FP8_GRPO.ipynb b/nb/Kaggle-Qwen3_8B_FP8_GRPO.ipynb index 213d1dd3..d256cff0 100644 --- a/nb/Kaggle-Qwen3_8B_FP8_GRPO.ipynb +++ b/nb/Kaggle-Qwen3_8B_FP8_GRPO.ipynb @@ -1403,7 +1403,7 @@ " learning_rate = 2e-4, # Reduce to 2e-5 for long training runs\n", " logging_steps = 5,\n", " optim = \"adamw_8bit\",\n", - " weight_decay = 0.01,\n", + " weight_decay = 0.001,\n", " lr_scheduler_type = \"linear\",\n", " seed = 3407,\n", " report_to = \"none\", # Use this for WandB etc\n", @@ -2610,7 +2610,7 @@ " vllm_sampling_params = vllm_sampling_params,\n", " temperature = 1.0,\n", " learning_rate = 5e-6,\n", - " weight_decay = 0.01,\n", + " weight_decay = 0.001,\n", " warmup_ratio = 0.1,\n", " lr_scheduler_type = \"linear\",\n", " optim = \"adamw_8bit\",\n", diff --git a/nb/Kaggle-Qwen3_VL_(8B)-Vision-GRPO.ipynb b/nb/Kaggle-Qwen3_VL_(8B)-Vision-GRPO.ipynb index eb11ea5e..2d2cf932 100644 --- a/nb/Kaggle-Qwen3_VL_(8B)-Vision-GRPO.ipynb +++ b/nb/Kaggle-Qwen3_VL_(8B)-Vision-GRPO.ipynb @@ -1342,7 +1342,7 @@ " learning_rate = 5e-6,\n", " adam_beta1 = 0.9,\n", " adam_beta2 = 0.99,\n", - " weight_decay = 0.1,\n", + " weight_decay = 0.001,\n", " warmup_ratio = 0.1,\n", " lr_scheduler_type = \"cosine\",\n", " optim = \"adamw_8bit\",\n", diff --git a/nb/Kaggle-TinyLlama_(1.1B)-Alpaca.ipynb b/nb/Kaggle-TinyLlama_(1.1B)-Alpaca.ipynb index fadc8c64..0b0da122 100644 --- a/nb/Kaggle-TinyLlama_(1.1B)-Alpaca.ipynb +++ b/nb/Kaggle-TinyLlama_(1.1B)-Alpaca.ipynb @@ -614,7 +614,7 @@ " learning_rate = 2e-5,\n", " logging_steps = 1,\n", " optim = \"adamw_8bit\",\n", - " weight_decay = 0.1,\n", + " weight_decay = 0.001,\n", " lr_scheduler_type = \"linear\",\n", " seed = 3407,\n", " output_dir = \"outputs\",\n", diff --git a/nb/Kaggle-gpt_oss_(20B)_Reinforcement_Learning_GRPO_Minesweeper_Game_BF16.ipynb b/nb/Kaggle-gpt_oss_(20B)_Reinforcement_Learning_GRPO_Minesweeper_Game_BF16.ipynb index f9cdb301..4bc81645 100644 --- a/nb/Kaggle-gpt_oss_(20B)_Reinforcement_Learning_GRPO_Minesweeper_Game_BF16.ipynb +++ b/nb/Kaggle-gpt_oss_(20B)_Reinforcement_Learning_GRPO_Minesweeper_Game_BF16.ipynb @@ -838,7 +838,7 @@ "training_args = GRPOConfig(\n", " temperature = 1.0,\n", " learning_rate = 5e-5,\n", - " weight_decay = 0.01,\n", + " weight_decay = 0.001,\n", " warmup_ratio = 0.1,\n", " lr_scheduler_type = \"linear\",\n", " optim = \"adamw_8bit\",\n", diff --git a/nb/Llama3.1_(8B)-GRPO.ipynb b/nb/Llama3.1_(8B)-GRPO.ipynb index 31d2ca32..2488c3d7 100644 --- a/nb/Llama3.1_(8B)-GRPO.ipynb +++ b/nb/Llama3.1_(8B)-GRPO.ipynb @@ -767,7 +767,7 @@ " learning_rate = 5e-6,\n", " adam_beta1 = 0.9,\n", " adam_beta2 = 0.99,\n", - " weight_decay = 0.1,\n", + " weight_decay = 0.001,\n", " warmup_ratio = 0.1,\n", " lr_scheduler_type = \"cosine\",\n", " optim = \"paged_adamw_8bit\",\n", diff --git a/nb/Llama_FP8_GRPO.ipynb b/nb/Llama_FP8_GRPO.ipynb index 512bb9c0..e0b9988a 100644 --- a/nb/Llama_FP8_GRPO.ipynb +++ b/nb/Llama_FP8_GRPO.ipynb @@ -1305,7 +1305,7 @@ " learning_rate = 2e-4, # Reduce to 2e-5 for long training runs\n", " logging_steps = 5,\n", " optim = \"adamw_8bit\",\n", - " weight_decay = 0.01,\n", + " weight_decay = 0.001,\n", " lr_scheduler_type = \"linear\",\n", " seed = 3407,\n", " report_to = \"none\", # Use this for WandB etc\n", @@ -2520,7 +2520,7 @@ " vllm_sampling_params = vllm_sampling_params,\n", " temperature = 1.0,\n", " learning_rate = 5e-6,\n", - " weight_decay = 0.01,\n", + " weight_decay = 0.001,\n", " warmup_ratio = 0.1,\n", " lr_scheduler_type = \"linear\",\n", " optim = \"adamw_8bit\",\n", diff --git a/nb/Mistral_v0.3_(7B)-GRPO.ipynb b/nb/Mistral_v0.3_(7B)-GRPO.ipynb index bb3b9fdd..c85cead8 100644 --- a/nb/Mistral_v0.3_(7B)-GRPO.ipynb +++ b/nb/Mistral_v0.3_(7B)-GRPO.ipynb @@ -611,7 +611,7 @@ " learning_rate = 5e-6,\n", " adam_beta1 = 0.9,\n", " adam_beta2 = 0.99,\n", - " weight_decay = 0.1,\n", + " weight_decay = 0.001,\n", " warmup_ratio = 0.1,\n", " lr_scheduler_type = \"cosine\",\n", " optim = \"paged_adamw_8bit\",\n", diff --git a/nb/Phi_4_(14B)-GRPO.ipynb b/nb/Phi_4_(14B)-GRPO.ipynb index 9a44a9b3..3e73adb4 100644 --- a/nb/Phi_4_(14B)-GRPO.ipynb +++ b/nb/Phi_4_(14B)-GRPO.ipynb @@ -389,7 +389,7 @@ " learning_rate = 5e-6,\n", " adam_beta1 = 0.9,\n", " adam_beta2 = 0.99,\n", - " weight_decay = 0.1,\n", + " weight_decay = 0.001,\n", " warmup_ratio = 0.1,\n", " lr_scheduler_type = \"cosine\",\n", " optim = \"paged_adamw_8bit\",\n", diff --git a/nb/Qwen2.5_(3B)-GRPO.ipynb b/nb/Qwen2.5_(3B)-GRPO.ipynb index 442f1c58..1cf9cb60 100644 --- a/nb/Qwen2.5_(3B)-GRPO.ipynb +++ b/nb/Qwen2.5_(3B)-GRPO.ipynb @@ -906,7 +906,7 @@ " learning_rate = 5e-6,\n", " adam_beta1 = 0.9,\n", " adam_beta2 = 0.99,\n", - " weight_decay = 0.1,\n", + " weight_decay = 0.001,\n", " warmup_ratio = 0.1,\n", " lr_scheduler_type = \"cosine\",\n", " optim = \"adamw_8bit\",\n", diff --git a/nb/Qwen2_5_7B_VL_GRPO.ipynb b/nb/Qwen2_5_7B_VL_GRPO.ipynb index 166ddb77..d35c4c14 100644 --- a/nb/Qwen2_5_7B_VL_GRPO.ipynb +++ b/nb/Qwen2_5_7B_VL_GRPO.ipynb @@ -1401,7 +1401,7 @@ " learning_rate = 5e-6,\n", " adam_beta1 = 0.9,\n", " adam_beta2 = 0.99,\n", - " weight_decay = 0.1,\n", + " weight_decay = 0.001,\n", " warmup_ratio = 0.1,\n", " lr_scheduler_type = \"cosine\",\n", " optim = \"adamw_8bit\",\n", diff --git a/nb/Qwen3_8B_FP8_GRPO.ipynb b/nb/Qwen3_8B_FP8_GRPO.ipynb index e93294b5..8d14d7a3 100644 --- a/nb/Qwen3_8B_FP8_GRPO.ipynb +++ b/nb/Qwen3_8B_FP8_GRPO.ipynb @@ -1411,7 +1411,7 @@ " learning_rate = 2e-4, # Reduce to 2e-5 for long training runs\n", " logging_steps = 5,\n", " optim = \"adamw_8bit\",\n", - " weight_decay = 0.01,\n", + " weight_decay = 0.001,\n", " lr_scheduler_type = \"linear\",\n", " seed = 3407,\n", " report_to = \"none\", # Use this for WandB etc\n", @@ -2618,7 +2618,7 @@ " vllm_sampling_params = vllm_sampling_params,\n", " temperature = 1.0,\n", " learning_rate = 5e-6,\n", - " weight_decay = 0.01,\n", + " weight_decay = 0.001,\n", " warmup_ratio = 0.1,\n", " lr_scheduler_type = \"linear\",\n", " optim = \"adamw_8bit\",\n", diff --git a/nb/TinyLlama_(1.1B)-Alpaca.ipynb b/nb/TinyLlama_(1.1B)-Alpaca.ipynb index bb2d1df1..da0707e2 100644 --- a/nb/TinyLlama_(1.1B)-Alpaca.ipynb +++ b/nb/TinyLlama_(1.1B)-Alpaca.ipynb @@ -614,7 +614,7 @@ " learning_rate = 2e-5,\n", " logging_steps = 1,\n", " optim = \"adamw_8bit\",\n", - " weight_decay = 0.1,\n", + " weight_decay = 0.001,\n", " lr_scheduler_type = \"linear\",\n", " seed = 3407,\n", " output_dir = \"outputs\",\n", diff --git a/nb/gpt_oss_(20B)_Reinforcement_Learning_GRPO_Minesweeper_Game_BF16.ipynb b/nb/gpt_oss_(20B)_Reinforcement_Learning_GRPO_Minesweeper_Game_BF16.ipynb index f9cdb301..4bc81645 100644 --- a/nb/gpt_oss_(20B)_Reinforcement_Learning_GRPO_Minesweeper_Game_BF16.ipynb +++ b/nb/gpt_oss_(20B)_Reinforcement_Learning_GRPO_Minesweeper_Game_BF16.ipynb @@ -838,7 +838,7 @@ "training_args = GRPOConfig(\n", " temperature = 1.0,\n", " learning_rate = 5e-5,\n", - " weight_decay = 0.01,\n", + " weight_decay = 0.001,\n", " warmup_ratio = 0.1,\n", " lr_scheduler_type = \"linear\",\n", " optim = \"adamw_8bit\",\n", diff --git a/original_template/Advanced_Llama3_1_(3B)_GRPO_LoRA.ipynb b/original_template/Advanced_Llama3_1_(3B)_GRPO_LoRA.ipynb index b2a7b2d7..99ddd45a 100644 --- a/original_template/Advanced_Llama3_1_(3B)_GRPO_LoRA.ipynb +++ b/original_template/Advanced_Llama3_1_(3B)_GRPO_LoRA.ipynb @@ -805,7 +805,7 @@ "from trl import GRPOConfig, GRPOTrainer\n", "training_args = GRPOConfig(\n", " learning_rate = 5e-6,\n", - " weight_decay = 0.1,\n", + " weight_decay = 0.001,\n", " warmup_ratio = 0.1,\n", " lr_scheduler_type = \"cosine\",\n", " optim = \"adamw_torch_fused\",\n", diff --git a/original_template/Advanced_Llama3_2_(3B)_GRPO_LoRA.ipynb b/original_template/Advanced_Llama3_2_(3B)_GRPO_LoRA.ipynb index b1b0709c..ca952e47 100644 --- a/original_template/Advanced_Llama3_2_(3B)_GRPO_LoRA.ipynb +++ b/original_template/Advanced_Llama3_2_(3B)_GRPO_LoRA.ipynb @@ -1287,7 +1287,7 @@ "from trl import GRPOConfig, GRPOTrainer\n", "training_args = GRPOConfig(\n", " learning_rate = 5e-6,\n", - " weight_decay = 0.1,\n", + " weight_decay = 0.001,\n", " warmup_ratio = 0.1,\n", " lr_scheduler_type = \"cosine\",\n", " optim = \"adamw_8bit\",\n", diff --git a/original_template/Falcon_H1_(0.5B)-Alpaca.ipynb b/original_template/Falcon_H1_(0.5B)-Alpaca.ipynb index 7db57aa1..3e2c7147 100644 --- a/original_template/Falcon_H1_(0.5B)-Alpaca.ipynb +++ b/original_template/Falcon_H1_(0.5B)-Alpaca.ipynb @@ -408,7 +408,7 @@ " learning_rate=2e-4,\n", " logging_steps=1,\n", " optim=\"adamw_8bit\",\n", - " weight_decay=0.01,\n", + " weight_decay=0.001,\n", " lr_scheduler_type=\"linear\",\n", " seed=3407,\n", " output_dir=\"outputs\",\n", diff --git a/original_template/Gemma2_(2B)-Alpaca.ipynb b/original_template/Gemma2_(2B)-Alpaca.ipynb index e5322132..67521ccb 100644 --- a/original_template/Gemma2_(2B)-Alpaca.ipynb +++ b/original_template/Gemma2_(2B)-Alpaca.ipynb @@ -536,7 +536,7 @@ " learning_rate=2e-4,\n", " logging_steps=1,\n", " optim=\"adamw_8bit\",\n", - " weight_decay=0.01,\n", + " weight_decay=0.001,\n", " lr_scheduler_type=\"linear\",\n", " seed=3407,\n", " output_dir=\"outputs\",\n", diff --git a/original_template/Gemma3_(1B)-GRPO.ipynb b/original_template/Gemma3_(1B)-GRPO.ipynb index df3b895d..8eac68d5 100644 --- a/original_template/Gemma3_(1B)-GRPO.ipynb +++ b/original_template/Gemma3_(1B)-GRPO.ipynb @@ -1051,7 +1051,7 @@ " learning_rate = 5e-6,\n", " adam_beta1 = 0.9,\n", " adam_beta2 = 0.99,\n", - " weight_decay = 0.1,\n", + " weight_decay = 0.001,\n", " warmup_ratio = 0.1,\n", " lr_scheduler_type = \"cosine\",\n", " optim = \"adamw_torch_fused\",\n", diff --git a/original_template/Gemma3_(4B)-Vision-GRPO.ipynb b/original_template/Gemma3_(4B)-Vision-GRPO.ipynb index bbde825f..e7bf6a7a 100644 --- a/original_template/Gemma3_(4B)-Vision-GRPO.ipynb +++ b/original_template/Gemma3_(4B)-Vision-GRPO.ipynb @@ -1125,7 +1125,7 @@ " learning_rate = 5e-6,\n", " adam_beta1 = 0.9,\n", " adam_beta2 = 0.99,\n", - " weight_decay = 0.1,\n", + " weight_decay = 0.001,\n", " warmup_ratio = 0.1,\n", " lr_scheduler_type = \"cosine\",\n", " optim = \"adamw_8bit\",\n", diff --git a/original_template/Llama3.1_(8B)-GRPO.ipynb b/original_template/Llama3.1_(8B)-GRPO.ipynb index dacb80e5..6eab7ca0 100644 --- a/original_template/Llama3.1_(8B)-GRPO.ipynb +++ b/original_template/Llama3.1_(8B)-GRPO.ipynb @@ -742,7 +742,7 @@ " learning_rate = 5e-6,\n", " adam_beta1 = 0.9,\n", " adam_beta2 = 0.99,\n", - " weight_decay = 0.1,\n", + " weight_decay = 0.001,\n", " warmup_ratio = 0.1,\n", " lr_scheduler_type = \"cosine\",\n", " optim = \"paged_adamw_8bit\",\n", diff --git a/original_template/Llama_FP8_GRPO.ipynb b/original_template/Llama_FP8_GRPO.ipynb index f4aa1f3e..db07ab79 100644 --- a/original_template/Llama_FP8_GRPO.ipynb +++ b/original_template/Llama_FP8_GRPO.ipynb @@ -1280,7 +1280,7 @@ " learning_rate = 2e-4, # Reduce to 2e-5 for long training runs\n", " logging_steps = 5,\n", " optim = \"adamw_8bit\",\n", - " weight_decay = 0.01,\n", + " weight_decay = 0.001,\n", " lr_scheduler_type = \"linear\",\n", " seed = 3407,\n", " report_to = \"none\", # Use this for WandB etc\n", @@ -2495,7 +2495,7 @@ " vllm_sampling_params = vllm_sampling_params,\n", " temperature = 1.0,\n", " learning_rate = 5e-6,\n", - " weight_decay = 0.01,\n", + " weight_decay = 0.001,\n", " warmup_ratio = 0.1,\n", " lr_scheduler_type = \"linear\",\n", " optim = \"adamw_8bit\",\n", diff --git a/original_template/Mistral_v0.3_(7B)-GRPO.ipynb b/original_template/Mistral_v0.3_(7B)-GRPO.ipynb index c6c773b5..d9a6c4c9 100644 --- a/original_template/Mistral_v0.3_(7B)-GRPO.ipynb +++ b/original_template/Mistral_v0.3_(7B)-GRPO.ipynb @@ -586,7 +586,7 @@ " learning_rate = 5e-6,\n", " adam_beta1 = 0.9,\n", " adam_beta2 = 0.99,\n", - " weight_decay = 0.1,\n", + " weight_decay = 0.001,\n", " warmup_ratio = 0.1,\n", " lr_scheduler_type = \"cosine\",\n", " optim = \"paged_adamw_8bit\",\n", diff --git a/original_template/Phi_4_(14B)-GRPO.ipynb b/original_template/Phi_4_(14B)-GRPO.ipynb index e7f20c7e..16b4523e 100644 --- a/original_template/Phi_4_(14B)-GRPO.ipynb +++ b/original_template/Phi_4_(14B)-GRPO.ipynb @@ -364,7 +364,7 @@ " learning_rate = 5e-6,\n", " adam_beta1 = 0.9,\n", " adam_beta2 = 0.99,\n", - " weight_decay = 0.1,\n", + " weight_decay = 0.001,\n", " warmup_ratio = 0.1,\n", " lr_scheduler_type = \"cosine\",\n", " optim = \"paged_adamw_8bit\",\n", diff --git a/original_template/Qwen2.5_(3B)-GRPO.ipynb b/original_template/Qwen2.5_(3B)-GRPO.ipynb index 185f4cd8..5120b42d 100644 --- a/original_template/Qwen2.5_(3B)-GRPO.ipynb +++ b/original_template/Qwen2.5_(3B)-GRPO.ipynb @@ -881,7 +881,7 @@ " learning_rate = 5e-6,\n", " adam_beta1 = 0.9,\n", " adam_beta2 = 0.99,\n", - " weight_decay = 0.1,\n", + " weight_decay = 0.001,\n", " warmup_ratio = 0.1,\n", " lr_scheduler_type = \"cosine\",\n", " optim = \"adamw_8bit\",\n", diff --git a/original_template/Qwen2_5_7B_VL_GRPO.ipynb b/original_template/Qwen2_5_7B_VL_GRPO.ipynb index d42ac874..68516dec 100644 --- a/original_template/Qwen2_5_7B_VL_GRPO.ipynb +++ b/original_template/Qwen2_5_7B_VL_GRPO.ipynb @@ -1376,7 +1376,7 @@ " learning_rate = 5e-6,\n", " adam_beta1 = 0.9,\n", " adam_beta2 = 0.99,\n", - " weight_decay = 0.1,\n", + " weight_decay = 0.001,\n", " warmup_ratio = 0.1,\n", " lr_scheduler_type = \"cosine\",\n", " optim = \"adamw_8bit\",\n", diff --git a/original_template/Qwen3_8B_FP8_GRPO.ipynb b/original_template/Qwen3_8B_FP8_GRPO.ipynb index 2be098fe..197bcfce 100644 --- a/original_template/Qwen3_8B_FP8_GRPO.ipynb +++ b/original_template/Qwen3_8B_FP8_GRPO.ipynb @@ -1386,7 +1386,7 @@ " learning_rate = 2e-4, # Reduce to 2e-5 for long training runs\n", " logging_steps = 5,\n", " optim = \"adamw_8bit\",\n", - " weight_decay = 0.01,\n", + " weight_decay = 0.001,\n", " lr_scheduler_type = \"linear\",\n", " seed = 3407,\n", " report_to = \"none\", # Use this for WandB etc\n", @@ -2593,7 +2593,7 @@ " vllm_sampling_params = vllm_sampling_params,\n", " temperature = 1.0,\n", " learning_rate = 5e-6,\n", - " weight_decay = 0.01,\n", + " weight_decay = 0.001,\n", " warmup_ratio = 0.1,\n", " lr_scheduler_type = \"linear\",\n", " optim = \"adamw_8bit\",\n", diff --git a/original_template/Qwen3_VL_(8B)-Vision-GRPO.ipynb b/original_template/Qwen3_VL_(8B)-Vision-GRPO.ipynb index 292e2bbd..93a27ed3 100644 --- a/original_template/Qwen3_VL_(8B)-Vision-GRPO.ipynb +++ b/original_template/Qwen3_VL_(8B)-Vision-GRPO.ipynb @@ -1325,7 +1325,7 @@ " learning_rate = 5e-6,\n", " adam_beta1 = 0.9,\n", " adam_beta2 = 0.99,\n", - " weight_decay = 0.1,\n", + " weight_decay = 0.001,\n", " warmup_ratio = 0.1,\n", " lr_scheduler_type = \"cosine\",\n", " optim = \"adamw_8bit\",\n", diff --git a/original_template/TinyLlama_(1.1B)-Alpaca.ipynb b/original_template/TinyLlama_(1.1B)-Alpaca.ipynb index 0a2f2953..4502115c 100644 --- a/original_template/TinyLlama_(1.1B)-Alpaca.ipynb +++ b/original_template/TinyLlama_(1.1B)-Alpaca.ipynb @@ -587,7 +587,7 @@ " learning_rate = 2e-5,\n", " logging_steps = 1,\n", " optim = \"adamw_8bit\",\n", - " weight_decay = 0.1,\n", + " weight_decay = 0.001,\n", " lr_scheduler_type = \"linear\",\n", " seed = 3407,\n", " output_dir = \"outputs\",\n", diff --git a/original_template/gpt_oss_(20B)_Reinforcement_Learning_GRPO_Minesweeper_Game_BF16.ipynb b/original_template/gpt_oss_(20B)_Reinforcement_Learning_GRPO_Minesweeper_Game_BF16.ipynb index 9b519ca0..9ae3510f 100644 --- a/original_template/gpt_oss_(20B)_Reinforcement_Learning_GRPO_Minesweeper_Game_BF16.ipynb +++ b/original_template/gpt_oss_(20B)_Reinforcement_Learning_GRPO_Minesweeper_Game_BF16.ipynb @@ -838,7 +838,7 @@ "training_args = GRPOConfig(\n", " temperature = 1.0,\n", " learning_rate = 5e-5,\n", - " weight_decay = 0.01,\n", + " weight_decay = 0.001,\n", " warmup_ratio = 0.1,\n", " lr_scheduler_type = \"linear\",\n", " optim = \"adamw_8bit\",\n", diff --git a/python_scripts/Advanced_Llama3_2_(3B)_GRPO_LoRA.py b/python_scripts/Advanced_Llama3_2_(3B)_GRPO_LoRA.py index 6ca0a036..d8935ea3 100644 --- a/python_scripts/Advanced_Llama3_2_(3B)_GRPO_LoRA.py +++ b/python_scripts/Advanced_Llama3_2_(3B)_GRPO_LoRA.py @@ -352,7 +352,7 @@ def check_numbers(prompts, completions, answer, **kwargs): from trl import GRPOConfig, GRPOTrainer training_args = GRPOConfig( learning_rate = 5e-6, - weight_decay = 0.1, + weight_decay = 0.001, warmup_ratio = 0.1, lr_scheduler_type = "cosine", optim = "adamw_8bit", diff --git a/python_scripts/Falcon_H1_(0.5B)-Alpaca.py b/python_scripts/Falcon_H1_(0.5B)-Alpaca.py index 637b35e5..af2a6f40 100644 --- a/python_scripts/Falcon_H1_(0.5B)-Alpaca.py +++ b/python_scripts/Falcon_H1_(0.5B)-Alpaca.py @@ -176,7 +176,7 @@ def formatting_prompts_func(examples): learning_rate = 2e-4, logging_steps = 1, optim = "adamw_8bit", - weight_decay = 0.01, + weight_decay = 0.001, lr_scheduler_type = "linear", seed = 3407, output_dir = "outputs", diff --git a/python_scripts/Gemma2_(2B)-Alpaca.py b/python_scripts/Gemma2_(2B)-Alpaca.py index 7bc95cce..d577200d 100644 --- a/python_scripts/Gemma2_(2B)-Alpaca.py +++ b/python_scripts/Gemma2_(2B)-Alpaca.py @@ -166,7 +166,7 @@ def formatting_prompts_func(examples): learning_rate = 2e-4, logging_steps = 1, optim = "adamw_8bit", - weight_decay = 0.01, + weight_decay = 0.001, lr_scheduler_type = "linear", seed = 3407, output_dir = "outputs", diff --git a/python_scripts/Gemma3_(1B)-GRPO.py b/python_scripts/Gemma3_(1B)-GRPO.py index 6763ba50..06881b13 100644 --- a/python_scripts/Gemma3_(1B)-GRPO.py +++ b/python_scripts/Gemma3_(1B)-GRPO.py @@ -348,7 +348,7 @@ def check_numbers(prompts, completions, answer, **kwargs): learning_rate = 5e-6, adam_beta1 = 0.9, adam_beta2 = 0.99, - weight_decay = 0.1, + weight_decay = 0.001, warmup_ratio = 0.1, lr_scheduler_type = "cosine", optim = "adamw_torch_fused", diff --git a/python_scripts/Gemma3_(4B)-Vision-GRPO.py b/python_scripts/Gemma3_(4B)-Vision-GRPO.py index a7df8d58..5074ff7a 100644 --- a/python_scripts/Gemma3_(4B)-Vision-GRPO.py +++ b/python_scripts/Gemma3_(4B)-Vision-GRPO.py @@ -312,7 +312,7 @@ def correctness_reward_func(prompts, completions, answer, **kwargs) -> list[floa learning_rate = 5e-6, adam_beta1 = 0.9, adam_beta2 = 0.99, - weight_decay = 0.1, + weight_decay = 0.001, warmup_ratio = 0.1, lr_scheduler_type = "cosine", optim = "adamw_8bit", diff --git a/python_scripts/HuggingFace Course-Advanced_Llama3_1_(3B)_GRPO_LoRA.py b/python_scripts/HuggingFace Course-Advanced_Llama3_1_(3B)_GRPO_LoRA.py index 05efe9a8..a1554858 100644 --- a/python_scripts/HuggingFace Course-Advanced_Llama3_1_(3B)_GRPO_LoRA.py +++ b/python_scripts/HuggingFace Course-Advanced_Llama3_1_(3B)_GRPO_LoRA.py @@ -352,7 +352,7 @@ def check_numbers(prompts, completions, answer, **kwargs): from trl import GRPOConfig, GRPOTrainer training_args = GRPOConfig( learning_rate = 5e-6, - weight_decay = 0.1, + weight_decay = 0.001, warmup_ratio = 0.1, lr_scheduler_type = "cosine", optim = "adamw_torch_fused", diff --git a/python_scripts/HuggingFace Course-Advanced_Llama3_2_(3B)_GRPO_LoRA.py b/python_scripts/HuggingFace Course-Advanced_Llama3_2_(3B)_GRPO_LoRA.py index 6ca0a036..d8935ea3 100644 --- a/python_scripts/HuggingFace Course-Advanced_Llama3_2_(3B)_GRPO_LoRA.py +++ b/python_scripts/HuggingFace Course-Advanced_Llama3_2_(3B)_GRPO_LoRA.py @@ -352,7 +352,7 @@ def check_numbers(prompts, completions, answer, **kwargs): from trl import GRPOConfig, GRPOTrainer training_args = GRPOConfig( learning_rate = 5e-6, - weight_decay = 0.1, + weight_decay = 0.001, warmup_ratio = 0.1, lr_scheduler_type = "cosine", optim = "adamw_8bit", diff --git a/python_scripts/HuggingFace Course-Gemma3_(1B)-GRPO.py b/python_scripts/HuggingFace Course-Gemma3_(1B)-GRPO.py index 5af24a72..5484158b 100644 --- a/python_scripts/HuggingFace Course-Gemma3_(1B)-GRPO.py +++ b/python_scripts/HuggingFace Course-Gemma3_(1B)-GRPO.py @@ -350,7 +350,7 @@ def check_numbers(prompts, completions, answer, **kwargs): learning_rate = 5e-6, adam_beta1 = 0.9, adam_beta2 = 0.99, - weight_decay = 0.1, + weight_decay = 0.001, warmup_ratio = 0.1, lr_scheduler_type = "cosine", optim = "adamw_torch_fused", diff --git a/python_scripts/HuggingFace Course-Gemma3_(4B)-Vision-GRPO.py b/python_scripts/HuggingFace Course-Gemma3_(4B)-Vision-GRPO.py index 6fcfda06..460f97d4 100644 --- a/python_scripts/HuggingFace Course-Gemma3_(4B)-Vision-GRPO.py +++ b/python_scripts/HuggingFace Course-Gemma3_(4B)-Vision-GRPO.py @@ -314,7 +314,7 @@ def correctness_reward_func(prompts, completions, answer, **kwargs) -> list[floa learning_rate = 5e-6, adam_beta1 = 0.9, adam_beta2 = 0.99, - weight_decay = 0.1, + weight_decay = 0.001, warmup_ratio = 0.1, lr_scheduler_type = "cosine", optim = "adamw_8bit", diff --git a/python_scripts/HuggingFace Course-Llama3.1_(8B)-GRPO.py b/python_scripts/HuggingFace Course-Llama3.1_(8B)-GRPO.py index bf0b8b4f..0a541979 100644 --- a/python_scripts/HuggingFace Course-Llama3.1_(8B)-GRPO.py +++ b/python_scripts/HuggingFace Course-Llama3.1_(8B)-GRPO.py @@ -212,7 +212,7 @@ def xmlcount_reward_func(completions, **kwargs) -> list[float]: learning_rate = 5e-6, adam_beta1 = 0.9, adam_beta2 = 0.99, - weight_decay = 0.1, + weight_decay = 0.001, warmup_ratio = 0.1, lr_scheduler_type = "cosine", optim = "paged_adamw_8bit", diff --git a/python_scripts/HuggingFace Course-Llama_FP8_GRPO.py b/python_scripts/HuggingFace Course-Llama_FP8_GRPO.py index 62bb1221..2514a3f6 100644 --- a/python_scripts/HuggingFace Course-Llama_FP8_GRPO.py +++ b/python_scripts/HuggingFace Course-Llama_FP8_GRPO.py @@ -308,7 +308,7 @@ def format_dataset(x): learning_rate = 2e-4, # Reduce to 2e-5 for long training runs logging_steps = 5, optim = "adamw_8bit", - weight_decay = 0.01, + weight_decay = 0.001, lr_scheduler_type = "linear", seed = 3407, report_to = "none", # Use this for WandB etc @@ -669,7 +669,7 @@ def check_numbers(prompts, completions, answer, **kwargs): vllm_sampling_params = vllm_sampling_params, temperature = 1.0, learning_rate = 5e-6, - weight_decay = 0.01, + weight_decay = 0.001, warmup_ratio = 0.1, lr_scheduler_type = "linear", optim = "adamw_8bit", diff --git a/python_scripts/HuggingFace Course-Mistral_v0.3_(7B)-GRPO.py b/python_scripts/HuggingFace Course-Mistral_v0.3_(7B)-GRPO.py index 5f3528de..2a828b8c 100644 --- a/python_scripts/HuggingFace Course-Mistral_v0.3_(7B)-GRPO.py +++ b/python_scripts/HuggingFace Course-Mistral_v0.3_(7B)-GRPO.py @@ -212,7 +212,7 @@ def xmlcount_reward_func(completions, **kwargs) -> list[float]: learning_rate = 5e-6, adam_beta1 = 0.9, adam_beta2 = 0.99, - weight_decay = 0.1, + weight_decay = 0.001, warmup_ratio = 0.1, lr_scheduler_type = "cosine", optim = "paged_adamw_8bit", diff --git a/python_scripts/HuggingFace Course-Phi_4_(14B)-GRPO.py b/python_scripts/HuggingFace Course-Phi_4_(14B)-GRPO.py index 882102f7..93cd2a42 100644 --- a/python_scripts/HuggingFace Course-Phi_4_(14B)-GRPO.py +++ b/python_scripts/HuggingFace Course-Phi_4_(14B)-GRPO.py @@ -208,7 +208,7 @@ def xmlcount_reward_func(completions, **kwargs) -> list[float]: learning_rate = 5e-6, adam_beta1 = 0.9, adam_beta2 = 0.99, - weight_decay = 0.1, + weight_decay = 0.001, warmup_ratio = 0.1, lr_scheduler_type = "cosine", optim = "paged_adamw_8bit", diff --git a/python_scripts/HuggingFace Course-Qwen2.5_(3B)-GRPO.py b/python_scripts/HuggingFace Course-Qwen2.5_(3B)-GRPO.py index d0ba14c0..90ea0e0c 100644 --- a/python_scripts/HuggingFace Course-Qwen2.5_(3B)-GRPO.py +++ b/python_scripts/HuggingFace Course-Qwen2.5_(3B)-GRPO.py @@ -211,7 +211,7 @@ def xmlcount_reward_func(completions, **kwargs) -> list[float]: learning_rate = 5e-6, adam_beta1 = 0.9, adam_beta2 = 0.99, - weight_decay = 0.1, + weight_decay = 0.001, warmup_ratio = 0.1, lr_scheduler_type = "cosine", optim = "adamw_8bit", diff --git a/python_scripts/HuggingFace Course-Qwen2_5_7B_VL_GRPO.py b/python_scripts/HuggingFace Course-Qwen2_5_7B_VL_GRPO.py index d14d9fa2..7af6dddb 100644 --- a/python_scripts/HuggingFace Course-Qwen2_5_7B_VL_GRPO.py +++ b/python_scripts/HuggingFace Course-Qwen2_5_7B_VL_GRPO.py @@ -329,7 +329,7 @@ def correctness_reward_func(prompts, completions, answer, **kwargs) -> list[floa learning_rate = 5e-6, adam_beta1 = 0.9, adam_beta2 = 0.99, - weight_decay = 0.1, + weight_decay = 0.001, warmup_ratio = 0.1, lr_scheduler_type = "cosine", optim = "adamw_8bit", diff --git a/python_scripts/HuggingFace Course-Qwen3_8B_FP8_GRPO.py b/python_scripts/HuggingFace Course-Qwen3_8B_FP8_GRPO.py index de115cea..092041ba 100644 --- a/python_scripts/HuggingFace Course-Qwen3_8B_FP8_GRPO.py +++ b/python_scripts/HuggingFace Course-Qwen3_8B_FP8_GRPO.py @@ -311,7 +311,7 @@ def format_dataset(x): learning_rate = 2e-4, # Reduce to 2e-5 for long training runs logging_steps = 5, optim = "adamw_8bit", - weight_decay = 0.01, + weight_decay = 0.001, lr_scheduler_type = "linear", seed = 3407, report_to = "none", # Use this for WandB etc @@ -670,7 +670,7 @@ def check_numbers(prompts, completions, answer, **kwargs): vllm_sampling_params = vllm_sampling_params, temperature = 1.0, learning_rate = 5e-6, - weight_decay = 0.01, + weight_decay = 0.001, warmup_ratio = 0.1, lr_scheduler_type = "linear", optim = "adamw_8bit", diff --git a/python_scripts/HuggingFace Course-Qwen3_VL_(8B)-Vision-GRPO.py b/python_scripts/HuggingFace Course-Qwen3_VL_(8B)-Vision-GRPO.py index 4a1d1b49..5744d2de 100644 --- a/python_scripts/HuggingFace Course-Qwen3_VL_(8B)-Vision-GRPO.py +++ b/python_scripts/HuggingFace Course-Qwen3_VL_(8B)-Vision-GRPO.py @@ -334,7 +334,7 @@ def correctness_reward_func(prompts, completions, answer, **kwargs) -> list[floa learning_rate = 5e-6, adam_beta1 = 0.9, adam_beta2 = 0.99, - weight_decay = 0.1, + weight_decay = 0.001, warmup_ratio = 0.1, lr_scheduler_type = "cosine", optim = "adamw_8bit", diff --git a/python_scripts/HuggingFace Course-gpt_oss_(20B)_Reinforcement_Learning_GRPO_Minesweeper_Game_BF16.py b/python_scripts/HuggingFace Course-gpt_oss_(20B)_Reinforcement_Learning_GRPO_Minesweeper_Game_BF16.py index d067d698..a17a6221 100644 --- a/python_scripts/HuggingFace Course-gpt_oss_(20B)_Reinforcement_Learning_GRPO_Minesweeper_Game_BF16.py +++ b/python_scripts/HuggingFace Course-gpt_oss_(20B)_Reinforcement_Learning_GRPO_Minesweeper_Game_BF16.py @@ -741,7 +741,7 @@ def generate_game_states(num_samples = 1000, rows = 6, cols = 6, num_mines = 5, training_args = GRPOConfig( temperature = 1.0, learning_rate = 5e-5, - weight_decay = 0.01, + weight_decay = 0.001, warmup_ratio = 0.1, lr_scheduler_type = "linear", optim = "adamw_8bit", diff --git a/python_scripts/Kaggle-Advanced_Llama3_1_(3B)_GRPO_LoRA.py b/python_scripts/Kaggle-Advanced_Llama3_1_(3B)_GRPO_LoRA.py index c50ae135..3b865ba5 100644 --- a/python_scripts/Kaggle-Advanced_Llama3_1_(3B)_GRPO_LoRA.py +++ b/python_scripts/Kaggle-Advanced_Llama3_1_(3B)_GRPO_LoRA.py @@ -327,7 +327,7 @@ def check_numbers(prompts, completions, answer, **kwargs): from trl import GRPOConfig, GRPOTrainer training_args = GRPOConfig( learning_rate = 5e-6, - weight_decay = 0.1, + weight_decay = 0.001, warmup_ratio = 0.1, lr_scheduler_type = "cosine", optim = "adamw_torch_fused", diff --git a/python_scripts/Kaggle-Advanced_Llama3_2_(3B)_GRPO_LoRA.py b/python_scripts/Kaggle-Advanced_Llama3_2_(3B)_GRPO_LoRA.py index 0ae3725c..83aab641 100644 --- a/python_scripts/Kaggle-Advanced_Llama3_2_(3B)_GRPO_LoRA.py +++ b/python_scripts/Kaggle-Advanced_Llama3_2_(3B)_GRPO_LoRA.py @@ -329,7 +329,7 @@ def check_numbers(prompts, completions, answer, **kwargs): from trl import GRPOConfig, GRPOTrainer training_args = GRPOConfig( learning_rate = 5e-6, - weight_decay = 0.1, + weight_decay = 0.001, warmup_ratio = 0.1, lr_scheduler_type = "cosine", optim = "adamw_8bit", diff --git a/python_scripts/Kaggle-Falcon_H1_(0.5B)-Alpaca.py b/python_scripts/Kaggle-Falcon_H1_(0.5B)-Alpaca.py index 4adb07ff..75f20929 100644 --- a/python_scripts/Kaggle-Falcon_H1_(0.5B)-Alpaca.py +++ b/python_scripts/Kaggle-Falcon_H1_(0.5B)-Alpaca.py @@ -176,7 +176,7 @@ def formatting_prompts_func(examples): learning_rate = 2e-4, logging_steps = 1, optim = "adamw_8bit", - weight_decay = 0.01, + weight_decay = 0.001, lr_scheduler_type = "linear", seed = 3407, output_dir = "outputs", diff --git a/python_scripts/Kaggle-Gemma2_(2B)-Alpaca.py b/python_scripts/Kaggle-Gemma2_(2B)-Alpaca.py index 90662b60..2adf77ff 100644 --- a/python_scripts/Kaggle-Gemma2_(2B)-Alpaca.py +++ b/python_scripts/Kaggle-Gemma2_(2B)-Alpaca.py @@ -166,7 +166,7 @@ def formatting_prompts_func(examples): learning_rate = 2e-4, logging_steps = 1, optim = "adamw_8bit", - weight_decay = 0.01, + weight_decay = 0.001, lr_scheduler_type = "linear", seed = 3407, output_dir = "outputs", diff --git a/python_scripts/Kaggle-Gemma3_(1B)-GRPO.py b/python_scripts/Kaggle-Gemma3_(1B)-GRPO.py index a71119c2..c36ffa6b 100644 --- a/python_scripts/Kaggle-Gemma3_(1B)-GRPO.py +++ b/python_scripts/Kaggle-Gemma3_(1B)-GRPO.py @@ -325,7 +325,7 @@ def check_numbers(prompts, completions, answer, **kwargs): learning_rate = 5e-6, adam_beta1 = 0.9, adam_beta2 = 0.99, - weight_decay = 0.1, + weight_decay = 0.001, warmup_ratio = 0.1, lr_scheduler_type = "cosine", optim = "adamw_torch_fused", diff --git a/python_scripts/Kaggle-Gemma3_(4B)-Vision-GRPO.py b/python_scripts/Kaggle-Gemma3_(4B)-Vision-GRPO.py index a42f086f..9c885808 100644 --- a/python_scripts/Kaggle-Gemma3_(4B)-Vision-GRPO.py +++ b/python_scripts/Kaggle-Gemma3_(4B)-Vision-GRPO.py @@ -289,7 +289,7 @@ def correctness_reward_func(prompts, completions, answer, **kwargs) -> list[floa learning_rate = 5e-6, adam_beta1 = 0.9, adam_beta2 = 0.99, - weight_decay = 0.1, + weight_decay = 0.001, warmup_ratio = 0.1, lr_scheduler_type = "cosine", optim = "adamw_8bit", diff --git a/python_scripts/Kaggle-Llama3.1_(8B)-GRPO.py b/python_scripts/Kaggle-Llama3.1_(8B)-GRPO.py index 1f9d0b8e..4769a292 100644 --- a/python_scripts/Kaggle-Llama3.1_(8B)-GRPO.py +++ b/python_scripts/Kaggle-Llama3.1_(8B)-GRPO.py @@ -187,7 +187,7 @@ def xmlcount_reward_func(completions, **kwargs) -> list[float]: learning_rate = 5e-6, adam_beta1 = 0.9, adam_beta2 = 0.99, - weight_decay = 0.1, + weight_decay = 0.001, warmup_ratio = 0.1, lr_scheduler_type = "cosine", optim = "paged_adamw_8bit", diff --git a/python_scripts/Kaggle-Llama_FP8_GRPO.py b/python_scripts/Kaggle-Llama_FP8_GRPO.py index b31d380f..c852772c 100644 --- a/python_scripts/Kaggle-Llama_FP8_GRPO.py +++ b/python_scripts/Kaggle-Llama_FP8_GRPO.py @@ -283,7 +283,7 @@ def format_dataset(x): learning_rate = 2e-4, # Reduce to 2e-5 for long training runs logging_steps = 5, optim = "adamw_8bit", - weight_decay = 0.01, + weight_decay = 0.001, lr_scheduler_type = "linear", seed = 3407, report_to = "none", # Use this for WandB etc @@ -644,7 +644,7 @@ def check_numbers(prompts, completions, answer, **kwargs): vllm_sampling_params = vllm_sampling_params, temperature = 1.0, learning_rate = 5e-6, - weight_decay = 0.01, + weight_decay = 0.001, warmup_ratio = 0.1, lr_scheduler_type = "linear", optim = "adamw_8bit", diff --git a/python_scripts/Kaggle-Mistral_v0.3_(7B)-GRPO.py b/python_scripts/Kaggle-Mistral_v0.3_(7B)-GRPO.py index 02f5598a..30e8135a 100644 --- a/python_scripts/Kaggle-Mistral_v0.3_(7B)-GRPO.py +++ b/python_scripts/Kaggle-Mistral_v0.3_(7B)-GRPO.py @@ -187,7 +187,7 @@ def xmlcount_reward_func(completions, **kwargs) -> list[float]: learning_rate = 5e-6, adam_beta1 = 0.9, adam_beta2 = 0.99, - weight_decay = 0.1, + weight_decay = 0.001, warmup_ratio = 0.1, lr_scheduler_type = "cosine", optim = "paged_adamw_8bit", diff --git a/python_scripts/Kaggle-Phi_4_(14B)-GRPO.py b/python_scripts/Kaggle-Phi_4_(14B)-GRPO.py index 9ddc8e5a..512bad08 100644 --- a/python_scripts/Kaggle-Phi_4_(14B)-GRPO.py +++ b/python_scripts/Kaggle-Phi_4_(14B)-GRPO.py @@ -183,7 +183,7 @@ def xmlcount_reward_func(completions, **kwargs) -> list[float]: learning_rate = 5e-6, adam_beta1 = 0.9, adam_beta2 = 0.99, - weight_decay = 0.1, + weight_decay = 0.001, warmup_ratio = 0.1, lr_scheduler_type = "cosine", optim = "paged_adamw_8bit", diff --git a/python_scripts/Kaggle-Qwen2.5_(3B)-GRPO.py b/python_scripts/Kaggle-Qwen2.5_(3B)-GRPO.py index 43c090f9..bd5a8a21 100644 --- a/python_scripts/Kaggle-Qwen2.5_(3B)-GRPO.py +++ b/python_scripts/Kaggle-Qwen2.5_(3B)-GRPO.py @@ -186,7 +186,7 @@ def xmlcount_reward_func(completions, **kwargs) -> list[float]: learning_rate = 5e-6, adam_beta1 = 0.9, adam_beta2 = 0.99, - weight_decay = 0.1, + weight_decay = 0.001, warmup_ratio = 0.1, lr_scheduler_type = "cosine", optim = "adamw_8bit", diff --git a/python_scripts/Kaggle-Qwen2_5_7B_VL_GRPO.py b/python_scripts/Kaggle-Qwen2_5_7B_VL_GRPO.py index deb69ce0..5cf61cf3 100644 --- a/python_scripts/Kaggle-Qwen2_5_7B_VL_GRPO.py +++ b/python_scripts/Kaggle-Qwen2_5_7B_VL_GRPO.py @@ -304,7 +304,7 @@ def correctness_reward_func(prompts, completions, answer, **kwargs) -> list[floa learning_rate = 5e-6, adam_beta1 = 0.9, adam_beta2 = 0.99, - weight_decay = 0.1, + weight_decay = 0.001, warmup_ratio = 0.1, lr_scheduler_type = "cosine", optim = "adamw_8bit", diff --git a/python_scripts/Kaggle-Qwen3_8B_FP8_GRPO.py b/python_scripts/Kaggle-Qwen3_8B_FP8_GRPO.py index cceb48f0..ccffa313 100644 --- a/python_scripts/Kaggle-Qwen3_8B_FP8_GRPO.py +++ b/python_scripts/Kaggle-Qwen3_8B_FP8_GRPO.py @@ -286,7 +286,7 @@ def format_dataset(x): learning_rate = 2e-4, # Reduce to 2e-5 for long training runs logging_steps = 5, optim = "adamw_8bit", - weight_decay = 0.01, + weight_decay = 0.001, lr_scheduler_type = "linear", seed = 3407, report_to = "none", # Use this for WandB etc @@ -645,7 +645,7 @@ def check_numbers(prompts, completions, answer, **kwargs): vllm_sampling_params = vllm_sampling_params, temperature = 1.0, learning_rate = 5e-6, - weight_decay = 0.01, + weight_decay = 0.001, warmup_ratio = 0.1, lr_scheduler_type = "linear", optim = "adamw_8bit", diff --git a/python_scripts/Kaggle-Qwen3_VL_(8B)-Vision-GRPO.py b/python_scripts/Kaggle-Qwen3_VL_(8B)-Vision-GRPO.py index ed5fd6e6..4876d1e0 100644 --- a/python_scripts/Kaggle-Qwen3_VL_(8B)-Vision-GRPO.py +++ b/python_scripts/Kaggle-Qwen3_VL_(8B)-Vision-GRPO.py @@ -309,7 +309,7 @@ def correctness_reward_func(prompts, completions, answer, **kwargs) -> list[floa learning_rate = 5e-6, adam_beta1 = 0.9, adam_beta2 = 0.99, - weight_decay = 0.1, + weight_decay = 0.001, warmup_ratio = 0.1, lr_scheduler_type = "cosine", optim = "adamw_8bit", diff --git a/python_scripts/Kaggle-TinyLlama_(1.1B)-Alpaca.py b/python_scripts/Kaggle-TinyLlama_(1.1B)-Alpaca.py index a86d3ac5..e8b1bafa 100644 --- a/python_scripts/Kaggle-TinyLlama_(1.1B)-Alpaca.py +++ b/python_scripts/Kaggle-TinyLlama_(1.1B)-Alpaca.py @@ -161,7 +161,7 @@ def formatting_prompts_func(examples): learning_rate = 2e-5, logging_steps = 1, optim = "adamw_8bit", - weight_decay = 0.1, + weight_decay = 0.001, lr_scheduler_type = "linear", seed = 3407, output_dir = "outputs", diff --git a/python_scripts/Kaggle-gpt_oss_(20B)_Reinforcement_Learning_GRPO_Minesweeper_Game_BF16.py b/python_scripts/Kaggle-gpt_oss_(20B)_Reinforcement_Learning_GRPO_Minesweeper_Game_BF16.py index d067d698..a17a6221 100644 --- a/python_scripts/Kaggle-gpt_oss_(20B)_Reinforcement_Learning_GRPO_Minesweeper_Game_BF16.py +++ b/python_scripts/Kaggle-gpt_oss_(20B)_Reinforcement_Learning_GRPO_Minesweeper_Game_BF16.py @@ -741,7 +741,7 @@ def generate_game_states(num_samples = 1000, rows = 6, cols = 6, num_mines = 5, training_args = GRPOConfig( temperature = 1.0, learning_rate = 5e-5, - weight_decay = 0.01, + weight_decay = 0.001, warmup_ratio = 0.1, lr_scheduler_type = "linear", optim = "adamw_8bit", diff --git a/python_scripts/Llama3.1_(8B)-GRPO.py b/python_scripts/Llama3.1_(8B)-GRPO.py index d83e4a05..4ee74315 100644 --- a/python_scripts/Llama3.1_(8B)-GRPO.py +++ b/python_scripts/Llama3.1_(8B)-GRPO.py @@ -210,7 +210,7 @@ def xmlcount_reward_func(completions, **kwargs) -> list[float]: learning_rate = 5e-6, adam_beta1 = 0.9, adam_beta2 = 0.99, - weight_decay = 0.1, + weight_decay = 0.001, warmup_ratio = 0.1, lr_scheduler_type = "cosine", optim = "paged_adamw_8bit", diff --git a/python_scripts/Llama_FP8_GRPO.py b/python_scripts/Llama_FP8_GRPO.py index db6060d3..0511fabf 100644 --- a/python_scripts/Llama_FP8_GRPO.py +++ b/python_scripts/Llama_FP8_GRPO.py @@ -306,7 +306,7 @@ def format_dataset(x): learning_rate = 2e-4, # Reduce to 2e-5 for long training runs logging_steps = 5, optim = "adamw_8bit", - weight_decay = 0.01, + weight_decay = 0.001, lr_scheduler_type = "linear", seed = 3407, report_to = "none", # Use this for WandB etc @@ -667,7 +667,7 @@ def check_numbers(prompts, completions, answer, **kwargs): vllm_sampling_params = vllm_sampling_params, temperature = 1.0, learning_rate = 5e-6, - weight_decay = 0.01, + weight_decay = 0.001, warmup_ratio = 0.1, lr_scheduler_type = "linear", optim = "adamw_8bit", diff --git a/python_scripts/Mistral_v0.3_(7B)-GRPO.py b/python_scripts/Mistral_v0.3_(7B)-GRPO.py index b3ca59ac..d0196d07 100644 --- a/python_scripts/Mistral_v0.3_(7B)-GRPO.py +++ b/python_scripts/Mistral_v0.3_(7B)-GRPO.py @@ -210,7 +210,7 @@ def xmlcount_reward_func(completions, **kwargs) -> list[float]: learning_rate = 5e-6, adam_beta1 = 0.9, adam_beta2 = 0.99, - weight_decay = 0.1, + weight_decay = 0.001, warmup_ratio = 0.1, lr_scheduler_type = "cosine", optim = "paged_adamw_8bit", diff --git a/python_scripts/Phi_4_(14B)-GRPO.py b/python_scripts/Phi_4_(14B)-GRPO.py index 50301864..e39ca85c 100644 --- a/python_scripts/Phi_4_(14B)-GRPO.py +++ b/python_scripts/Phi_4_(14B)-GRPO.py @@ -206,7 +206,7 @@ def xmlcount_reward_func(completions, **kwargs) -> list[float]: learning_rate = 5e-6, adam_beta1 = 0.9, adam_beta2 = 0.99, - weight_decay = 0.1, + weight_decay = 0.001, warmup_ratio = 0.1, lr_scheduler_type = "cosine", optim = "paged_adamw_8bit", diff --git a/python_scripts/Qwen2.5_(3B)-GRPO.py b/python_scripts/Qwen2.5_(3B)-GRPO.py index d3d33926..2c58444d 100644 --- a/python_scripts/Qwen2.5_(3B)-GRPO.py +++ b/python_scripts/Qwen2.5_(3B)-GRPO.py @@ -209,7 +209,7 @@ def xmlcount_reward_func(completions, **kwargs) -> list[float]: learning_rate = 5e-6, adam_beta1 = 0.9, adam_beta2 = 0.99, - weight_decay = 0.1, + weight_decay = 0.001, warmup_ratio = 0.1, lr_scheduler_type = "cosine", optim = "adamw_8bit", diff --git a/python_scripts/Qwen2_5_7B_VL_GRPO.py b/python_scripts/Qwen2_5_7B_VL_GRPO.py index 2276ca5d..d9a267c9 100644 --- a/python_scripts/Qwen2_5_7B_VL_GRPO.py +++ b/python_scripts/Qwen2_5_7B_VL_GRPO.py @@ -327,7 +327,7 @@ def correctness_reward_func(prompts, completions, answer, **kwargs) -> list[floa learning_rate = 5e-6, adam_beta1 = 0.9, adam_beta2 = 0.99, - weight_decay = 0.1, + weight_decay = 0.001, warmup_ratio = 0.1, lr_scheduler_type = "cosine", optim = "adamw_8bit", diff --git a/python_scripts/Qwen3_8B_FP8_GRPO.py b/python_scripts/Qwen3_8B_FP8_GRPO.py index 1a5b55ea..da44c4ea 100644 --- a/python_scripts/Qwen3_8B_FP8_GRPO.py +++ b/python_scripts/Qwen3_8B_FP8_GRPO.py @@ -309,7 +309,7 @@ def format_dataset(x): learning_rate = 2e-4, # Reduce to 2e-5 for long training runs logging_steps = 5, optim = "adamw_8bit", - weight_decay = 0.01, + weight_decay = 0.001, lr_scheduler_type = "linear", seed = 3407, report_to = "none", # Use this for WandB etc @@ -668,7 +668,7 @@ def check_numbers(prompts, completions, answer, **kwargs): vllm_sampling_params = vllm_sampling_params, temperature = 1.0, learning_rate = 5e-6, - weight_decay = 0.01, + weight_decay = 0.001, warmup_ratio = 0.1, lr_scheduler_type = "linear", optim = "adamw_8bit", diff --git a/python_scripts/TinyLlama_(1.1B)-Alpaca.py b/python_scripts/TinyLlama_(1.1B)-Alpaca.py index dc5ab2fc..e013ea6c 100644 --- a/python_scripts/TinyLlama_(1.1B)-Alpaca.py +++ b/python_scripts/TinyLlama_(1.1B)-Alpaca.py @@ -161,7 +161,7 @@ def formatting_prompts_func(examples): learning_rate = 2e-5, logging_steps = 1, optim = "adamw_8bit", - weight_decay = 0.1, + weight_decay = 0.001, lr_scheduler_type = "linear", seed = 3407, output_dir = "outputs", diff --git a/python_scripts/gpt_oss_(20B)_Reinforcement_Learning_GRPO_Minesweeper_Game_BF16.py b/python_scripts/gpt_oss_(20B)_Reinforcement_Learning_GRPO_Minesweeper_Game_BF16.py index d067d698..a17a6221 100644 --- a/python_scripts/gpt_oss_(20B)_Reinforcement_Learning_GRPO_Minesweeper_Game_BF16.py +++ b/python_scripts/gpt_oss_(20B)_Reinforcement_Learning_GRPO_Minesweeper_Game_BF16.py @@ -741,7 +741,7 @@ def generate_game_states(num_samples = 1000, rows = 6, cols = 6, num_mines = 5, training_args = GRPOConfig( temperature = 1.0, learning_rate = 5e-5, - weight_decay = 0.01, + weight_decay = 0.001, warmup_ratio = 0.1, lr_scheduler_type = "linear", optim = "adamw_8bit",