@@ -645,18 +645,16 @@ struct server_context {
 
         // Clear any sampling context
         for (server_slot & slot : slots) {
-            if (slot.smpl != nullptr) {
-                llama_free(slot.ctx_dft);
-                slot.ctx_dft = nullptr;
+            common_sampler_free(slot.smpl);
+            slot.smpl = nullptr;
 
-                common_speculative_free(slot.spec);
-                slot.spec = nullptr;
+            llama_free(slot.ctx_dft);
+            slot.ctx_dft = nullptr;
 
-                common_sampler_free(slot.smpl);
-                slot.smpl = nullptr;
+            common_speculative_free(slot.spec);
+            slot.spec = nullptr;
 
-                llama_batch_free(slot.batch_spec);
-            }
+            llama_batch_free(slot.batch_spec);
         }
 
         llama_batch_free(batch);
@@ -688,15 +686,9 @@ struct server_context {
 
             auto params_dft = params;
 
-            params_dft.model = params.model_draft;
+            params_dft.model        = params.model_draft;
             params_dft.n_gpu_layers = params.n_gpu_layers_draft;
 
-            if (params.draft_cpuparams.n_threads > 0) {
-                params_dft.cpuparams.n_threads = params.draft_cpuparams.n_threads;
-            }
-
-            params_dft.cpuparams_batch.n_threads = params.draft_cpuparams_batch.n_threads;
-
             common_init_result llama_init_dft = common_init_from_params(params_dft);
 
             model_dft = llama_init_dft.model;
@@ -708,10 +700,15 @@ struct server_context {
 
             if (!common_speculative_are_compatible(ctx, llama_init_dft.context)) {
                 SRV_ERR("the draft model '%s' is not compatible with the target model '%s'\n", params.model_draft.c_str(), params.model.c_str());
+
+                llama_free(llama_init_dft.context);
+                llama_free_model(llama_init_dft.model);
+
                 return false;
             }
 
             cparams_dft = common_context_params_to_llama(params);
+            cparams_dft.n_batch = llama_n_ctx(llama_init_dft.context);
 
             // the context is not needed - we will create one for each slot
             llama_free(llama_init_dft.context);
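
For reference, a minimal sketch of how the per-slot teardown in `~server_context()` reads after this change. The comments are added here for clarity, and the sketch assumes the `*_free` helpers tolerate a `nullptr` argument, which is why the old `slot.smpl != nullptr` guard can be dropped:

```cpp
// Per-slot cleanup of speculative-decoding resources (sketch, not the full destructor).
for (server_slot & slot : slots) {
    common_sampler_free(slot.smpl);      // per-slot sampling state
    slot.smpl = nullptr;

    llama_free(slot.ctx_dft);            // draft-model context used for speculation
    slot.ctx_dft = nullptr;

    common_speculative_free(slot.spec);  // speculative-decoding state
    slot.spec = nullptr;

    llama_batch_free(slot.batch_spec);   // batch holding the drafted tokens
}
```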