3838from fastdeploy .engine .common_engine import EngineService
3939from fastdeploy .engine .expert_service import start_data_parallel_service
4040from fastdeploy .engine .request import Request
41- from fastdeploy .input .preprocess import InputPreprocessor
4241from fastdeploy .inter_communicator import EngineWorkerQueue , IPCSignal
4342from fastdeploy .metrics .metrics import main_process_metrics
4443from fastdeploy .utils import EngineError , console_logger , envs , llm_logger
@@ -87,13 +86,6 @@ def __init__(self, cfg):
8786 self .running = True
8887 self .is_started = False
8988
90- self .input_processor = InputPreprocessor (
91- cfg .model_config ,
92- cfg .structured_outputs_config .reasoning_parser ,
93- cfg .limit_mm_per_prompt ,
94- cfg .mm_processor_kwargs ,
95- cfg .tool_parser ,
96- )
9789 self .engine = EngineService (cfg )
9890
9991 if self .cfg .cache_config .num_gpu_blocks_override is None :
@@ -117,12 +109,12 @@ def start(self, api_server_pid=None):
117109 self .ipc_signal_suffix = self .cfg .parallel_config .engine_worker_queue_port [0 ]
118110 self ._init_worker_signals ()
119111
120- self .data_processor = self .input_processor .create_processor ()
121- self .engine .data_processor = self .data_processor
122112 # Launch components: scheduler, cache_manager, expert_service et al.
123113 self .launch_components ()
124114
125115 self .engine .start ()
116+ self .engine .create_data_processor ()
117+ self .data_processor = self .engine .data_processor
126118
127119 # If block number is specified and model is deployed in mixed mode, start cache manager first
128120 if not self .do_profile and self .cfg .scheduler_config .splitwise_role != "mixed" :
@@ -246,7 +238,7 @@ def add_requests(self, task, sampling_params=None, **kwargs):
246238 chat_template_kwargs = kwargs .get ("chat_template_kwargs" ) or {}
247239 chat_template_kwargs ["chat_template" ] = kwargs .get ("chat_template" )
248240 kwargs ["chat_template_kwargs" ] = chat_template_kwargs
249- request = self .data_processor .process_request (request , self .cfg .model_config .max_model_len , ** kwargs )
241+ request = self .engine . data_processor .process_request (request , self .cfg .model_config .max_model_len , ** kwargs )
250242 request .prompt_token_ids_len = len (request .prompt_token_ids )
251243 request .need_prefill_tokens = request .prompt_token_ids_len
252244 input_ids_len = request .prompt_token_ids_len
@@ -482,9 +474,9 @@ def _start_worker_service(self):
482474 py_script = os .path .join (current_dir_path , worker_path )
483475
484476 ori_vocab_size = (
485- len (self .data_processor .tokenizer .sp_model )
486- if hasattr (self .data_processor .tokenizer , "sp_model" )
487- else len (self .data_processor .tokenizer .vocab )
477+ len (self .engine . data_processor .tokenizer .sp_model )
478+ if hasattr (self .engine . data_processor .tokenizer , "sp_model" )
479+ else len (self .engine . data_processor .tokenizer .vocab )
488480 )
489481
490482 think_end_id = self .data_processor .tokenizer .get_vocab ().get ("</think>" , - 1 )
@@ -511,8 +503,8 @@ def _start_worker_service(self):
511503 f" --total_block_num { self .cfg .cache_config .total_block_num } "
512504 f" --block_size { self .cfg .cache_config .block_size } "
513505 f" --enc_dec_block_num { self .cfg .cache_config .enc_dec_block_num } "
514- f" --eos_tokens_lens { self .data_processor .eos_token_id_len } "
515- f" --pad_token_id { self .data_processor .pad_token_id } "
506+ f" --eos_tokens_lens { self .engine . data_processor .eos_token_id_len } "
507+ f" --pad_token_id { self .engine . data_processor .pad_token_id } "
516508 f" --engine_pid { self .cfg .parallel_config .engine_worker_queue_port [0 ]} "
517509 f" --max_num_batched_tokens { self .cfg .scheduler_config .max_num_batched_tokens } "
518510 f" --splitwise_role { self .cfg .scheduler_config .splitwise_role } "
@@ -611,15 +603,15 @@ def generate(self, prompts, stream):
611603 for result in self ._get_generated_tokens (req_id ):
612604 is_end = result .finished
613605 if stream and not is_end :
614- processed = self .data_processor .process_response (result )
606+ processed = self .engine . data_processor .process_response (result )
615607 if processed is None :
616608 continue
617609 output = processed .to_dict ()
618610 yield output
619611
620612 # Exit loop if termination condition is met
621613 if is_end :
622- processed = self .data_processor .process_response (result )
614+ processed = self .engine . data_processor .process_response (result )
623615 output = processed .to_dict ()
624616 llm_logger .debug (f"Generate result: { output } " )
625617 if not stream :
0 commit comments