|
37 | 37 | from .llm_utils import (CachedModelLoader, KvCacheRetentionConfig, |
38 | 38 | LlmBuildStats, ModelLoader, _ModelRuntimeContext) |
39 | 39 | from .mpi_session import MpiPoolSession, external_mpi_comm_available |
40 | | -from .tokenizer import (TokenizerBase, _xgrammar_tokenizer_info) |
| 40 | +from .tokenizer import TokenizerBase, _xgrammar_tokenizer_info |
41 | 41 | # TODO[chunweiy]: move the following symbols back to utils scope, and remove the following import |
42 | 42 | from .utils import (append_docstring, exception_handler, get_device_count, |
43 | 43 | print_colored_debug, set_api_status) |
@@ -959,27 +959,8 @@ def _build_model(self): |
959 | 959 |
|
960 | 960 | assert isinstance(self.args, TorchLlmArgs) |
961 | 961 |
|
962 | | - # self._executor_config = tllm.ExecutorConfig( |
963 | | - # max_beam_width=self.args.max_beam_width, |
964 | | - # scheduler_config=PybindMirror.maybe_to_pybind( |
965 | | - # self.args.scheduler_config), |
966 | | - # max_batch_size=self.args.max_batch_size, |
967 | | - # max_num_tokens=self.args.max_num_tokens, |
968 | | - # gather_generation_logits=self.args.gather_generation_logits, |
969 | | - # fail_fast_on_attention_window_too_large=getattr( |
970 | | - # self.args, 'fail_fast_on_attention_window_too_large', False), |
971 | | - # **kwargs) |
972 | | - |
973 | | - # self._executor_config = self.args.get_executor_config(self._hf_model_dir) |
974 | | - |
975 | 962 | # TODO: revisit gather_context_logits |
976 | 963 | return_logits = self.args.gather_generation_logits |
977 | | - |
978 | | - print("---- self._executor_cls is: {}".format(self._executor_cls), |
979 | | - flush=True) |
980 | | - print("---- self._engine_dir is: {}".format(self._engine_dir), |
981 | | - flush=True) |
982 | | - |
983 | 964 | self._executor = self._executor_cls.create( |
984 | 965 | self._engine_dir, |
985 | 966 | executor_config=None, |
|
0 commit comments