vllm/entrypoints/grpc/grpc_server.py (12 additions & 11 deletions)

@@ -14,7 +14,7 @@
 from vllm import (AsyncLLMEngine, CompletionOutput, RequestOutput,
                   SamplingParams)
 from vllm.config import ModelConfig
-from vllm.entrypoints.grpc.pb import generation_pb2_grpc
+from vllm.entrypoints.grpc.pb import generation_pb2_grpc  # type: ignore
 # yapf: disable
 from vllm.entrypoints.grpc.pb.generation_pb2 import (BatchedGenerationRequest,
                                                      BatchedGenerationResponse,
@@ -54,15 +54,15 @@ async def _handle_exception(e: Exception, func, *args, **kwargs):
     if not isinstance(e, AbortError):
         if type(e).__name__ == "torch.cuda.OutOfMemoryError": #TODO check
             context = kwargs.get("context", None) or args[-1]
-            logger.exception(f"{func.__name__} caused GPU OOM error")
+            logger.exception("%s caused GPU OOM error", func.__name__)
             service_metrics.count_request_failure(FailureReasonLabel.OOM)
             await context.abort(StatusCode.RESOURCE_EXHAUSTED, str(e))
         else:
             if "generate" in func.__name__.lower():
                 service_metrics.count_request_failure(FailureReasonLabel.GENERATE)
             else:
                 service_metrics.count_request_failure(FailureReasonLabel.UNKNOWN)
-            logger.exception(f"{func.__name__} failed")
+            logger.exception("%s failed", func.__name__)
     raise e


@@ -295,7 +295,7 @@ def _convert_output(self,
             text=output.text[text_start_offset:],
             generated_token_count=len(output.token_ids),
             stop_reason=stop_reason,
-            stop_sequence=stop_sequence,
+            stop_sequence=stop_sequence if stop_sequence else '',
         )

         if resp_options.generated_tokens:
@@ -413,7 +413,8 @@ async def _validate_and_convert_params(

     @staticmethod
     def _convert_reason(output: CompletionOutput, max_is_token_limit: bool,
-                        time_limit_reached: bool) -> Tuple['StopReason', str]:
+                        time_limit_reached: bool
+                        ) -> Tuple[StopReason.ValueType, Optional[str]]:
         finish_reason = output.finish_reason
         stop_sequence = None
         if finish_reason is None:
@@ -433,20 +434,20 @@ def _convert_reason(output: CompletionOutput, max_is_token_limit: bool,
                 stop_sequence = stop_str_or_tok
             else:
                 logger.warning(
-                    f"Unexpected stop_reason type: {type(stop_str_or_tok)}"
+                    "Unexpected stop_reason type: %s", type(stop_str_or_tok)
                 )
         elif finish_reason == "abort":
             stop_reason = StopReason.CANCELLED
         else:
-            logger.warning(f"Unrecognized finish_reason: {finish_reason}")
+            logger.warning("Unrecognized finish_reason: %s", finish_reason)
             stop_reason = StopReason.CANCELLED

         return stop_reason, stop_sequence

     def _convert_tokens(
         self,
-        token_ids: list[int],
-        logprobs_list: Optional[list[Dict[int, Logprob]]],
+        token_ids: List[int],
+        logprobs_list: Optional[List[Dict[int, Logprob]]],
         include_logprobs: bool,
         include_ranks: bool,
         top_n_tokens: int,
@@ -499,7 +500,7 @@ async def _validate_prompt_and_tokenize(
# "max_length": truncate_input_tokens} \
# if truncate_input_tokens is not None else {
# "truncation": True, "max_length": max_model_len + 1}
tokenize_kwargs = {}
tokenize_kwargs: Dict[str, Any] = {}

input_ids = await self.tokenizer_group.encode_async(
prompt, **tokenize_kwargs)
@@ -661,6 +662,6 @@ async def start_grpc_server(engine: AsyncLLMEngine,
     server.add_insecure_port(listen_on)

     await server.start()
-    logger.info(f"gRPC Server started at {listen_on}")
+    logger.info("gRPC Server started at %s", listen_on)

     return server
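A side note on the typing edits in this file: the "# type: ignore" on the generated protobuf import, the annotated empty dict, the Optional return type, and the List[...] spellings are type-checker fixes rather than behavior changes (StopReason.ValueType is the enum value alias that mypy-protobuf generates for proto enums). Below is a minimal standalone sketch of the recurring patterns; it is illustrative only, not code from this PR:

    from typing import Any, Dict, List, Optional

    # mypy reports "Need type annotation" for a bare empty dict literal,
    # so the key/value types are spelled out up front.
    tokenize_kwargs: Dict[str, Any] = {}
    tokenize_kwargs["truncation"] = True

    # A function that can return None must declare Optional[...] in its
    # signature, as the _convert_reason annotation now does.
    def stop_sequence_for(finish_reason: Optional[str]) -> Optional[str]:
        # "</s>" is a hypothetical stop string, used only for illustration.
        return "</s>" if finish_reason == "stop" else None

    # typing.List still works on Python 3.8; the built-in list[int]
    # generic is only valid as an annotation on 3.9+.
    token_ids: List[int] = [1, 2, 3]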
vllm/entrypoints/openai/api_server.py (0 additions & 1 deletion)

@@ -1,4 +1,3 @@
-import argparse
 import asyncio
 import importlib
 import inspect
vllm/tgis_utils/args.py (4 additions & 4 deletions)

@@ -129,10 +129,10 @@ def postprocess_tgis_args(args: argparse.Namespace) -> argparse.Namespace:
     if args.max_batch_size is not None:
         # Existing MAX_BATCH_SIZE settings in TGIS configs may not necessarily
         # be best for vLLM so we'll just log a warning for now
-        logger.warn(
-            f"max_batch_size is set to {args.max_batch_size} but will be "
-            f"ignored for now. max_num_seqs can be used if this is still "
-            f"needed.")
+        logger.warning(
+            "max_batch_size is set to %d but will be ignored for now. "
+            "max_num_seqs can be used if this is still needed.",
+            args.max_batch_size)

     if args.tls_cert_path:
         args.ssl_certfile = args.tls_cert_path
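Two details in this hunk are easy to miss: logger.warn() is a deprecated alias of logger.warning() in the standard library, and adjacent string literals are concatenated at compile time with no separator, so the trailing space inside the first literal is what keeps "now." and "max_num_seqs" apart. A runnable sketch of the same pattern (the value 32 is a stand-in for args.max_batch_size):

    import logging

    logger = logging.getLogger(__name__)
    max_batch_size = 32  # stand-in for args.max_batch_size

    # %d defers formatting to the logging framework; note the trailing
    # space before the closing quote of the first literal.
    logger.warning(
        "max_batch_size is set to %d but will be ignored for now. "
        "max_num_seqs can be used if this is still needed.",
        max_batch_size)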
vllm/tgis_utils/logs.py (4 additions & 5 deletions)

@@ -45,11 +45,10 @@ def log_response(inputs: List[str], params: Parameters, prefix_id: str,
         level = logging.WARN
     else:
         level = logging.INFO
-    logger.log(
-        level, f"{span_str}: {kind_log} generated "
-        f"{response.generated_token_count} tokens before "
-        f"{stop_reason_str}, output {output_len} chars: "
-        f"{short_output}")
+    logger.log(level,
+               "%s: %s generated %d tokens before %s, output %d chars: %s",
+               span_str, kind_log, response.generated_token_count,
+               stop_reason_str, output_len, short_output)


 def _truncate(text: str, len_: int) -> bytes:
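The logs.py rewrite keeps a single literal format string and passes all values as lazy arguments, so the message is only interpolated when a handler actually emits the record, and log aggregators can group on the constant template. A runnable sketch of the dynamic-level pattern; the function and field names here are hypothetical, not the module's real API:

    import logging

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__name__)

    def log_response(errored: bool, token_count: int, output: str) -> None:
        # logging.WARN is an alias of logging.WARNING; the level is picked
        # at runtime and handed to logger.log() as its first argument.
        level = logging.WARNING if errored else logging.INFO
        logger.log(level, "generated %d tokens, output %d chars: %s",
                   token_count, len(output), output)

    log_response(False, 7, "hello world")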