diff --git a/docs/source/en/package_reference/inference_types.md b/docs/source/en/package_reference/inference_types.md index 8dd826336c..f313294662 100644 --- a/docs/source/en/package_reference/inference_types.md +++ b/docs/source/en/package_reference/inference_types.md @@ -63,9 +63,9 @@ This part of the lib is still under development and will be improved in future r [[autodoc]] huggingface_hub.ChatCompletionInputMessageChunk -[[autodoc]] huggingface_hub.ChatCompletionInputTool +[[autodoc]] huggingface_hub.ChatCompletionInputStreamOptions -[[autodoc]] huggingface_hub.ChatCompletionInputToolTypeClass +[[autodoc]] huggingface_hub.ChatCompletionInputToolType [[autodoc]] huggingface_hub.ChatCompletionInputURL @@ -103,6 +103,10 @@ This part of the lib is still under development and will be improved in future r [[autodoc]] huggingface_hub.ChatCompletionStreamOutputTopLogprob +[[autodoc]] huggingface_hub.ChatCompletionStreamOutputUsage + +[[autodoc]] huggingface_hub.ToolElement + ## depth_estimation @@ -219,12 +223,12 @@ This part of the lib is still under development and will be improved in future r ## summarization -[[autodoc]] huggingface_hub.SummarizationGenerationParameters - [[autodoc]] huggingface_hub.SummarizationInput [[autodoc]] huggingface_hub.SummarizationOutput +[[autodoc]] huggingface_hub.SummarizationParameters + ## table_question_answering @@ -307,6 +311,18 @@ This part of the lib is still under development and will be improved in future r +## text_to_speech + +[[autodoc]] huggingface_hub.TextToSpeechGenerationParameters + +[[autodoc]] huggingface_hub.TextToSpeechInput + +[[autodoc]] huggingface_hub.TextToSpeechOutput + +[[autodoc]] huggingface_hub.TextToSpeechParameters + + + ## token_classification [[autodoc]] huggingface_hub.TokenClassificationInput @@ -319,12 +335,12 @@ This part of the lib is still under development and will be improved in future r ## translation -[[autodoc]] huggingface_hub.TranslationGenerationParameters - [[autodoc]] huggingface_hub.TranslationInput [[autodoc]] huggingface_hub.TranslationOutput +[[autodoc]] huggingface_hub.TranslationParameters + ## video_classification diff --git a/docs/source/ko/package_reference/inference_types.md b/docs/source/ko/package_reference/inference_types.md index 27e96cb491..ef4a62a570 100644 --- a/docs/source/ko/package_reference/inference_types.md +++ b/docs/source/ko/package_reference/inference_types.md @@ -62,9 +62,9 @@ rendered properly in your Markdown viewer. [[autodoc]] huggingface_hub.ChatCompletionInputMessageChunk -[[autodoc]] huggingface_hub.ChatCompletionInputTool +[[autodoc]] huggingface_hub.ChatCompletionInputStreamOptions -[[autodoc]] huggingface_hub.ChatCompletionInputToolTypeClass +[[autodoc]] huggingface_hub.ChatCompletionInputToolType [[autodoc]] huggingface_hub.ChatCompletionInputURL @@ -102,6 +102,10 @@ rendered properly in your Markdown viewer. [[autodoc]] huggingface_hub.ChatCompletionStreamOutputTopLogprob +[[autodoc]] huggingface_hub.ChatCompletionStreamOutputUsage + +[[autodoc]] huggingface_hub.ToolElement + ## depth_estimation[[huggingface_hub.DepthEstimationInput]] @@ -216,14 +220,14 @@ rendered properly in your Markdown viewer. 
-## summarization[[huggingface_hub.SummarizationGenerationParameters]] - -[[autodoc]] huggingface_hub.SummarizationGenerationParameters +## summarization[[huggingface_hub.SummarizationInput]] [[autodoc]] huggingface_hub.SummarizationInput [[autodoc]] huggingface_hub.SummarizationOutput +[[autodoc]] huggingface_hub.SummarizationParameters + ## table_question_answering[[huggingface_hub.TableQuestionAnsweringInput]] @@ -306,6 +310,18 @@ rendered properly in your Markdown viewer. +## text_to_speech[[huggingface_hub.TextToSpeechGenerationParameters]] + +[[autodoc]] huggingface_hub.TextToSpeechGenerationParameters + +[[autodoc]] huggingface_hub.TextToSpeechInput + +[[autodoc]] huggingface_hub.TextToSpeechOutput + +[[autodoc]] huggingface_hub.TextToSpeechParameters + + + ## token_classification[[huggingface_hub.TokenClassificationInput]] [[autodoc]] huggingface_hub.TokenClassificationInput @@ -316,14 +332,14 @@ rendered properly in your Markdown viewer. -## translation[[huggingface_hub.TranslationGenerationParameters]] - -[[autodoc]] huggingface_hub.TranslationGenerationParameters +## translation[[huggingface_hub.TranslationInput]] [[autodoc]] huggingface_hub.TranslationInput [[autodoc]] huggingface_hub.TranslationOutput +[[autodoc]] huggingface_hub.TranslationParameters + ## video_classification[[huggingface_hub.VideoClassificationInput]] diff --git a/src/huggingface_hub/__init__.py b/src/huggingface_hub/__init__.py index 413fac30fd..0d18e331db 100644 --- a/src/huggingface_hub/__init__.py +++ b/src/huggingface_hub/__init__.py @@ -294,8 +294,8 @@ "ChatCompletionInputGrammarType", "ChatCompletionInputMessage", "ChatCompletionInputMessageChunk", - "ChatCompletionInputTool", - "ChatCompletionInputToolTypeClass", + "ChatCompletionInputStreamOptions", + "ChatCompletionInputToolType", "ChatCompletionInputURL", "ChatCompletionOutput", "ChatCompletionOutputComplete", @@ -314,6 +314,7 @@ "ChatCompletionStreamOutputLogprob", "ChatCompletionStreamOutputLogprobs", "ChatCompletionStreamOutputTopLogprob", + "ChatCompletionStreamOutputUsage", "DepthEstimationInput", "DepthEstimationOutput", "DocumentQuestionAnsweringInput", @@ -348,9 +349,9 @@ "QuestionAnsweringParameters", "SentenceSimilarityInput", "SentenceSimilarityInputData", - "SummarizationGenerationParameters", "SummarizationInput", "SummarizationOutput", + "SummarizationParameters", "TableQuestionAnsweringInput", "TableQuestionAnsweringInputData", "TableQuestionAnsweringOutputElement", @@ -379,12 +380,17 @@ "TextToImageOutput", "TextToImageParameters", "TextToImageTargetSize", + "TextToSpeechGenerationParameters", + "TextToSpeechInput", + "TextToSpeechOutput", + "TextToSpeechParameters", "TokenClassificationInput", "TokenClassificationOutputElement", "TokenClassificationParameters", - "TranslationGenerationParameters", + "ToolElement", "TranslationInput", "TranslationOutput", + "TranslationParameters", "VideoClassificationInput", "VideoClassificationOutputElement", "VideoClassificationParameters", @@ -802,8 +808,8 @@ def __dir__(): ChatCompletionInputGrammarType, # noqa: F401 ChatCompletionInputMessage, # noqa: F401 ChatCompletionInputMessageChunk, # noqa: F401 - ChatCompletionInputTool, # noqa: F401 - ChatCompletionInputToolTypeClass, # noqa: F401 + ChatCompletionInputStreamOptions, # noqa: F401 + ChatCompletionInputToolType, # noqa: F401 ChatCompletionInputURL, # noqa: F401 ChatCompletionOutput, # noqa: F401 ChatCompletionOutputComplete, # noqa: F401 @@ -822,6 +828,7 @@ def __dir__(): ChatCompletionStreamOutputLogprob, # noqa: F401 
ChatCompletionStreamOutputLogprobs, # noqa: F401 ChatCompletionStreamOutputTopLogprob, # noqa: F401 + ChatCompletionStreamOutputUsage, # noqa: F401 DepthEstimationInput, # noqa: F401 DepthEstimationOutput, # noqa: F401 DocumentQuestionAnsweringInput, # noqa: F401 @@ -856,9 +863,9 @@ def __dir__(): QuestionAnsweringParameters, # noqa: F401 SentenceSimilarityInput, # noqa: F401 SentenceSimilarityInputData, # noqa: F401 - SummarizationGenerationParameters, # noqa: F401 SummarizationInput, # noqa: F401 SummarizationOutput, # noqa: F401 + SummarizationParameters, # noqa: F401 TableQuestionAnsweringInput, # noqa: F401 TableQuestionAnsweringInputData, # noqa: F401 TableQuestionAnsweringOutputElement, # noqa: F401 @@ -887,12 +894,17 @@ def __dir__(): TextToImageOutput, # noqa: F401 TextToImageParameters, # noqa: F401 TextToImageTargetSize, # noqa: F401 + TextToSpeechGenerationParameters, # noqa: F401 + TextToSpeechInput, # noqa: F401 + TextToSpeechOutput, # noqa: F401 + TextToSpeechParameters, # noqa: F401 TokenClassificationInput, # noqa: F401 TokenClassificationOutputElement, # noqa: F401 TokenClassificationParameters, # noqa: F401 - TranslationGenerationParameters, # noqa: F401 + ToolElement, # noqa: F401 TranslationInput, # noqa: F401 TranslationOutput, # noqa: F401 + TranslationParameters, # noqa: F401 VideoClassificationInput, # noqa: F401 VideoClassificationOutputElement, # noqa: F401 VideoClassificationParameters, # noqa: F401 diff --git a/src/huggingface_hub/inference/_client.py b/src/huggingface_hub/inference/_client.py index f642176675..85f680355d 100644 --- a/src/huggingface_hub/inference/_client.py +++ b/src/huggingface_hub/inference/_client.py @@ -77,8 +77,8 @@ AudioToAudioOutputElement, AutomaticSpeechRecognitionOutput, ChatCompletionInputGrammarType, - ChatCompletionInputTool, - ChatCompletionInputToolTypeClass, + ChatCompletionInputStreamOptions, + ChatCompletionInputToolType, ChatCompletionOutput, ChatCompletionStreamOutput, DocumentQuestionAnsweringOutputElement, @@ -95,6 +95,7 @@ TextGenerationOutput, TextGenerationStreamOutput, TokenClassificationOutputElement, + ToolElement, TranslationOutput, VisualQuestionAnsweringOutputElement, ZeroShotClassificationOutputElement, @@ -452,7 +453,7 @@ def automatic_speech_recognition( @overload def chat_completion( # type: ignore self, - messages: List[Dict[str, str]], + messages: List[Dict], *, model: Optional[str] = None, stream: Literal[False] = False, @@ -465,10 +466,11 @@ def chat_completion( # type: ignore response_format: Optional[ChatCompletionInputGrammarType] = None, seed: Optional[int] = None, stop: Optional[List[str]] = None, + stream_options: Optional[ChatCompletionInputStreamOptions] = None, temperature: Optional[float] = None, - tool_choice: Optional[Union[ChatCompletionInputToolTypeClass, str]] = None, + tool_choice: Optional[Union[ChatCompletionInputToolType, str]] = None, tool_prompt: Optional[str] = None, - tools: Optional[List[ChatCompletionInputTool]] = None, + tools: Optional[List[ToolElement]] = None, top_logprobs: Optional[int] = None, top_p: Optional[float] = None, ) -> ChatCompletionOutput: ... 
@@ -476,7 +478,7 @@ def chat_completion( # type: ignore @overload def chat_completion( # type: ignore self, - messages: List[Dict[str, str]], + messages: List[Dict], *, model: Optional[str] = None, stream: Literal[True] = True, @@ -489,10 +491,11 @@ def chat_completion( # type: ignore response_format: Optional[ChatCompletionInputGrammarType] = None, seed: Optional[int] = None, stop: Optional[List[str]] = None, + stream_options: Optional[ChatCompletionInputStreamOptions] = None, temperature: Optional[float] = None, - tool_choice: Optional[Union[ChatCompletionInputToolTypeClass, str]] = None, + tool_choice: Optional[Union[ChatCompletionInputToolType, str]] = None, tool_prompt: Optional[str] = None, - tools: Optional[List[ChatCompletionInputTool]] = None, + tools: Optional[List[ToolElement]] = None, top_logprobs: Optional[int] = None, top_p: Optional[float] = None, ) -> Iterable[ChatCompletionStreamOutput]: ... @@ -500,7 +503,7 @@ def chat_completion( # type: ignore @overload def chat_completion( self, - messages: List[Dict[str, str]], + messages: List[Dict], *, model: Optional[str] = None, stream: bool = False, @@ -513,17 +516,18 @@ def chat_completion( response_format: Optional[ChatCompletionInputGrammarType] = None, seed: Optional[int] = None, stop: Optional[List[str]] = None, + stream_options: Optional[ChatCompletionInputStreamOptions] = None, temperature: Optional[float] = None, - tool_choice: Optional[Union[ChatCompletionInputToolTypeClass, str]] = None, + tool_choice: Optional[Union[ChatCompletionInputToolType, str]] = None, tool_prompt: Optional[str] = None, - tools: Optional[List[ChatCompletionInputTool]] = None, + tools: Optional[List[ToolElement]] = None, top_logprobs: Optional[int] = None, top_p: Optional[float] = None, ) -> Union[ChatCompletionOutput, Iterable[ChatCompletionStreamOutput]]: ... def chat_completion( self, - messages: List[Dict[str, str]], + messages: List[Dict], *, model: Optional[str] = None, stream: bool = False, @@ -537,10 +541,11 @@ def chat_completion( response_format: Optional[ChatCompletionInputGrammarType] = None, seed: Optional[int] = None, stop: Optional[List[str]] = None, + stream_options: Optional[ChatCompletionInputStreamOptions] = None, temperature: Optional[float] = None, - tool_choice: Optional[Union[ChatCompletionInputToolTypeClass, str]] = None, + tool_choice: Optional[Union[ChatCompletionInputToolType, str]] = None, tool_prompt: Optional[str] = None, - tools: Optional[List[ChatCompletionInputTool]] = None, + tools: Optional[List[ToolElement]] = None, top_logprobs: Optional[int] = None, top_p: Optional[float] = None, ) -> Union[ChatCompletionOutput, Iterable[ChatCompletionStreamOutput]]: @@ -557,7 +562,7 @@ def chat_completion( Args: - messages (List[Union[`SystemMessage`, `UserMessage`, `AssistantMessage`]]): + messages (List of [`ChatCompletionInputMessage`]): Conversation history consisting of roles and content pairs. model (`str`, *optional*): The model to use for chat-completion. Can be a model ID hosted on the Hugging Face Hub or a URL to a deployed @@ -594,6 +599,8 @@ def chat_completion( Defaults to None. stream (`bool`, *optional*): Enable realtime streaming of responses. Defaults to False. + stream_options ([`ChatCompletionInputStreamOptions`], *optional*): + Options for streaming completions. temperature (`float`, *optional*): Controls randomness of the generations. Lower values ensure less random completions. Range: [0, 2]. Defaults to 1.0. 
@@ -604,11 +611,11 @@ def chat_completion(
             top_p (`float`, *optional*):
                 Fraction of the most likely next words to sample from. Must be between 0 and 1. Defaults to 1.0.
-            tool_choice ([`ChatCompletionInputToolTypeClass`] or `str`, *optional*):
+            tool_choice ([`ChatCompletionInputToolType`] or `str`, *optional*):
                 The tool to use for the completion. Defaults to "auto".
             tool_prompt (`str`, *optional*):
                 A prompt to be appended before the tools.
-            tools (List of [`ChatCompletionInputTool`], *optional*):
+            tools (List of [`ToolElement`], *optional*):
                 A list of tools the model may call. Currently, only functions are supported as a tool. Use this to
                 provide a list of functions the model may generate JSON inputs for.
 
@@ -658,7 +665,7 @@ def chat_completion(
         )
         ```
 
-        Example (stream=True):
+        Example using streaming:
         ```py
         >>> from huggingface_hub import InferenceClient
         >>> messages = [{"role": "user", "content": "What is the capital of France?"}]
@@ -696,6 +703,40 @@ def chat_completion(
                 print(chunk.choices[0].delta.content)
         ```
 
+        Example using Image + Text as input:
+        ```py
+        >>> import base64
+        >>> from huggingface_hub import InferenceClient
+
+        # provide a remote URL
+        >>> image_url = "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg"
+        # or a base64-encoded image
+        >>> image_path = "/path/to/image.jpeg"
+        >>> with open(image_path, "rb") as f:
+        ...     base64_image = base64.b64encode(f.read()).decode("utf-8")
+        >>> image_url = f"data:image/jpeg;base64,{base64_image}"
+
+        >>> client = InferenceClient("HuggingFaceM4/idefics2-8b-chatty")
+        >>> output = client.chat.completions.create(
+        ...     messages=[
+        ...         {
+        ...             "role": "user",
+        ...             "content": [
+        ...                 {
+        ...                     "type": "image_url",
+        ...                     "image_url": {"url": image_url},
+        ...                 },
+        ...                 {
+        ...                     "type": "text",
+        ...                     "text": "Describe this image in one sentence.",
+        ...                 },
+        ...             ],
+        ...         },
+        ...     ],
+        ... )
+        >>> output
+        A determined figure of Lady Liberty stands tall, holding a torch aloft, atop a pedestal on an island.
+ ``` + Example using tools: ```py >>> client = InferenceClient("meta-llama/Meta-Llama-3-70B-Instruct") @@ -837,6 +878,7 @@ def chat_completion( top_logprobs=top_logprobs, top_p=top_p, stream=stream, + stream_options=stream_options, ) payload = {key: value for key, value in payload.items() if value is not None} data = self.post(model=model_url, json=payload, stream=stream) @@ -1094,7 +1136,7 @@ def image_segmentation( response = self.post(data=image, model=model, task="image-segmentation") output = ImageSegmentationOutputElement.parse_obj_as_list(response) for item in output: - item.mask = _b64_to_image(item.mask) + item.mask = _b64_to_image(item.mask) # type: ignore [assignment] return output def image_to_image( diff --git a/src/huggingface_hub/inference/_generated/_async_client.py b/src/huggingface_hub/inference/_generated/_async_client.py index 095cb376a6..c5e1cbbcab 100644 --- a/src/huggingface_hub/inference/_generated/_async_client.py +++ b/src/huggingface_hub/inference/_generated/_async_client.py @@ -64,8 +64,8 @@ AudioToAudioOutputElement, AutomaticSpeechRecognitionOutput, ChatCompletionInputGrammarType, - ChatCompletionInputTool, - ChatCompletionInputToolTypeClass, + ChatCompletionInputStreamOptions, + ChatCompletionInputToolType, ChatCompletionOutput, ChatCompletionStreamOutput, DocumentQuestionAnsweringOutputElement, @@ -82,6 +82,7 @@ TextGenerationOutput, TextGenerationStreamOutput, TokenClassificationOutputElement, + ToolElement, TranslationOutput, VisualQuestionAnsweringOutputElement, ZeroShotClassificationOutputElement, @@ -487,7 +488,7 @@ async def automatic_speech_recognition( @overload async def chat_completion( # type: ignore self, - messages: List[Dict[str, str]], + messages: List[Dict], *, model: Optional[str] = None, stream: Literal[False] = False, @@ -500,10 +501,11 @@ async def chat_completion( # type: ignore response_format: Optional[ChatCompletionInputGrammarType] = None, seed: Optional[int] = None, stop: Optional[List[str]] = None, + stream_options: Optional[ChatCompletionInputStreamOptions] = None, temperature: Optional[float] = None, - tool_choice: Optional[Union[ChatCompletionInputToolTypeClass, str]] = None, + tool_choice: Optional[Union[ChatCompletionInputToolType, str]] = None, tool_prompt: Optional[str] = None, - tools: Optional[List[ChatCompletionInputTool]] = None, + tools: Optional[List[ToolElement]] = None, top_logprobs: Optional[int] = None, top_p: Optional[float] = None, ) -> ChatCompletionOutput: ... @@ -511,7 +513,7 @@ async def chat_completion( # type: ignore @overload async def chat_completion( # type: ignore self, - messages: List[Dict[str, str]], + messages: List[Dict], *, model: Optional[str] = None, stream: Literal[True] = True, @@ -524,10 +526,11 @@ async def chat_completion( # type: ignore response_format: Optional[ChatCompletionInputGrammarType] = None, seed: Optional[int] = None, stop: Optional[List[str]] = None, + stream_options: Optional[ChatCompletionInputStreamOptions] = None, temperature: Optional[float] = None, - tool_choice: Optional[Union[ChatCompletionInputToolTypeClass, str]] = None, + tool_choice: Optional[Union[ChatCompletionInputToolType, str]] = None, tool_prompt: Optional[str] = None, - tools: Optional[List[ChatCompletionInputTool]] = None, + tools: Optional[List[ToolElement]] = None, top_logprobs: Optional[int] = None, top_p: Optional[float] = None, ) -> AsyncIterable[ChatCompletionStreamOutput]: ... 
@@ -535,7 +538,7 @@ async def chat_completion( # type: ignore @overload async def chat_completion( self, - messages: List[Dict[str, str]], + messages: List[Dict], *, model: Optional[str] = None, stream: bool = False, @@ -548,17 +551,18 @@ async def chat_completion( response_format: Optional[ChatCompletionInputGrammarType] = None, seed: Optional[int] = None, stop: Optional[List[str]] = None, + stream_options: Optional[ChatCompletionInputStreamOptions] = None, temperature: Optional[float] = None, - tool_choice: Optional[Union[ChatCompletionInputToolTypeClass, str]] = None, + tool_choice: Optional[Union[ChatCompletionInputToolType, str]] = None, tool_prompt: Optional[str] = None, - tools: Optional[List[ChatCompletionInputTool]] = None, + tools: Optional[List[ToolElement]] = None, top_logprobs: Optional[int] = None, top_p: Optional[float] = None, ) -> Union[ChatCompletionOutput, AsyncIterable[ChatCompletionStreamOutput]]: ... async def chat_completion( self, - messages: List[Dict[str, str]], + messages: List[Dict], *, model: Optional[str] = None, stream: bool = False, @@ -572,10 +576,11 @@ async def chat_completion( response_format: Optional[ChatCompletionInputGrammarType] = None, seed: Optional[int] = None, stop: Optional[List[str]] = None, + stream_options: Optional[ChatCompletionInputStreamOptions] = None, temperature: Optional[float] = None, - tool_choice: Optional[Union[ChatCompletionInputToolTypeClass, str]] = None, + tool_choice: Optional[Union[ChatCompletionInputToolType, str]] = None, tool_prompt: Optional[str] = None, - tools: Optional[List[ChatCompletionInputTool]] = None, + tools: Optional[List[ToolElement]] = None, top_logprobs: Optional[int] = None, top_p: Optional[float] = None, ) -> Union[ChatCompletionOutput, AsyncIterable[ChatCompletionStreamOutput]]: @@ -592,7 +597,7 @@ async def chat_completion( Args: - messages (List[Union[`SystemMessage`, `UserMessage`, `AssistantMessage`]]): + messages (List of [`ChatCompletionInputMessage`]): Conversation history consisting of roles and content pairs. model (`str`, *optional*): The model to use for chat-completion. Can be a model ID hosted on the Hugging Face Hub or a URL to a deployed @@ -629,6 +634,8 @@ async def chat_completion( Defaults to None. stream (`bool`, *optional*): Enable realtime streaming of responses. Defaults to False. + stream_options ([`ChatCompletionInputStreamOptions`], *optional*): + Options for streaming completions. temperature (`float`, *optional*): Controls randomness of the generations. Lower values ensure less random completions. Range: [0, 2]. Defaults to 1.0. @@ -639,11 +646,11 @@ async def chat_completion( top_p (`float`, *optional*): Fraction of the most likely next words to sample from. Must be between 0 and 1. Defaults to 1.0. - tool_choice ([`ChatCompletionInputToolTypeClass`] or `str`, *optional*): + tool_choice ([`ChatCompletionInputToolType`] or `str`, *optional*): The tool to use for the completion. Defaults to "auto". tool_prompt (`str`, *optional*): A prompt to be appended before the tools. - tools (List of [`ChatCompletionInputTool`], *optional*): + tools (List of [`ToolElement`], *optional*): A list of tools the model may call. Currently, only functions are supported as a tool. Use this to provide a list of functions the model may generate JSON inputs for. 
@@ -694,7 +701,7 @@ async def chat_completion(
         )
         ```
 
-        Example (stream=True):
+        Example using streaming:
         ```py
         # Must be run in an async context
         >>> from huggingface_hub import AsyncInferenceClient
@@ -734,6 +741,41 @@ async def chat_completion(
                 print(chunk.choices[0].delta.content)
         ```
 
+        Example using Image + Text as input:
+        ```py
+        # Must be run in an async context
+        >>> import base64
+        >>> from huggingface_hub import AsyncInferenceClient
+
+        # provide a remote URL
+        >>> image_url = "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg"
+        # or a base64-encoded image
+        >>> image_path = "/path/to/image.jpeg"
+        >>> with open(image_path, "rb") as f:
+        ...     base64_image = base64.b64encode(f.read()).decode("utf-8")
+        >>> image_url = f"data:image/jpeg;base64,{base64_image}"
+
+        >>> client = AsyncInferenceClient("HuggingFaceM4/idefics2-8b-chatty")
+        >>> output = await client.chat.completions.create(
+        ...     messages=[
+        ...         {
+        ...             "role": "user",
+        ...             "content": [
+        ...                 {
+        ...                     "type": "image_url",
+        ...                     "image_url": {"url": image_url},
+        ...                 },
+        ...                 {
+        ...                     "type": "text",
+        ...                     "text": "Describe this image in one sentence.",
+        ...                 },
+        ...             ],
+        ...         },
+        ...     ],
+        ... )
+        >>> output
+        A determined figure of Lady Liberty stands tall, holding a torch aloft, atop a pedestal on an island.
+        ```
+
         Example using tools:
         ```py
         # Must be run in an async context
@@ -877,6 +919,7 @@ async def chat_completion(
             top_logprobs=top_logprobs,
             top_p=top_p,
             stream=stream,
+            stream_options=stream_options,
         )
         payload = {key: value for key, value in payload.items() if value is not None}
         data = await self.post(model=model_url, json=payload, stream=stream)
@@ -1139,7 +1182,7 @@ async def image_segmentation(
         response = await self.post(data=image, model=model, task="image-segmentation")
         output = ImageSegmentationOutputElement.parse_obj_as_list(response)
         for item in output:
-            item.mask = _b64_to_image(item.mask)
+            item.mask = _b64_to_image(item.mask)  # type: ignore [assignment]
         return output
 
     async def image_to_image(
diff --git a/src/huggingface_hub/inference/_generated/types/__init__.py b/src/huggingface_hub/inference/_generated/types/__init__.py
index db2793be23..057a491f46 100644
--- a/src/huggingface_hub/inference/_generated/types/__init__.py
+++ b/src/huggingface_hub/inference/_generated/types/__init__.py
@@ -24,8 +24,8 @@
     ChatCompletionInputGrammarType,
     ChatCompletionInputMessage,
     ChatCompletionInputMessageChunk,
-    ChatCompletionInputTool,
-    ChatCompletionInputToolTypeClass,
+    ChatCompletionInputStreamOptions,
+    ChatCompletionInputToolType,
     ChatCompletionInputURL,
     ChatCompletionOutput,
     ChatCompletionOutputComplete,
@@ -44,6 +44,8 @@
     ChatCompletionStreamOutputLogprob,
     ChatCompletionStreamOutputLogprobs,
     ChatCompletionStreamOutputTopLogprob,
+    ChatCompletionStreamOutputUsage,
+    ToolElement,
 )
 from .depth_estimation import DepthEstimationInput, DepthEstimationOutput
 from .document_question_answering import (
@@ -75,7 +77,7 @@
     QuestionAnsweringParameters,
 )
 from .sentence_similarity import SentenceSimilarityInput, SentenceSimilarityInputData
-from .summarization import SummarizationGenerationParameters, SummarizationInput, SummarizationOutput
+from .summarization import SummarizationInput, SummarizationOutput, SummarizationParameters
 from .table_question_answering import (
     TableQuestionAnsweringInput,
     TableQuestionAnsweringInputData,
     TableQuestionAnsweringOutputElement,
@@ -98,12 +100,18 @@
 )
 from .text_to_audio import TextToAudioGenerationParameters, TextToAudioInput, TextToAudioOutput, TextToAudioParameters
 from .text_to_image import TextToImageInput, TextToImageOutput, TextToImageParameters, TextToImageTargetSize
+from .text_to_speech import (
+    TextToSpeechGenerationParameters,
+    TextToSpeechInput,
+    TextToSpeechOutput,
+    TextToSpeechParameters,
+)
 from .token_classification import (
     TokenClassificationInput,
     TokenClassificationOutputElement,
     TokenClassificationParameters,
 )
-from .translation import TranslationGenerationParameters, TranslationInput, TranslationOutput
+from .translation import TranslationInput, TranslationOutput, TranslationParameters
 from .video_classification import (
     VideoClassificationInput,
     VideoClassificationOutputElement,
diff --git a/src/huggingface_hub/inference/_generated/types/audio_classification.py b/src/huggingface_hub/inference/_generated/types/audio_classification.py
index 914ba44960..f828c980cb 100644
--- a/src/huggingface_hub/inference/_generated/types/audio_classification.py
+++ b/src/huggingface_hub/inference/_generated/types/audio_classification.py
@@ -4,7 +4,7 @@
 # - script: https://github.com/huggingface/huggingface.js/blob/main/packages/tasks/scripts/inference-codegen.ts
 # - specs: https://github.com/huggingface/huggingface.js/tree/main/packages/tasks/src/tasks.
 from dataclasses import dataclass
-from typing import Any, Literal, Optional
+from typing import Literal, Optional
 
 from .base import BaseInferenceType
 
@@ -27,8 +27,10 @@ class AudioClassificationParameters(BaseInferenceType):
 class AudioClassificationInput(BaseInferenceType):
     """Inputs for Audio Classification inference"""
 
-    inputs: Any
-    """The input audio data"""
+    inputs: str
+    """The input audio data as a base64-encoded string. If no `parameters` are provided, you can
+    also provide the audio data as a raw bytes payload.
+    """
     parameters: Optional[AudioClassificationParameters] = None
     """Additional inference parameters"""
 
diff --git a/src/huggingface_hub/inference/_generated/types/automatic_speech_recognition.py b/src/huggingface_hub/inference/_generated/types/automatic_speech_recognition.py
index 24a5238ab6..29323bf2a9 100644
--- a/src/huggingface_hub/inference/_generated/types/automatic_speech_recognition.py
+++ b/src/huggingface_hub/inference/_generated/types/automatic_speech_recognition.py
@@ -4,7 +4,7 @@
 # - script: https://github.com/huggingface/huggingface.js/blob/main/packages/tasks/scripts/inference-codegen.ts
 # - specs: https://github.com/huggingface/huggingface.js/tree/main/packages/tasks/src/tasks.
 from dataclasses import dataclass
-from typing import Any, List, Literal, Optional, Union
+from typing import List, Literal, Optional, Union
 
 from .base import BaseInferenceType
 
@@ -90,8 +90,10 @@ class AutomaticSpeechRecognitionParameters(BaseInferenceType):
 class AutomaticSpeechRecognitionInput(BaseInferenceType):
     """Inputs for Automatic Speech Recognition inference"""
 
-    inputs: Any
-    """The input audio data"""
+    inputs: str
+    """The input audio data as a base64-encoded string. If no `parameters` are provided, you can
+    also provide the audio data as a raw bytes payload.
+ """ parameters: Optional[AutomaticSpeechRecognitionParameters] = None """Additional inference parameters""" diff --git a/src/huggingface_hub/inference/_generated/types/chat_completion.py b/src/huggingface_hub/inference/_generated/types/chat_completion.py index fa6e373140..7a1f297e4f 100644 --- a/src/huggingface_hub/inference/_generated/types/chat_completion.py +++ b/src/huggingface_hub/inference/_generated/types/chat_completion.py @@ -44,13 +44,23 @@ class ChatCompletionInputGrammarType(BaseInferenceType): """ +@dataclass +class ChatCompletionInputStreamOptions(BaseInferenceType): + include_usage: bool + """If set, an additional chunk will be streamed before the data: [DONE] message. The usage + field on this chunk shows the token usage statistics for the entire request, and the + choices field will always be an empty array. All other chunks will also include a usage + field, but with a null value. + """ + + @dataclass class ChatCompletionInputFunctionName(BaseInferenceType): name: str @dataclass -class ChatCompletionInputToolTypeClass(BaseInferenceType): +class ChatCompletionInputToolType(BaseInferenceType): function: Optional[ChatCompletionInputFunctionName] = None @@ -62,7 +72,7 @@ class ChatCompletionInputFunctionDefinition(BaseInferenceType): @dataclass -class ChatCompletionInputTool(BaseInferenceType): +class ToolElement(BaseInferenceType): function: ChatCompletionInputFunctionDefinition type: str @@ -121,16 +131,17 @@ class ChatCompletionInput(BaseInferenceType): stop: Optional[List[str]] = None """Up to 4 sequences where the API will stop generating further tokens.""" stream: Optional[bool] = None + stream_options: Optional[ChatCompletionInputStreamOptions] = None temperature: Optional[float] = None """What sampling temperature to use, between 0 and 2. Higher values like 0.8 will make the output more random, while lower values like 0.2 will make it more focused and deterministic. We generally recommend altering this or `top_p` but not both. """ - tool_choice: Optional[Union[ChatCompletionInputToolTypeClass, str]] = None + tool_choice: Optional[Union[ChatCompletionInputToolType, str]] = None tool_prompt: Optional[str] = None """A prompt to be appended before the tools""" - tools: Optional[List[ChatCompletionInputTool]] = None + tools: Optional[List[ToolElement]] = None """A list of tools the model may call. Currently, only functions are supported as a tool. Use this to provide a list of functions the model may generate JSON inputs for. @@ -265,6 +276,13 @@ class ChatCompletionStreamOutputChoice(BaseInferenceType): logprobs: Optional[ChatCompletionStreamOutputLogprobs] = None +@dataclass +class ChatCompletionStreamOutputUsage(BaseInferenceType): + completion_tokens: int + prompt_tokens: int + total_tokens: int + + @dataclass class ChatCompletionStreamOutput(BaseInferenceType): """Chat Completion Stream Output. 
@@ -278,3 +296,4 @@ class ChatCompletionStreamOutput(BaseInferenceType): id: str model: str system_fingerprint: str + usage: Optional[ChatCompletionStreamOutputUsage] = None diff --git a/src/huggingface_hub/inference/_generated/types/image_classification.py b/src/huggingface_hub/inference/_generated/types/image_classification.py index fd52db005a..91b24d2c0b 100644 --- a/src/huggingface_hub/inference/_generated/types/image_classification.py +++ b/src/huggingface_hub/inference/_generated/types/image_classification.py @@ -4,7 +4,7 @@ # - script: https://github.com/huggingface/huggingface.js/blob/main/packages/tasks/scripts/inference-codegen.ts # - specs: https://github.com/huggingface/huggingface.js/tree/main/packages/tasks/src/tasks. from dataclasses import dataclass -from typing import Any, Literal, Optional +from typing import Literal, Optional from .base import BaseInferenceType @@ -27,8 +27,10 @@ class ImageClassificationParameters(BaseInferenceType): class ImageClassificationInput(BaseInferenceType): """Inputs for Image Classification inference""" - inputs: Any - """The input image data""" + inputs: str + """The input image data as a base64-encoded string. If no `parameters` are provided, you can + also provide the image data as a raw bytes payload. + """ parameters: Optional[ImageClassificationParameters] = None """Additional inference parameters""" diff --git a/src/huggingface_hub/inference/_generated/types/image_segmentation.py b/src/huggingface_hub/inference/_generated/types/image_segmentation.py index 67dd7c28b3..25781059ab 100644 --- a/src/huggingface_hub/inference/_generated/types/image_segmentation.py +++ b/src/huggingface_hub/inference/_generated/types/image_segmentation.py @@ -4,7 +4,7 @@ # - script: https://github.com/huggingface/huggingface.js/blob/main/packages/tasks/scripts/inference-codegen.ts # - specs: https://github.com/huggingface/huggingface.js/tree/main/packages/tasks/src/tasks. from dataclasses import dataclass -from typing import Any, Literal, Optional +from typing import Literal, Optional from .base import BaseInferenceType @@ -32,8 +32,10 @@ class ImageSegmentationParameters(BaseInferenceType): class ImageSegmentationInput(BaseInferenceType): """Inputs for Image Segmentation inference""" - inputs: Any - """The input image data""" + inputs: str + """The input image data as a base64-encoded string. If no `parameters` are provided, you can + also provide the image data as a raw bytes payload. 
+    """
     parameters: Optional[ImageSegmentationParameters] = None
     """Additional inference parameters"""
 
@@ -45,8 +47,8 @@ class ImageSegmentationOutputElement(BaseInferenceType):
     """
 
     label: str
-    """The label of the predicted segment"""
-    mask: Any
-    """The corresponding mask as a black-and-white image"""
+    """The label of the predicted segment."""
+    mask: str
+    """The corresponding mask as a black-and-white image (base64-encoded)."""
     score: Optional[float] = None
-    """The score or confidence degreee the model has"""
+    """The score or confidence degree the model has."""
diff --git a/src/huggingface_hub/inference/_generated/types/image_to_image.py b/src/huggingface_hub/inference/_generated/types/image_to_image.py
index 8c208ede6f..3bfe2983e3 100644
--- a/src/huggingface_hub/inference/_generated/types/image_to_image.py
+++ b/src/huggingface_hub/inference/_generated/types/image_to_image.py
@@ -11,7 +11,7 @@
 
 @dataclass
 class ImageToImageTargetSize(BaseInferenceType):
-    """The size in pixel of the output image"""
+    """The size in pixels of the output image."""
 
     height: int
     width: int
@@ -34,15 +34,17 @@ class ImageToImageParameters(BaseInferenceType):
         a higher quality image at the expense of slower inference.
     """
     target_size: Optional[ImageToImageTargetSize] = None
-    """The size in pixel of the output image"""
+    """The size in pixels of the output image."""
 
 
 @dataclass
 class ImageToImageInput(BaseInferenceType):
     """Inputs for Image To Image inference"""
 
-    inputs: Any
-    """The input image data"""
+    inputs: str
+    """The input image data as a base64-encoded string. If no `parameters` are provided, you can
+    also provide the image data as a raw bytes payload.
+    """
     parameters: Optional[ImageToImageParameters] = None
     """Additional inference parameters"""
 
@@ -52,4 +54,4 @@ class ImageToImageOutput(BaseInferenceType):
     """Outputs of inference for the Image To Image task"""
 
     image: Any
-    """The output image"""
+    """The output image returned as raw bytes in the payload."""
diff --git a/src/huggingface_hub/inference/_generated/types/object_detection.py b/src/huggingface_hub/inference/_generated/types/object_detection.py
index 42b03a841b..22d066268d 100644
--- a/src/huggingface_hub/inference/_generated/types/object_detection.py
+++ b/src/huggingface_hub/inference/_generated/types/object_detection.py
@@ -4,7 +4,7 @@
 # - script: https://github.com/huggingface/huggingface.js/blob/main/packages/tasks/scripts/inference-codegen.ts
 # - specs: https://github.com/huggingface/huggingface.js/tree/main/packages/tasks/src/tasks.
 from dataclasses import dataclass
-from typing import Any, Optional
+from typing import Optional
 
 from .base import BaseInferenceType
 
@@ -23,8 +23,10 @@ class ObjectDetectionParameters(BaseInferenceType):
 class ObjectDetectionInput(BaseInferenceType):
     """Inputs for Object Detection inference"""
 
-    inputs: Any
-    """The input image data"""
+    inputs: str
+    """The input image data as a base64-encoded string. If no `parameters` are provided, you can
+    also provide the image data as a raw bytes payload.
+ """ parameters: Optional[ObjectDetectionParameters] = None """Additional inference parameters""" @@ -36,9 +38,13 @@ class ObjectDetectionBoundingBox(BaseInferenceType): """ xmax: int + """The x-coordinate of the bottom-right corner of the bounding box.""" xmin: int + """The x-coordinate of the top-left corner of the bounding box.""" ymax: int + """The y-coordinate of the bottom-right corner of the bounding box.""" ymin: int + """The y-coordinate of the top-left corner of the bounding box.""" @dataclass @@ -50,6 +56,6 @@ class ObjectDetectionOutputElement(BaseInferenceType): image. """ label: str - """The predicted label for the bounding box""" + """The predicted label for the bounding box.""" score: float - """The associated score / probability""" + """The associated score / probability.""" diff --git a/src/huggingface_hub/inference/_generated/types/summarization.py b/src/huggingface_hub/inference/_generated/types/summarization.py index a6a00e5326..7bc546b4cb 100644 --- a/src/huggingface_hub/inference/_generated/types/summarization.py +++ b/src/huggingface_hub/inference/_generated/types/summarization.py @@ -9,33 +9,31 @@ from .base import BaseInferenceType -SummarizationGenerationTruncationStrategy = Literal["do_not_truncate", "longest_first", "only_first", "only_second"] +SummarizationTruncationStrategy = Literal["do_not_truncate", "longest_first", "only_first", "only_second"] @dataclass -class SummarizationGenerationParameters(BaseInferenceType): - """Additional inference parameters - Additional inference parameters for Text2text Generation +class SummarizationParameters(BaseInferenceType): + """Additional inference parameters. + Additional inference parameters for summarization. """ clean_up_tokenization_spaces: Optional[bool] = None """Whether to clean up the potential extra spaces in the text output.""" generate_parameters: Optional[Dict[str, Any]] = None - """Additional parametrization of the text generation algorithm""" - truncation: Optional["SummarizationGenerationTruncationStrategy"] = None - """The truncation strategy to use""" + """Additional parametrization of the text generation algorithm.""" + truncation: Optional["SummarizationTruncationStrategy"] = None + """The truncation strategy to use.""" @dataclass class SummarizationInput(BaseInferenceType): - """Inputs for Summarization inference - Inputs for Text2text Generation inference - """ + """Inputs for Summarization inference""" inputs: str - """The input text data""" - parameters: Optional[SummarizationGenerationParameters] = None - """Additional inference parameters""" + """The input text to summarize.""" + parameters: Optional[SummarizationParameters] = None + """Additional inference parameters.""" @dataclass diff --git a/src/huggingface_hub/inference/_generated/types/text_generation.py b/src/huggingface_hub/inference/_generated/types/text_generation.py index 27c70c7e2b..5e902600d8 100644 --- a/src/huggingface_hub/inference/_generated/types/text_generation.py +++ b/src/huggingface_hub/inference/_generated/types/text_generation.py @@ -142,6 +142,7 @@ class TextGenerationOutput(BaseInferenceType): class TextGenerationStreamOutputStreamDetails(BaseInferenceType): finish_reason: "TextGenerationOutputFinishReason" generated_tokens: int + input_length: int seed: Optional[int] = None diff --git a/src/huggingface_hub/inference/_generated/types/text_to_image.py b/src/huggingface_hub/inference/_generated/types/text_to_image.py index 40e53ab016..97cbcdeb2c 100644 --- 
a/src/huggingface_hub/inference/_generated/types/text_to_image.py +++ b/src/huggingface_hub/inference/_generated/types/text_to_image.py @@ -24,17 +24,19 @@ class TextToImageParameters(BaseInferenceType): """ guidance_scale: Optional[float] = None - """For diffusion models. A higher guidance scale value encourages the model to generate - images closely linked to the text prompt at the expense of lower image quality. + """A higher guidance scale value encourages the model to generate images closely linked to + the text prompt, but values too high may cause saturation and other artifacts. """ negative_prompt: Optional[List[str]] = None """One or several prompt to guide what NOT to include in image generation.""" num_inference_steps: Optional[int] = None - """For diffusion models. The number of denoising steps. More denoising steps usually lead to - a higher quality image at the expense of slower inference. + """The number of denoising steps. More denoising steps usually lead to a higher quality + image at the expense of slower inference. """ scheduler: Optional[str] = None - """For diffusion models. Override the scheduler with a compatible one""" + """Override the scheduler with a compatible one.""" + seed: Optional[int] = None + """Seed for the random number generator.""" target_size: Optional[TextToImageTargetSize] = None """The size in pixel of the output image""" @@ -44,7 +46,7 @@ class TextToImageInput(BaseInferenceType): """Inputs for Text To Image inference""" inputs: str - """The input text data (sometimes called "prompt\"""" + """The input text data (sometimes called "prompt")""" parameters: Optional[TextToImageParameters] = None """Additional inference parameters""" @@ -54,4 +56,4 @@ class TextToImageOutput(BaseInferenceType): """Outputs of inference for the Text To Image task""" image: Any - """The generated image""" + """The generated image returned as raw bytes in the payload.""" diff --git a/src/huggingface_hub/inference/_generated/types/text_to_speech.py b/src/huggingface_hub/inference/_generated/types/text_to_speech.py new file mode 100644 index 0000000000..30e0b1d7d8 --- /dev/null +++ b/src/huggingface_hub/inference/_generated/types/text_to_speech.py @@ -0,0 +1,107 @@ +# Inference code generated from the JSON schema spec in @huggingface/tasks. +# +# See: +# - script: https://github.com/huggingface/huggingface.js/blob/main/packages/tasks/scripts/inference-codegen.ts +# - specs: https://github.com/huggingface/huggingface.js/tree/main/packages/tasks/src/tasks. +from dataclasses import dataclass +from typing import Any, Literal, Optional, Union + +from .base import BaseInferenceType + + +EarlyStoppingEnum = Literal["never"] + + +@dataclass +class TextToSpeechGenerationParameters(BaseInferenceType): + """Parametrization of the text generation process + Ad-hoc parametrization of the text generation process + """ + + do_sample: Optional[bool] = None + """Whether to use sampling instead of greedy decoding when generating new tokens.""" + early_stopping: Optional[Union[bool, "EarlyStoppingEnum"]] = None + """Controls the stopping condition for beam-based methods.""" + epsilon_cutoff: Optional[float] = None + """If set to float strictly between 0 and 1, only tokens with a conditional probability + greater than epsilon_cutoff will be sampled. In the paper, suggested values range from + 3e-4 to 9e-4, depending on the size of the model. See [Truncation Sampling as Language + Model Desmoothing](https://hf.co/papers/2210.15191) for more details. 
+    """
+    eta_cutoff: Optional[float] = None
+    """Eta sampling is a hybrid of locally typical sampling and epsilon sampling. If set to
+    float strictly between 0 and 1, a token is only considered if it is greater than either
+    eta_cutoff or sqrt(eta_cutoff) * exp(-entropy(softmax(next_token_logits))). The latter
+    term is intuitively the expected next token probability, scaled by sqrt(eta_cutoff). In
+    the paper, suggested values range from 3e-4 to 2e-3, depending on the size of the model.
+    See [Truncation Sampling as Language Model Desmoothing](https://hf.co/papers/2210.15191)
+    for more details.
+    """
+    max_length: Optional[int] = None
+    """The maximum length (in tokens) of the generated text, including the input."""
+    max_new_tokens: Optional[int] = None
+    """The maximum number of tokens to generate. Takes precedence over max_length."""
+    min_length: Optional[int] = None
+    """The minimum length (in tokens) of the generated text, including the input."""
+    min_new_tokens: Optional[int] = None
+    """The minimum number of tokens to generate. Takes precedence over min_length."""
+    num_beam_groups: Optional[int] = None
+    """Number of groups to divide num_beams into in order to ensure diversity among different
+    groups of beams. See [this paper](https://hf.co/papers/1610.02424) for more details.
+    """
+    num_beams: Optional[int] = None
+    """Number of beams to use for beam search."""
+    penalty_alpha: Optional[float] = None
+    """The value balances the model confidence and the degeneration penalty in contrastive
+    search decoding.
+    """
+    temperature: Optional[float] = None
+    """The value used to modulate the next token probabilities."""
+    top_k: Optional[int] = None
+    """The number of highest probability vocabulary tokens to keep for top-k-filtering."""
+    top_p: Optional[float] = None
+    """If set to float < 1, only the smallest set of most probable tokens with probabilities
+    that add up to top_p or higher are kept for generation.
+    """
+    typical_p: Optional[float] = None
+    """Local typicality measures how similar the conditional probability of predicting a target
+    token next is to the expected conditional probability of predicting a random token next,
+    given the partial text already generated. If set to float < 1, the smallest set of the
+    most locally typical tokens with probabilities that add up to typical_p or higher are
+    kept for generation. See [this paper](https://hf.co/papers/2202.00666) for more details.
+ """ + use_cache: Optional[bool] = None + """Whether the model should use the past last key/values attentions to speed up decoding""" + + +@dataclass +class TextToSpeechParameters(BaseInferenceType): + """Additional inference parameters + Additional inference parameters for Text To Speech + """ + + generate: Optional[TextToSpeechGenerationParameters] = None + """Parametrization of the text generation process""" + + +@dataclass +class TextToSpeechInput(BaseInferenceType): + """Inputs for Text To Speech inference""" + + inputs: str + """The input text data""" + parameters: Optional[TextToSpeechParameters] = None + """Additional inference parameters""" + + +@dataclass +class TextToSpeechOutput(BaseInferenceType): + """Outputs for Text to Speech inference + Outputs of inference for the Text To Audio task + """ + + audio: Any + """The generated audio waveform.""" + sampling_rate: Any + text_to_speech_output_sampling_rate: Optional[float] = None + """The sampling rate of the generated audio waveform.""" diff --git a/src/huggingface_hub/inference/_generated/types/translation.py b/src/huggingface_hub/inference/_generated/types/translation.py index e06ad2b72d..22e8f5cff0 100644 --- a/src/huggingface_hub/inference/_generated/types/translation.py +++ b/src/huggingface_hub/inference/_generated/types/translation.py @@ -9,32 +9,38 @@ from .base import BaseInferenceType -TranslationGenerationTruncationStrategy = Literal["do_not_truncate", "longest_first", "only_first", "only_second"] +TranslationTruncationStrategy = Literal["do_not_truncate", "longest_first", "only_first", "only_second"] @dataclass -class TranslationGenerationParameters(BaseInferenceType): +class TranslationParameters(BaseInferenceType): """Additional inference parameters - Additional inference parameters for Text2text Generation + Additional inference parameters for Translation """ clean_up_tokenization_spaces: Optional[bool] = None """Whether to clean up the potential extra spaces in the text output.""" generate_parameters: Optional[Dict[str, Any]] = None - """Additional parametrization of the text generation algorithm""" - truncation: Optional["TranslationGenerationTruncationStrategy"] = None - """The truncation strategy to use""" + """Additional parametrization of the text generation algorithm.""" + src_lang: Optional[str] = None + """The source language of the text. Required for models that can translate from multiple + languages. + """ + tgt_lang: Optional[str] = None + """Target language to translate to. Required for models that can translate to multiple + languages. + """ + truncation: Optional["TranslationTruncationStrategy"] = None + """The truncation strategy to use.""" @dataclass class TranslationInput(BaseInferenceType): - """Inputs for Translation inference - Inputs for Text2text Generation inference - """ + """Inputs for Translation inference""" inputs: str - """The input text data""" - parameters: Optional[TranslationGenerationParameters] = None + """The text to translate.""" + parameters: Optional[TranslationParameters] = None """Additional inference parameters"""
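
The renamed streaming types above can be exercised end to end. Below is a minimal sketch (not part of the diff) of requesting token-usage reporting with the new `stream_options` parameter; the model ID is just the one used in the docstring examples, and it assumes the backing endpoint supports usage reporting:

```py
# Hedged sketch: stream a chat completion and read the new `usage` field.
from huggingface_hub import ChatCompletionInputStreamOptions, InferenceClient

client = InferenceClient("meta-llama/Meta-Llama-3-70B-Instruct")
for chunk in client.chat_completion(
    messages=[{"role": "user", "content": "What is the capital of France?"}],
    max_tokens=50,
    stream=True,
    stream_options=ChatCompletionInputStreamOptions(include_usage=True),
):
    # Regular chunks carry a delta and `usage=None`; the extra chunk streamed
    # before `data: [DONE]` has empty `choices` and the token-usage statistics.
    if chunk.choices:
        print(chunk.choices[0].delta.content or "", end="")
    if chunk.usage is not None:
        print(f"\nprompt={chunk.usage.prompt_tokens} completion={chunk.usage.completion_tokens}")
```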
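
Similarly, the `SummarizationParameters`/`TranslationParameters` renames surface the new `src_lang`/`tgt_lang` fields. A small illustrative snippet (assumed usage, not taken from the PR) constructing a typed translation payload from the renamed dataclasses:

```py
# Hedged sketch: build a typed payload with the renamed translation dataclasses.
from huggingface_hub import TranslationInput, TranslationParameters

payload = TranslationInput(
    inputs="Hello, world!",
    parameters=TranslationParameters(
        src_lang="en",  # only needed for multilingual checkpoints
        tgt_lang="fr",
        clean_up_tokenization_spaces=True,
    ),
)
print(payload)
```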