@@ -59,6 +59,7 @@ def create(
5959 presence_penalty : Optional [float ] | NotGiven = NOT_GIVEN ,
6060 response_format : completion_create_params .ResponseFormat | NotGiven = NOT_GIVEN ,
6161 seed : Optional [int ] | NotGiven = NOT_GIVEN ,
62+ service_tier : Optional [Literal ["auto" , "default" ]] | NotGiven = NOT_GIVEN ,
6263 stop : Union [Optional [str ], List [str ]] | NotGiven = NOT_GIVEN ,
6364 stream : Optional [Literal [False ]] | NotGiven = NOT_GIVEN ,
6465 stream_options : Optional [ChatCompletionStreamOptionsParam ] | NotGiven = NOT_GIVEN ,
@@ -163,6 +164,16 @@ def create(
163164 should refer to the `system_fingerprint` response parameter to monitor changes
164165 in the backend.
165166
167+ service_tier: Specifies the latency tier to use for processing the request. This parameter is
168+ relevant for customers subscribed to the scale tier service:
169+
170+ - If set to 'auto', the system will utilize scale tier credits until they are
171+ exhausted.
172+ - If set to 'default', the request will be processed in the shared cluster.
173+
174+ When this parameter is set, the response body will include the `service_tier`
175+ utilized.
176+
166177 stop: Up to 4 sequences where the API will stop generating further tokens.
167178
168179 stream: If set, partial message deltas will be sent, like in ChatGPT. Tokens will be
@@ -236,6 +247,7 @@ def create(
236247 presence_penalty : Optional [float ] | NotGiven = NOT_GIVEN ,
237248 response_format : completion_create_params .ResponseFormat | NotGiven = NOT_GIVEN ,
238249 seed : Optional [int ] | NotGiven = NOT_GIVEN ,
250+ service_tier : Optional [Literal ["auto" , "default" ]] | NotGiven = NOT_GIVEN ,
239251 stop : Union [Optional [str ], List [str ]] | NotGiven = NOT_GIVEN ,
240252 stream_options : Optional [ChatCompletionStreamOptionsParam ] | NotGiven = NOT_GIVEN ,
241253 temperature : Optional [float ] | NotGiven = NOT_GIVEN ,
@@ -346,6 +358,16 @@ def create(
346358 should refer to the `system_fingerprint` response parameter to monitor changes
347359 in the backend.
348360
361+ service_tier: Specifies the latency tier to use for processing the request. This parameter is
362+ relevant for customers subscribed to the scale tier service:
363+
364+ - If set to 'auto', the system will utilize scale tier credits until they are
365+ exhausted.
366+ - If set to 'default', the request will be processed in the shared cluster.
367+
368+ When this parameter is set, the response body will include the `service_tier`
369+ utilized.
370+
349371 stop: Up to 4 sequences where the API will stop generating further tokens.
350372
351373 stream_options: Options for streaming response. Only set this when you set `stream: true`.
@@ -412,6 +434,7 @@ def create(
412434 presence_penalty : Optional [float ] | NotGiven = NOT_GIVEN ,
413435 response_format : completion_create_params .ResponseFormat | NotGiven = NOT_GIVEN ,
414436 seed : Optional [int ] | NotGiven = NOT_GIVEN ,
437+ service_tier : Optional [Literal ["auto" , "default" ]] | NotGiven = NOT_GIVEN ,
415438 stop : Union [Optional [str ], List [str ]] | NotGiven = NOT_GIVEN ,
416439 stream_options : Optional [ChatCompletionStreamOptionsParam ] | NotGiven = NOT_GIVEN ,
417440 temperature : Optional [float ] | NotGiven = NOT_GIVEN ,
@@ -522,6 +545,16 @@ def create(
522545 should refer to the `system_fingerprint` response parameter to monitor changes
523546 in the backend.
524547
548+ service_tier: Specifies the latency tier to use for processing the request. This parameter is
549+ relevant for customers subscribed to the scale tier service:
550+
551+ - If set to 'auto', the system will utilize scale tier credits until they are
552+ exhausted.
553+ - If set to 'default', the request will be processed in the shared cluster.
554+
555+ When this parameter is set, the response body will include the `service_tier`
556+ utilized.
557+
525558 stop: Up to 4 sequences where the API will stop generating further tokens.
526559
527560 stream_options: Options for streaming response. Only set this when you set `stream: true`.
@@ -587,6 +620,7 @@ def create(
587620 presence_penalty : Optional [float ] | NotGiven = NOT_GIVEN ,
588621 response_format : completion_create_params .ResponseFormat | NotGiven = NOT_GIVEN ,
589622 seed : Optional [int ] | NotGiven = NOT_GIVEN ,
623+ service_tier : Optional [Literal ["auto" , "default" ]] | NotGiven = NOT_GIVEN ,
590624 stop : Union [Optional [str ], List [str ]] | NotGiven = NOT_GIVEN ,
591625 stream : Optional [Literal [False ]] | Literal [True ] | NotGiven = NOT_GIVEN ,
592626 stream_options : Optional [ChatCompletionStreamOptionsParam ] | NotGiven = NOT_GIVEN ,
@@ -620,6 +654,7 @@ def create(
620654 "presence_penalty" : presence_penalty ,
621655 "response_format" : response_format ,
622656 "seed" : seed ,
657+ "service_tier" : service_tier ,
623658 "stop" : stop ,
624659 "stream" : stream ,
625660 "stream_options" : stream_options ,
@@ -667,6 +702,7 @@ async def create(
667702 presence_penalty : Optional [float ] | NotGiven = NOT_GIVEN ,
668703 response_format : completion_create_params .ResponseFormat | NotGiven = NOT_GIVEN ,
669704 seed : Optional [int ] | NotGiven = NOT_GIVEN ,
705+ service_tier : Optional [Literal ["auto" , "default" ]] | NotGiven = NOT_GIVEN ,
670706 stop : Union [Optional [str ], List [str ]] | NotGiven = NOT_GIVEN ,
671707 stream : Optional [Literal [False ]] | NotGiven = NOT_GIVEN ,
672708 stream_options : Optional [ChatCompletionStreamOptionsParam ] | NotGiven = NOT_GIVEN ,
@@ -771,6 +807,16 @@ async def create(
771807 should refer to the `system_fingerprint` response parameter to monitor changes
772808 in the backend.
773809
810+ service_tier: Specifies the latency tier to use for processing the request. This parameter is
811+ relevant for customers subscribed to the scale tier service:
812+
813+ - If set to 'auto', the system will utilize scale tier credits until they are
814+ exhausted.
815+ - If set to 'default', the request will be processed in the shared cluster.
816+
817+ When this parameter is set, the response body will include the `service_tier`
818+ utilized.
819+
774820 stop: Up to 4 sequences where the API will stop generating further tokens.
775821
776822 stream: If set, partial message deltas will be sent, like in ChatGPT. Tokens will be
@@ -844,6 +890,7 @@ async def create(
844890 presence_penalty : Optional [float ] | NotGiven = NOT_GIVEN ,
845891 response_format : completion_create_params .ResponseFormat | NotGiven = NOT_GIVEN ,
846892 seed : Optional [int ] | NotGiven = NOT_GIVEN ,
893+ service_tier : Optional [Literal ["auto" , "default" ]] | NotGiven = NOT_GIVEN ,
847894 stop : Union [Optional [str ], List [str ]] | NotGiven = NOT_GIVEN ,
848895 stream_options : Optional [ChatCompletionStreamOptionsParam ] | NotGiven = NOT_GIVEN ,
849896 temperature : Optional [float ] | NotGiven = NOT_GIVEN ,
@@ -954,6 +1001,16 @@ async def create(
9541001 should refer to the `system_fingerprint` response parameter to monitor changes
9551002 in the backend.
9561003
1004+ service_tier: Specifies the latency tier to use for processing the request. This parameter is
1005+ relevant for customers subscribed to the scale tier service:
1006+
1007+ - If set to 'auto', the system will utilize scale tier credits until they are
1008+ exhausted.
1009+ - If set to 'default', the request will be processed in the shared cluster.
1010+
1011+ When this parameter is set, the response body will include the `service_tier`
1012+ utilized.
1013+
9571014 stop: Up to 4 sequences where the API will stop generating further tokens.
9581015
9591016 stream_options: Options for streaming response. Only set this when you set `stream: true`.
@@ -1020,6 +1077,7 @@ async def create(
10201077 presence_penalty : Optional [float ] | NotGiven = NOT_GIVEN ,
10211078 response_format : completion_create_params .ResponseFormat | NotGiven = NOT_GIVEN ,
10221079 seed : Optional [int ] | NotGiven = NOT_GIVEN ,
1080+ service_tier : Optional [Literal ["auto" , "default" ]] | NotGiven = NOT_GIVEN ,
10231081 stop : Union [Optional [str ], List [str ]] | NotGiven = NOT_GIVEN ,
10241082 stream_options : Optional [ChatCompletionStreamOptionsParam ] | NotGiven = NOT_GIVEN ,
10251083 temperature : Optional [float ] | NotGiven = NOT_GIVEN ,
@@ -1130,6 +1188,16 @@ async def create(
11301188 should refer to the `system_fingerprint` response parameter to monitor changes
11311189 in the backend.
11321190
1191+ service_tier: Specifies the latency tier to use for processing the request. This parameter is
1192+ relevant for customers subscribed to the scale tier service:
1193+
1194+ - If set to 'auto', the system will utilize scale tier credits until they are
1195+ exhausted.
1196+ - If set to 'default', the request will be processed in the shared cluster.
1197+
1198+ When this parameter is set, the response body will include the `service_tier`
1199+ utilized.
1200+
11331201 stop: Up to 4 sequences where the API will stop generating further tokens.
11341202
11351203 stream_options: Options for streaming response. Only set this when you set `stream: true`.
@@ -1195,6 +1263,7 @@ async def create(
11951263 presence_penalty : Optional [float ] | NotGiven = NOT_GIVEN ,
11961264 response_format : completion_create_params .ResponseFormat | NotGiven = NOT_GIVEN ,
11971265 seed : Optional [int ] | NotGiven = NOT_GIVEN ,
1266+ service_tier : Optional [Literal ["auto" , "default" ]] | NotGiven = NOT_GIVEN ,
11981267 stop : Union [Optional [str ], List [str ]] | NotGiven = NOT_GIVEN ,
11991268 stream : Optional [Literal [False ]] | Literal [True ] | NotGiven = NOT_GIVEN ,
12001269 stream_options : Optional [ChatCompletionStreamOptionsParam ] | NotGiven = NOT_GIVEN ,
@@ -1228,6 +1297,7 @@ async def create(
12281297 "presence_penalty" : presence_penalty ,
12291298 "response_format" : response_format ,
12301299 "seed" : seed ,
1300+ "service_tier" : service_tier ,
12311301 "stop" : stop ,
12321302 "stream" : stream ,
12331303 "stream_options" : stream_options ,
0 commit comments