# SOFTWARE.

import asyncio
+ import re
+ import time
from typing import Coroutine, List, Optional, Union

+ import requests
import torch
from huggingface_hub import (
    AsyncInferenceClient,
    InferenceClient,
    InferenceEndpoint,
+     InferenceEndpointError,
    InferenceEndpointTimeoutError,
    TextGenerationInputGrammarType,
    TextGenerationOutput,
    create_inference_endpoint,
    get_inference_endpoint,
)
+ from huggingface_hub.utils import HfHubHTTPError
+ from requests import ConnectionError
from torch.utils.data import DataLoader
from tqdm import tqdm
from transformers import AutoTokenizer

BATCH_SIZE = 50
+ MAX_TIME_FOR_SPINUP = 3600
+
+ SORTED_INSTANCE_SIZES = [  # sorted by incremental overall RAM (to load models)
+     # type, size
+     ("nvidia-a10g", "x1"),
+     ("nvidia-t4", "x4"),
+     ("nvidia-a100", "x1"),
+     ("nvidia-a10g", "x4"),
+     ("nvidia-a100", "x2"),
+     ("nvidia-a100", "x4"),
+ ]


class InferenceEndpointModel(LightevalModel):
    """InferenceEndpointModels can be used both with the free inference client, or with inference
    endpoints, which will use text-generation-inference to deploy your model for the duration of the evaluation.
    """

-     def __init__(
+     def __init__(  # noqa: C901
        self, config: Union[InferenceEndpointModelConfig, InferenceModelConfig], env_config: EnvConfig
    ) -> None:
        self.reuse_existing = getattr(config, "should_reuse_existing", True)
        self._max_length = None
+         self.endpoint = None
+         self.model_name = None
        if isinstance(config, InferenceEndpointModelConfig):
-             if config.should_reuse_existing:
-                 self.endpoint = get_inference_endpoint(
-                     name=config.name, token=env_config.token, namespace=config.namespace
+             if config.instance_type and config.instance_size and config.vendor and config.region:
+                 vendor, region, instance_type, instance_size = (
+                     config.vendor,
+                     config.region,
+                     config.instance_type,
+                     config.instance_size,
                )
            else:
-                 self.endpoint: InferenceEndpoint = create_inference_endpoint(
-                     name=config.name,
-                     namespace=config.namespace,
-                     repository=config.repository,
-                     revision=config.revision,
-                     framework=config.framework,
-                     task="text-generation",
-                     accelerator=config.accelerator,
-                     vendor=config.vendor,
-                     region=config.region,
-                     type=config.endpoint_type,
-                     instance_size=config.instance_size,
-                     instance_type=config.instance_type,
-                     token=env_config.token,
-                     custom_image={
-                         "health_route": "/health",
-                         "env": {
-                             # Documentaiton: https://huggingface.co/docs/text-generation-inference/en/basic_tutorials/launcher
-                             "MAX_BATCH_PREFILL_TOKENS": "2048",
-                             "MAX_INPUT_LENGTH": "2047",
-                             "MAX_TOTAL_TOKENS": "2048",
-                             "MODEL_ID": "/repository",
-                             "HF_MODEL_TRUST_REMOTE_CODE": "true",
-                             **config.get_dtype_args(),
-                             **config.get_custom_env_vars(),
-                         },
-                         "url": (config.image_url or "ghcr.io/huggingface/text-generation-inference:latest"),
-                     },
-                 )
-             hlog("Deploying your endpoint. Please wait.")
-             try:
-                 self.endpoint.wait(timeout=600)  # Waits for the endpoint to be deployed
-             except InferenceEndpointTimeoutError as e:
-                 hlog_err("Endpoint did not start within 10 minutes, there was a timeout.")
-                 raise e
+                 try:
+                     vendor, region, instance_type, instance_size = InferenceEndpointModel.get_suggested_model_config(
+                         config.model_name
+                     )
+                 except Exception:
+                     vendor, region, instance_type, instance_size = (
+                         "aws",
+                         "us-east-1",
+                         *InferenceEndpointModel.get_larger_hardware_suggestion(),
+                     )
+
+             must_scaleup_endpoint = False
+             timer_start = time.time()
+             # Endpoint names do not allow special characters
+             endpoint_name = config.endpoint_name or re.sub(
+                 "[^a-zA-Z0-9-]", "-", config.model_name.lower() + "-lighteval"
+             )
+             # While there is no endpoint or it is not yet running, and we are still under the one-hour limit
+             while (self.endpoint is None or self.endpoint.status != "running") and (
+                 time.time() - timer_start < MAX_TIME_FOR_SPINUP
+             ):
+                 try:
+                     if self.endpoint is None:  # Endpoint does not exist yet locally
+                         if not config.should_reuse_existing:  # New endpoint
+                             hlog("Creating endpoint.")
+                             self.endpoint: InferenceEndpoint = create_inference_endpoint(
+                                 name=endpoint_name,
+                                 namespace=config.namespace,
+                                 repository=config.model_name,
+                                 revision=config.revision,
+                                 framework=config.framework,
+                                 task="text-generation",
+                                 accelerator=config.accelerator,
+                                 type=config.endpoint_type,
+                                 vendor=vendor,
+                                 region=region,
+                                 instance_size=instance_size,
+                                 instance_type=instance_type,
+                                 token=env_config.token,
+                                 custom_image={
+                                     "health_route": "/health",
+                                     "env": {
+                                         # Documentation: https://huggingface.co/docs/text-generation-inference/en/basic_tutorials/launcher
+                                         "MAX_BATCH_PREFILL_TOKENS": "2048",
+                                         "MAX_INPUT_LENGTH": "2047",
+                                         "MAX_TOTAL_TOKENS": "2048",
+                                         "MODEL_ID": "/repository",
+                                         "HF_MODEL_TRUST_REMOTE_CODE": "true",
+                                         **config.get_dtype_args(),
+                                         **config.get_custom_env_vars(),
+                                     },
+                                     "url": (
+                                         config.image_url or "ghcr.io/huggingface/text-generation-inference:latest"
+                                     ),
+                                 },
+                             )
+                         else:  # Endpoint exists
+                             hlog("Reusing existing endpoint.")
+                             self.endpoint = get_inference_endpoint(
+                                 name=endpoint_name, token=env_config.token, namespace=config.namespace
+                             )
+
+                     else:
+                         # The endpoint exists locally, but it either failed (and most likely must be scaled up)...
+                         if must_scaleup_endpoint:
+                             hlog("Rescaling existing endpoint.")
+                             self.endpoint.update(instance_size=instance_size, instance_type=instance_type)
+                             must_scaleup_endpoint = False
+                         # ...or we hit a connection error, in which case we do nothing and simply wait for the next attempt
+
+                     # Waits for the endpoint to be deployed - we could also check for the status being 'updating', 'pending', or 'initializing'
+                     hlog("Trying to deploy your endpoint. Please wait for 10 min.")
+                     self.endpoint.wait(timeout=600, refresh_every=60)  # We wait for 10 min
+                 except InferenceEndpointError as e:
+                     instance_type, instance_size = InferenceEndpointModel.get_larger_hardware_suggestion(
+                         instance_type, instance_size
+                     )
+                     must_scaleup_endpoint = True
+
+                     hlog(
+                         f"Endpoint failed to start on current hardware with error {e}. Trying to autoscale to ({instance_type}, {instance_size})."
+                     )
+                 except InferenceEndpointTimeoutError as e:
+                     hlog_err("Endpoint did not start within 10 minutes, there was a timeout. Please inspect the logs.")
+                     raise e
+                 except HfHubHTTPError as e:
+                     # The endpoint actually already exists, we'll spin it up instead of trying to create a new one
+                     if "409 Client Error: Conflict for url:" in str(e):
+                         config.endpoint_name = endpoint_name
+                         config.should_reuse_existing = True
+                     # Requested resources are not available
+                     elif "Bad Request: Compute instance not available yet" in str(e):
+                         hlog_err(
+                             f"The hardware combination you are requesting does not seem to be available: ({instance_type}, {instance_size}, {config.region})."
+                         )
+                         raise e
+                     # User account does not have access to requested resources
+                     elif "Conflict: Quota exceeded" in str(e):
+                         raise e
+                 except ConnectionError as e:
+                     hlog_err(f"Connection failed with error {e}. Retrying.")
+
+             if self.endpoint.status != "running":
+                 raise Exception("Did not manage to start endpoint within the elapsed time and on suggested hardware.")
+
            hlog("Endpoint successfully deployed!")
-             self.name = config.repository
+             self.endpoint_name = config.endpoint_name
+             self.name = self.endpoint.repository
            self.revision = self.endpoint.revision
            self.async_client: AsyncInferenceClient = self.endpoint.async_client
            self.client: InferenceClient = self.endpoint.client

        else:  # Free inference client
            self.endpoint = None
+             self.endpoint_name = None
            self.name = config.model
            self.revision = "default"
            self.async_client = AsyncInferenceClient(model=config.model, token=env_config.token)
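
Taken together, the new `__init__` replaces the old single `endpoint.wait(timeout=600)` with a spin-up loop: wait up to 10 minutes, and on an `InferenceEndpointError` climb one rung of `SORTED_INSTANCE_SIZES` and `update()` the endpoint, repeating until the endpoint is `"running"` or the one-hour `MAX_TIME_FOR_SPINUP` budget is spent. Below is a minimal, self-contained sketch of that control flow; `SpinupError`, `FakeEndpoint`, `next_size`, and `spin_up` are hypothetical stand-ins, not lighteval or huggingface_hub APIs:

```python
import time

# Escalation ladder mirroring SORTED_INSTANCE_SIZES above.
LADDER = [
    ("nvidia-a10g", "x1"),
    ("nvidia-t4", "x4"),
    ("nvidia-a100", "x1"),
    ("nvidia-a10g", "x4"),
    ("nvidia-a100", "x2"),
    ("nvidia-a100", "x4"),
]


class SpinupError(Exception):
    """Stand-in for InferenceEndpointError: the endpoint failed on the current hardware."""


def next_size(cur):
    """Walk one rung up the ladder, like get_larger_hardware_suggestion."""
    ix = LADDER.index(cur) if cur in LADDER else -1
    if ix + 1 >= len(LADDER):
        raise Exception("Refusing to scale above 4x A100 automatically.")
    return LADDER[ix + 1]


def spin_up(endpoint, budget_s=3600):
    """Retry deployment, scaling hardware one rung per failure, until running or out of time."""
    start = time.time()
    while endpoint.status != "running" and time.time() - start < budget_s:
        try:
            endpoint.wait()  # analogous to self.endpoint.wait(timeout=600, refresh_every=60)
        except SpinupError:
            endpoint.update(next_size(endpoint.hw))  # pick bigger hardware and rescale
    if endpoint.status != "running":
        raise Exception("Did not manage to start endpoint within the elapsed time.")
    return endpoint.hw


class FakeEndpoint:
    """Hypothetical stub that only boots on at least an A100 x1 (ladder index 2)."""

    def __init__(self):
        self.status, self.hw = "pending", LADDER[0]

    def wait(self):
        if LADDER.index(self.hw) >= 2:
            self.status = "running"
        else:
            raise SpinupError(f"OOM on {self.hw}")

    def update(self, hw):
        self.hw = hw


print(spin_up(FakeEndpoint()))  # -> ('nvidia-a100', 'x1')
```
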
@@ -131,6 +225,43 @@ def __init__(
            model_size=-1,
        )

+     @staticmethod
+     def get_larger_hardware_suggestion(cur_instance_type: str = None, cur_instance_size: str = None):
+         cur_instance_ix = -1
+         try:
+             if cur_instance_type and cur_instance_size:
+                 cur_instance_ix = SORTED_INSTANCE_SIZES.index((cur_instance_type, cur_instance_size))
+             new_instance_type = SORTED_INSTANCE_SIZES[cur_instance_ix + 1][0]
+             new_instance_size = SORTED_INSTANCE_SIZES[cur_instance_ix + 1][1]
+             return new_instance_type, new_instance_size
+         except ValueError:
+             raise Exception(
+                 f"Problem when scaling endpoint: the current instance combination ({cur_instance_type}, {cur_instance_size}) is unknown. Can't scale it up."
+             )
+         except IndexError:
+             raise Exception(
+                 "To avoid accidental costs, we do not automatically upgrade the current endpoint above 4x A100; please request it explicitly."
+             )
+
+     @staticmethod
+     def get_suggested_model_config(model_repo):
+         # Code from https://huggingface.co/spaces/huggingface/dedicated-endpoint-snooper/blob/main/app.py
+         # Example of the suggestedCompute value: 'aws-us-east-1-nvidia-l4-x1'
+         # -> aws us-east-1 nvidia-l4 x1
+         url = f"https://ui.endpoints.huggingface.co/api/configuration?model_id={model_repo}"
+         response = requests.get(url)
+         config = response.json()
+
+         suggested_compute = config["suggestedCompute"]
+         suggested_vendor = suggested_compute.split("-")[0]
+         if suggested_vendor == "azure":
+             suggested_region = suggested_compute.split("-")[1]
+         else:
+             suggested_region = "-".join(suggested_compute.split("-")[1:4])
+         suggested_instance = "-".join(suggested_compute.split("-")[-3:-1])
+         suggested_size = suggested_compute.split("-")[-1]
+         return suggested_vendor, suggested_region, suggested_instance, suggested_size
+
    @property
    def tokenizer(self):
        return self._tokenizer
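
Since `get_suggested_model_config` only documents the AWS shape of `suggestedCompute`, here is a quick offline check of how its split-on-`-` logic decomposes both vendor formats. The function below reproduces the splitting without the HTTP call, and the Azure value is a made-up example:

```python
def parse_suggested_compute(suggested_compute: str):
    # Same splitting logic as get_suggested_model_config, minus the requests.get call.
    parts = suggested_compute.split("-")
    vendor = parts[0]
    # Azure region names contain no dashes ("eastus"); AWS ones do ("us-east-1").
    region = parts[1] if vendor == "azure" else "-".join(parts[1:4])
    instance = "-".join(parts[-3:-1])  # e.g. "nvidia-l4"
    size = parts[-1]                   # e.g. "x1"
    return vendor, region, instance, size


print(parse_suggested_compute("aws-us-east-1-nvidia-l4-x1"))
# -> ('aws', 'us-east-1', 'nvidia-l4', 'x1')
print(parse_suggested_compute("azure-eastus-nvidia-t4-x1"))  # hypothetical Azure value
# -> ('azure', 'eastus', 'nvidia-t4', 'x1')
```
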
@@ -144,11 +275,17 @@ def disable_tqdm(self) -> bool:
        False  # no accelerator = this is the main process

    def cleanup(self):
-         if self.endpoint is not None and not self.reuse_existing:
-             self.endpoint.delete()
-             hlog_warn(
-                 "You deleted your endpoint after using it. You'll need to create it again if you need to reuse it."
-             )
+         if self.endpoint is not None:
+             if self.reuse_existing:
+                 self.endpoint.pause()
+                 hlog_warn(
+                     "Since your endpoint already existed, we paused it rather than deleting it. You might want to delete it if you're done using it."
+                 )
+             else:
+                 self.endpoint.delete()
+                 hlog_warn(
+                     "We deleted the endpoint we spun up after using it. You'll need to create it again if you want to reuse it."
+                 )

    @property
    def max_length(self):
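
The reworked `cleanup()` now distinguishes the two ownership cases: endpoints that existed before the run are paused (so they can be resumed cheaply later), while endpoints the run created are deleted. A small sketch of that contract, with a hypothetical stub in place of a real `InferenceEndpoint`:

```python
class StubEndpoint:
    """Hypothetical stand-in for huggingface_hub.InferenceEndpoint."""

    def pause(self):
        print("paused")

    def delete(self):
        print("deleted")


def cleanup(endpoint, reuse_existing: bool):
    if endpoint is None:
        return
    if reuse_existing:
        endpoint.pause()   # keep the pre-existing endpoint around for later reuse
    else:
        endpoint.delete()  # remove the endpoint this run spun up, to avoid idle costs


cleanup(StubEndpoint(), reuse_existing=True)   # -> paused
cleanup(StubEndpoint(), reuse_existing=False)  # -> deleted
```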