32
32
},
33
33
}
34
34
35
+
35
36
def get_url (version , sku_id = None ):
36
37
if version == 'v2beta/skus' :
37
38
return urls [version ]['BASE_URL' ] + urls [version ]['QUERY_STRING' ]
@@ -40,6 +41,7 @@ def get_url(version, sku_id=None):
40
41
else :
41
42
return None
42
43
44
+
43
45
# Get authentication token
44
46
def get_access_token ():
45
47
try :
@@ -53,6 +55,7 @@ def get_access_token():
53
55
send_slack_message (f"[GCP Collector]\n Error in get_access_token: { str (e )} " )
54
56
raise
55
57
58
+
56
59
# str1: GPU model name string (e.g. nvidia-h100-mega-80gb)
57
60
# str2: List of GPU model names to compare similarity with (e.g. ['A100', 'H100', 'L4'])
58
61
def jaccard_similarity (str1 , str2 ):
@@ -77,6 +80,7 @@ def jaccard_similarity(str1, str2):
77
80
max_str = str
78
81
return max_str
79
82
83
+
80
84
def call_api (version = None , sku_id = None , page_token = None ):
81
85
token = get_access_token ()
82
86
headers = {"Authorization" : f"Bearer { token } " }
@@ -97,6 +101,7 @@ def call_api(version=None, sku_id=None, page_token=None):
97
101
send_slack_message (error_msg )
98
102
raise
99
103
104
+
100
105
# Get SKU information
101
106
def get_sku_infos (response ):
102
107
try :
@@ -106,26 +111,26 @@ def get_sku_infos(response):
106
111
for sku in skus :
107
112
info_type = None
108
113
if (len (sku ['productTaxonomy' ]['taxonomyCategories' ]) == 6 and
109
- sku ['productTaxonomy' ]['taxonomyCategories' ][0 ]['category' ] == 'GCP' and
110
- sku ['productTaxonomy' ]['taxonomyCategories' ][1 ]['category' ] == 'Compute' and
111
- sku ['productTaxonomy' ]['taxonomyCategories' ][2 ]['category' ] == 'GCE' and
112
- (sku ['productTaxonomy' ]['taxonomyCategories' ][3 ]['category' ] == 'VMs Preemptible' or sku ['productTaxonomy' ]['taxonomyCategories' ][3 ]['category' ] == 'VMs On Demand' ) and
114
+ sku ['productTaxonomy' ]['taxonomyCategories' ][0 ]['category' ] == 'GCP' and
115
+ sku ['productTaxonomy' ]['taxonomyCategories' ][1 ]['category' ] == 'Compute' and
116
+ sku ['productTaxonomy' ]['taxonomyCategories' ][2 ]['category' ] == 'GCE' and
117
+ (sku ['productTaxonomy' ]['taxonomyCategories' ][3 ]['category' ] == 'VMs Preemptible' or sku ['productTaxonomy' ]['taxonomyCategories' ][3 ]['category' ] == 'VMs On Demand' ) and
113
118
(sku ['productTaxonomy' ]['taxonomyCategories' ][4 ]['category' ] == 'Memory: Per GB' or sku ['productTaxonomy' ]['taxonomyCategories' ][4 ]['category' ] == 'Cores: Per Core' or sku ['productTaxonomy' ]['taxonomyCategories' ][4 ]['category' ] == 'Cores: 1 to 64' ) and
114
119
'Custom' not in sku ['displayName' ] and
115
120
'Sole Tenancy' not in sku ['displayName' ] and
116
- sku ['productTaxonomy' ]['taxonomyCategories' ][5 ]['category' ] != 'Cross VM' ):
121
+ sku ['productTaxonomy' ]['taxonomyCategories' ][5 ]['category' ] != 'Cross VM' ):
117
122
info_type = "VMs"
118
123
elif (len (sku ['productTaxonomy' ]['taxonomyCategories' ]) == 5 and
119
- sku ['productTaxonomy' ]['taxonomyCategories' ][0 ]['category' ] == 'GCP' and
120
- sku ['productTaxonomy' ]['taxonomyCategories' ][1 ]['category' ] == 'Compute' and
121
- sku ['productTaxonomy' ]['taxonomyCategories' ][2 ]['category' ] == 'GPUs' and
122
- (sku ['productTaxonomy' ]['taxonomyCategories' ][3 ]['category' ] == 'GPUs Preemptible' or sku ['productTaxonomy' ]['taxonomyCategories' ][3 ]['category' ] == 'GPUs On Demand' )):
124
+ sku ['productTaxonomy' ]['taxonomyCategories' ][0 ]['category' ] == 'GCP' and
125
+ sku ['productTaxonomy' ]['taxonomyCategories' ][1 ]['category' ] == 'Compute' and
126
+ sku ['productTaxonomy' ]['taxonomyCategories' ][2 ]['category' ] == 'GPUs' and
127
+ (sku ['productTaxonomy' ]['taxonomyCategories' ][3 ]['category' ] == 'GPUs Preemptible' or sku ['productTaxonomy' ]['taxonomyCategories' ][3 ]['category' ] == 'GPUs On Demand' )):
123
128
info_type = "GPUs"
124
129
elif (len (sku ['productTaxonomy' ]['taxonomyCategories' ]) == 6 and
125
- sku ['productTaxonomy' ]['taxonomyCategories' ][0 ]['category' ] == 'GCP' and
126
- sku ['productTaxonomy' ]['taxonomyCategories' ][1 ]['category' ] == 'Compute' and
127
- sku ['productTaxonomy' ]['taxonomyCategories' ][2 ]['category' ] == 'GPUs' and
128
- (sku ['productTaxonomy' ]['taxonomyCategories' ][3 ]['category' ] == 'GPUs Preemptible' or sku ['productTaxonomy' ]['taxonomyCategories' ][3 ]['category' ] == 'GPUs On Demand' )):
130
+ sku ['productTaxonomy' ]['taxonomyCategories' ][0 ]['category' ] == 'GCP' and
131
+ sku ['productTaxonomy' ]['taxonomyCategories' ][1 ]['category' ] == 'Compute' and
132
+ sku ['productTaxonomy' ]['taxonomyCategories' ][2 ]['category' ] == 'GPUs' and
133
+ (sku ['productTaxonomy' ]['taxonomyCategories' ][3 ]['category' ] == 'GPUs Preemptible' or sku ['productTaxonomy' ]['taxonomyCategories' ][3 ]['category' ] == 'GPUs On Demand' )):
129
134
info_type = "GPUs_with_Core_and_Memory"
130
135
else :
131
136
continue
@@ -192,6 +197,8 @@ def get_sku_infos(response):
192
197
gpu_type = sku ['productTaxonomy' ]['taxonomyCategories' ][4 ]['category' ]
193
198
price_model = "On-demand" if sku ['productTaxonomy' ]['taxonomyCategories' ][3 ]['category' ] == 'GPUs On Demand' else "Preemptible"
194
199
price_resource = sku ['productTaxonomy' ]['taxonomyCategories' ][5 ]['category' ].split (":" )[0 ]
200
+ if price_resource == 'GPU slice' : # Skip GPU slice resource
201
+ continue
195
202
if sku ['geoTaxonomy' ]['type' ] == 'TYPE_REGIONAL' :
196
203
region = sku ['geoTaxonomy' ]['regionalMetadata' ]['region' ]['region' ]
197
204
gpu_sku_infos .append ({
@@ -221,6 +228,7 @@ def get_sku_infos(response):
221
228
send_slack_message (error_msg )
222
229
raise
223
230
231
+
224
232
def get_price_infos (response , sku_ids , gpu_sku_ids ):
225
233
try :
226
234
prices = response ['prices' ]
@@ -231,7 +239,8 @@ def get_price_infos(response, sku_ids, gpu_sku_ids):
231
239
if sku_id in sku_ids :
232
240
price_value = None
233
241
try :
234
- price_value = int (price ['rate' ]['tiers' ][0 ]['listPrice' ]['units' ]) + price ['rate' ]['tiers' ][0 ]['listPrice' ]['nanos' ] * 0.000000001
242
+ price_value = int (price ['rate' ]['tiers' ][0 ]['listPrice' ]['units' ]) + \
243
+ price ['rate' ]['tiers' ][0 ]['listPrice' ]['nanos' ] * 0.000000001
235
244
except :
236
245
price_value = price ['rate' ]['tiers' ][0 ]['listPrice' ]['nanos' ] * 0.000000001
237
246
price_infos .append ({
@@ -246,7 +255,8 @@ def get_price_infos(response, sku_ids, gpu_sku_ids):
246
255
price_value = None
247
256
try :
248
257
try :
249
- price_value = int (price ['rate' ]['tiers' ][0 ]['listPrice' ]['units' ]) + price ['rate' ]['tiers' ][0 ]['listPrice' ]['nanos' ] * 0.000000001
258
+ price_value = int (price ['rate' ]['tiers' ][0 ]['listPrice' ]['units' ]) + \
259
+ price ['rate' ]['tiers' ][0 ]['listPrice' ]['nanos' ] * 0.000000001
250
260
except :
251
261
price_value = price ['rate' ]['tiers' ][0 ]['listPrice' ]['nanos' ] * 0.000000001
252
262
except :
@@ -264,20 +274,21 @@ def get_price_infos(response, sku_ids, gpu_sku_ids):
264
274
send_slack_message (f"[GCP Collector]\n KeyError in get_price_infos: { str (e )} " )
265
275
raise
266
276
277
+
267
278
def list_regions_and_machine_types (gpu_families ):
268
279
try :
269
280
# Create Compute Engine API client (using JSON key file)
270
281
client = compute_v1 .RegionsClient .from_service_account_file (SERVICE_ACCOUNT_FILE )
271
282
machine_types_client = compute_v1 .MachineTypesClient .from_service_account_file (SERVICE_ACCOUNT_FILE )
272
-
283
+
273
284
# Get project ID (read from JSON file)
274
285
with open (SERVICE_ACCOUNT_FILE , 'r' ) as f :
275
286
import json
276
287
project_id = json .load (f )['project_id' ]
277
-
288
+
278
289
# Get all regions
279
290
regions = client .list (project = project_id )
280
-
291
+
281
292
# Save results
282
293
region_machine_types = []
283
294
@@ -286,7 +297,7 @@ def list_regions_and_machine_types(gpu_families):
286
297
# Get machine types for each region
287
298
for region in regions :
288
299
zone_list = list_zones_in_region (region .name , project_id )
289
-
300
+
290
301
for zone in zone_list :
291
302
machine_types = machine_types_client .list (project = project_id , zone = zone )
292
303
for machine_type in machine_types :
@@ -307,28 +318,30 @@ def list_regions_and_machine_types(gpu_families):
307
318
"gpuCount" : gpu_count ,
308
319
"gpuType" : gpu_type ,
309
320
})
310
-
321
+
311
322
return region_machine_types
312
323
except Exception as e :
313
324
send_slack_message (f"[GCP Collector]\n Error in list_regions_and_machine_types: { str (e )} " )
314
325
raise
315
326
327
+
316
328
def list_zones_in_region(region_name, project_id):
    """Return the names of all zones that belong to the given region.

    Lists every zone of the project through the Compute Engine Zones client
    and keeps the ones named ``<region_name>-<suffix>``.

    Args:
        region_name: Name of the region to filter on (e.g. "europe-west1").
        project_id: ID of the GCP project whose zones are listed.

    Returns:
        A list of zone name strings for the region (empty if none match).

    Raises:
        Exception: Any error raised while creating the client or listing
            zones is reported through send_slack_message and re-raised.
    """
    try:
        zones_client = compute_v1.ZonesClient.from_service_account_file(SERVICE_ACCOUNT_FILE)
        zones = zones_client.list(project=project_id)

        # Match on "<region>-" instead of the bare region name: a plain
        # startswith(region_name) would also accept zones of regions that
        # merely share the prefix (e.g. "europe-west1" vs "europe-west10-a").
        zone_prefix = f"{region_name}-"
        return [zone.name for zone in zones if zone.name.startswith(zone_prefix)]
    except Exception as e:
        send_slack_message(f"[GCP Collector]\n Error in list_zones_in_region: {str(e)} ")
        raise
331
343
344
+
332
345
# Define price calculation function
333
346
def calculate_price (row , cores_key , memory_key , gpu_key ):
334
347
cores_price = row [cores_key ]
@@ -344,10 +357,11 @@ def calculate_price(row, cores_key, memory_key, gpu_key):
344
357
gpu_price = 0 # Treat as 0 if only GPU price is missing
345
358
return max (row ["vcpus" ], 1 ) * cores_price + row ["memory" ] * memory_price + row ["gpuCount" ] * gpu_price
346
359
360
+
347
361
def upload_cloudwatch (df_current , timestamp ):
348
362
ondemand_count = len (df_current .drop (columns = ['Spot Price' , 'Savings' ]).dropna ())
349
363
spot_count = len (df_current .drop (columns = ['OnDemand Price' , 'Savings' ]).dropna ())
350
-
364
+
351
365
cw_client = boto3 .client ('logs' )
352
366
353
367
log_event = {
@@ -357,16 +371,17 @@ def upload_cloudwatch(df_current, timestamp):
357
371
358
372
cw_client .put_log_events (
359
373
logGroupName = GCP_CONST .SPOT_DATA_COLLECTION_LOG_GROUP_NAME ,
360
- logStreamName = GCP_CONST .LOG_STREAM_NAME ,
374
+ logStreamName = GCP_CONST .LOG_STREAM_NAME ,
361
375
logEvents = [log_event ]
362
376
)
363
377
378
+
364
379
def lambda_handler (event , context ):
365
380
try :
366
381
start_time = time .time ()
367
382
str_datetime = datetime .now (timezone .utc ).strftime ("%Y-%m-%dT%H:%M" )
368
383
timestamp = datetime .strptime (str_datetime , "%Y-%m-%dT%H:%M" )
369
-
384
+
370
385
response = call_api (version = 'v2beta/skus' )
371
386
sku_infos , gpu_sku_infos = get_sku_infos (response )
372
387
while 'nextPageToken' in response :
@@ -376,9 +391,11 @@ def lambda_handler(event, context):
376
391
gpu_sku_infos += new_gpu_sku_infos
377
392
print ("Complete to get sku_infos" )
378
393
379
- sku_df = pd .DataFrame (sku_infos ).sort_values (by = ["machineFamily" , "region" , "priceModel" , "priceResource" ], ascending = True ).reset_index (drop = True )
394
+ sku_df = pd .DataFrame (sku_infos ).sort_values (
395
+ by = ["machineFamily" , "region" , "priceModel" , "priceResource" ], ascending = True ).reset_index (drop = True )
380
396
381
- gpu_sku_df = pd .DataFrame (gpu_sku_infos ).sort_values (by = ["gpuType" , "region" , "priceModel" , "priceResource" ], ascending = True ).reset_index (drop = True )
397
+ gpu_sku_df = pd .DataFrame (gpu_sku_infos ).sort_values (
398
+ by = ["gpuType" , "region" , "priceModel" , "priceResource" ], ascending = True ).reset_index (drop = True )
382
399
383
400
sku_ids = set ([sku_info ['skuId' ] for sku_info in sku_infos ])
384
401
gpu_sku_ids = set ([gpu_sku_info ['skuId' ] for gpu_sku_info in gpu_sku_infos ])
@@ -404,7 +421,8 @@ def lambda_handler(event, context):
404
421
405
422
machine_types_infos = list_regions_and_machine_types (list (gpu_df ['gpuType' ].unique ()))
406
423
407
- machine_types_df = pd .DataFrame (machine_types_infos ).sort_values (by = ["machineFamily" , "machineType" , "region" , "vcpus" , "memory" ], ascending = True ).reset_index (drop = True )
424
+ machine_types_df = pd .DataFrame (machine_types_infos ).sort_values (
425
+ by = ["machineFamily" , "machineType" , "region" , "vcpus" , "memory" ], ascending = True ).reset_index (drop = True )
408
426
machine_types_df ['machineModel' ] = 'Standard'
409
427
410
428
# DataFrame transformation code
@@ -464,10 +482,13 @@ def lambda_handler(event, context):
464
482
df_final [col ] = df_final [col ].fillna (df_final [f"{ col } _new" ])
465
483
466
484
# Remove temporary columns with '_new' suffix
467
- df_final = df_final .drop (columns = [f"{ col } _new" for col in ["ondemandCorePrice" , "ondemandMemoryPrice" , "preemptibleCorePrice" , "preemptibleMemoryPrice" ]])
485
+ df_final = df_final .drop (columns = [f"{ col } _new" for col in ["ondemandCorePrice" ,
486
+ "ondemandMemoryPrice" , "preemptibleCorePrice" , "preemptibleMemoryPrice" ]])
468
487
469
- df_final ['ondemandPrice' ] = df_final .apply (lambda row : calculate_price (row , "ondemandCorePrice" , "ondemandMemoryPrice" , "ondemandGPUPrice" ), axis = 1 )
470
- df_final ['preemptiblePrice' ] = df_final .apply (lambda row : calculate_price (row , "preemptibleCorePrice" , "preemptibleMemoryPrice" , "preemptibleGPUPrice" ), axis = 1 )
488
+ df_final ['ondemandPrice' ] = df_final .apply (lambda row : calculate_price (
489
+ row , "ondemandCorePrice" , "ondemandMemoryPrice" , "ondemandGPUPrice" ), axis = 1 )
490
+ df_final ['preemptiblePrice' ] = df_final .apply (lambda row : calculate_price (
491
+ row , "preemptibleCorePrice" , "preemptibleMemoryPrice" , "preemptibleGPUPrice" ), axis = 1 )
471
492
472
493
# Construct final DataFrame
473
494
df_final ['Time' ] = timestamp .strftime ("%Y-%m-%d %H:%M:%S" )
@@ -520,5 +541,6 @@ def lambda_handler(event, context):
520
541
send_slack_message (f"[GCP Collector]\n Unhandled exception in main: { str (e )} " )
521
542
raise
522
543
544
+
523
545
# Script entry point: run the handler once with an empty event and an empty
# context when this module is executed directly rather than imported.
if __name__ == "__main__":
    lambda_handler(event={}, context={})
0 commit comments