Skip to content

Commit 0aa45cf

Browse files
committed
df 필드 수정, df 처리 프로세스 수정 (Fix df fields; fix the df processing flow)
1 parent 1a5d6bb commit 0aa45cf

File tree

4 files changed

+21
-27
lines changed

4 files changed

+21
-27
lines changed

Diff for: collector/spot-dataset/azure/lambda/current_collector/lambda_function_sps.py

+17-12
Original file line numberDiff line numberDiff line change
@@ -32,20 +32,15 @@ def lambda_handler(event, _):
3232
return handle_response(200, "Executed successfully. Scheduled time skipped.", action, event_time_utc)
3333

3434
sps_res_df = load_sps.collect_spot_placement_score(desired_count=desired_count)
35+
3536
else:
3637
raise ValueError(f"Invalid lambda action.")
3738

3839

39-
# price_if_df = S3.read_file(AZURE_CONST.S3_LATEST_PRICE_IF_GZIP_SAVE_PATH, 'pkl.gz')
40-
price_if_df = pd.DataFrame(S3.read_file(AZURE_CONST.LATEST_FILENAME, 'json'))
41-
price_eviction_sps_df = merge_price_eviction_sps_df(price_if_df, sps_res_df)
42-
4340
if sps_res_df is None: raise ValueError("sps_res_df is None")
44-
if price_if_df is None: raise ValueError("price_if_df is None")
45-
if price_eviction_sps_df is None: raise ValueError("price_eviction_sps_df is None")
4641

47-
if not update_and_save_res_df(price_eviction_sps_df, event_time_utc):
48-
raise RuntimeError("Failed to update or save price_eviction_sps_df data")
42+
if not handle_res_df(sps_res_df, event_time_utc):
43+
raise RuntimeError("Failed to handle_res_df")
4944

5045
return handle_response(200, "Executed Successfully!", action, event_time_utc)
5146

@@ -56,11 +51,21 @@ def lambda_handler(event, _):
5651
return handle_response(500, "Execute Failed!", action, event_time_utc, str(e))
5752

5853

59-
def update_and_save_res_df(price_eviction_sps_df, event_time_utc):
54+
def handle_res_df(sps_res_df, event_time_utc):
6055
try:
61-
update_result = update_latest_sps(price_eviction_sps_df, event_time_utc)
62-
save_result = save_raw_sps(price_eviction_sps_df, event_time_utc)
63-
return update_result and save_result
56+
sps_res_df['time'] = event_time_utc.strftime("%Y-%m-%d %H:%M:%S")
57+
sps_res_df['AvailabilityZone'] = sps_res_df['AvailabilityZone'].where(pd.notna(sps_res_df['AvailabilityZone']), None)
58+
59+
60+
# price_if_df = S3.read_file(AZURE_CONST.S3_LATEST_PRICE_IF_GZIP_SAVE_PATH, 'pkl.gz')
61+
price_if_df = pd.DataFrame(S3.read_file(AZURE_CONST.LATEST_FILENAME, 'json'))
62+
price_eviction_sps_df = merge_price_eviction_sps_df(price_if_df, sps_res_df)
63+
64+
if price_if_df is None: raise ValueError("price_if_df is None")
65+
if price_eviction_sps_df is None: raise ValueError("price_eviction_sps_df is None")
66+
67+
return update_latest_sps(price_eviction_sps_df) and save_raw_sps(price_eviction_sps_df, event_time_utc)
68+
6469

6570
except Exception as e:
6671
logger.error(f"Error in handle_res_df function: {e}")

Diff for: collector/spot-dataset/azure/lambda/current_collector/load_sps.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -162,10 +162,10 @@ def execute_spot_placement_score_task_by_parameter_pool_df(api_calls_df, availab
162162
score_data = {
163163
"DesiredCount": desired_count,
164164
"AvailabilityZone": score.get("availabilityZone", None),
165-
# "RegionCodeSPS": score.get("region", None),
165+
"RegionCodeSPS": score.get("region", None),
166166
"Region": SS_Resources.region_map_and_instance_map_tmp['region_map'].get(
167167
score.get("region", ""), ""),
168-
# "InstanceTypeSPS": score.get("sku", None),
168+
"InstanceTypeSPS": score.get("sku", None),
169169
"InstanceTier": SS_Resources.region_map_and_instance_map_tmp['instance_map'].get(
170170
score.get("sku", ""), {}).get("InstanceTier", None),
171171
"InstanceType": SS_Resources.region_map_and_instance_map_tmp['instance_map'].get(

Diff for: collector/spot-dataset/azure/lambda/current_collector/utils/merge_df.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ def merge_price_eviction_df(price_df, eviction_df):
1717
def merge_price_eviction_sps_df(price_eviction_df, sps_df):
1818
join_df = pd.merge(price_eviction_df, sps_df, on=['InstanceTier', 'InstanceType', 'Region'], how='outer')
1919
join_df.rename(columns={'time_x': 'PriceEviction_Update_Time', 'time_y': 'SPS_Update_Time'}, inplace=True)
20-
join_df.drop(columns=['id_x', 'id_y'], inplace=True)
20+
join_df.drop(columns=['id', 'InstanceTypeSPS', 'RegionCodeSPS'], inplace=True)
2121
join_df = join_df[["InstanceTier", "InstanceType", "Region", "OndemandPrice", "SpotPrice", "Savings", "IF",
2222
"PriceEviction_Update_Time", "DesiredCount", "AvailabilityZone", "Score", "SPS_Update_Time"]]
2323

Diff for: collector/spot-dataset/azure/lambda/current_collector/utils/upload_data.py

+1-12
Original file line numberDiff line numberDiff line change
@@ -161,17 +161,9 @@ def upload_cloudwatch(data, timestamp):
161161
)
162162

163163

164-
def update_latest_sps(dataframe, time_utc):
164+
def update_latest_sps(dataframe):
165165
try:
166-
formatted_time = time_utc.strftime("%Y-%m-%d %H:%M:%S")
167-
168-
dataframe['id'] = dataframe.index + 1
169-
dataframe['time'] = formatted_time
170-
dataframe = dataframe[['id', 'InstanceTier', 'InstanceType', 'Region', 'DesiredCount', 'AvailabilityZone', 'Score', 'InstanceTypeSPS', 'RegionCodeSPS', 'time']]
171-
dataframe['AvailabilityZone'] = dataframe['AvailabilityZone'].where(pd.notna(dataframe['AvailabilityZone']), None)
172-
173166
json_data = dataframe.to_dict(orient="records")
174-
175167
S3.upload_file(json_data, f"{AZURE_CONST.LATEST_SPS_FILENAME}", "json", set_public_read=True)
176168
return True
177169

@@ -182,9 +174,6 @@ def update_latest_sps(dataframe, time_utc):
182174

183175
def save_raw_sps(dataframe, time_utc):
184176
try:
185-
dataframe['Time'] = time_utc
186-
dataframe = dataframe[['Time','InstanceTier','InstanceType', 'Region', 'DesiredCount', 'AvailabilityZone', 'Score', 'InstanceTypeSPS', 'RegionCodeSPS']]
187-
188177
s3_dir_name = time_utc.strftime("%Y/%m/%d")
189178
s3_obj_name = time_utc.strftime("%H-%M-%S")
190179
S3.upload_file(dataframe, f"sps-collector/azure/result/rawdata/{s3_dir_name}/{s3_obj_name}.csv.gz", "df_to_csv.gz", set_public_read=True)

0 commit comments

Comments (0)