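Add worker metrics: --metric_port flag, metrics server start-up, and per-stage duration / input-size / request-count instrumentation in runners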

yihuiwen · yihuiwen · commit e96fc15b3425 · 2025-09-29T19:17:54.000+08:00
diff --git a/lightx2v/deploy/worker/__main__.py b/lightx2v/deploy/worker/__main__.py
@@ -15,6 +15,7 @@
 from lightx2v.deploy.data_manager import LocalDataManager, S3DataManager
 from lightx2v.deploy.task_manager import TaskStatus
 from lightx2v.deploy.worker.hub import DiTWorker, ImageEncoderWorker, PipelineWorker, SegmentDiTWorker, TextEncoderWorker, VaeDecoderWorker, VaeEncoderWorker
+from lightx2v.server.metrics import metrics
 
 RUNNER_MAP = {
     "pipeline": PipelineWorker,
@@ -205,6 +206,8 @@ async def main(args):
         args.task_name = args.task
     worker_keys = [args.task_name, args.model_name, args.stage, args.worker]
 
+    metrics.server_process(args.metric_port)
+
     data_manager = None
     if args.data_url.startswith("/"):
         data_manager = LocalDataManager(args.data_url, None)
@@ -312,6 +315,8 @@ def force_exit():
     parser.add_argument("--timeout", type=int, default=300)
     parser.add_argument("--ping_interval", type=int, default=10)
 
+    parser.add_argument("--metric_port", type=int, default=8001)
+
     parser.add_argument("--model_path", type=str, required=True)
     parser.add_argument("--config_json", type=str, required=True)
 
diff --git a/lightx2v/models/runners/cogvideox/cogvidex_runner.py b/lightx2v/models/runners/cogvideox/cogvidex_runner.py
@@ -7,6 +7,8 @@
 from lightx2v.models.schedulers.cogvideox.scheduler import CogvideoxXDPMScheduler
 from lightx2v.models.video_encoders.hf.cogvideox.model import CogvideoxVAE
 from lightx2v.utils.registry_factory import RUNNER_REGISTER
+from lightx2v.server.metrics import monitor_cli
+from lightx2v.utils.metrics_profiler import MetricsProfilingContext
 
 
 @RUNNER_REGISTER("cogvideox")
@@ -33,7 +35,9 @@ def load_vae(self):
     def init_scheduler(self):
         self.scheduler = CogvideoxXDPMScheduler(self.config)
 
+    @MetricsProfilingContext(monitor_cli.lightx2v_run_text_encode_duration, labels=["CogvideoxRunner"])
     def run_text_encoder(self, text, img):
+        monitor_cli.lightx2v_input_prompt_len.observe(len(text))
         text_encoder_output = {}
         n_prompt = self.config.get("negative_prompt", "")
         context = self.text_encoders[0].infer([text], self.config)
diff --git a/lightx2v/models/runners/default_runner.py b/lightx2v/models/runners/default_runner.py
@@ -8,10 +8,12 @@
 from loguru import logger
 from requests.exceptions import RequestException
 
+from lightx2v.server.metrics import monitor_cli
 from lightx2v.utils.envs import *
 from lightx2v.utils.generate_task_id import generate_task_id
 from lightx2v.utils.memory_profiler import peak_memory_decorator
 from lightx2v.utils.profiler import *
+from lightx2v.utils.metrics_profiler import MetricsProfilingContext
 from lightx2v.utils.utils import save_to_video, vae_to_comfyui_image
 
 from .base_runner import BaseRunner
@@ -161,6 +163,8 @@ def read_image_input(self, img_path):
             img_ori = img_path
         else:
             img_ori = Image.open(img_path).convert("RGB")
+        width, height = img_ori.size
+        monitor_cli.lightx2v_input_image_len.observe(width*height)
         img = TF.to_tensor(img_ori).sub_(0.5).div_(0.5).unsqueeze(0).cuda()
         return img, img_ori
 
@@ -243,18 +247,21 @@ def run_main(self, total_steps=None):
         for segment_idx in range(self.video_segment_num):
             logger.info(f"🔄 start segment {segment_idx + 1}/{self.video_segment_num}")
             with ProfilingContext4DebugL1(f"segment end2end {segment_idx + 1}/{self.video_segment_num}"):
-                self.check_stop()
-                # 1. default do nothing
-                self.init_run_segment(segment_idx)
-                # 2. main inference loop
-                latents = self.run_segment(total_steps=total_steps)
-                # 3. vae decoder
-                self.gen_video = self.run_vae_decoder(latents)
-                # 4. default do nothing
-                self.end_run_segment(segment_idx)
+                with MetricsProfilingContext(monitor_cli.lightx2v_run_pre_step_dit_duration, labels=[segment_idx+1,
+                                                                                                     self.video_segment_num]):
+                    self.check_stop()
+                    # 1. default do nothing
+                    self.init_run_segment(segment_idx)
+                    # 2. main inference loop
+                    latents = self.run_segment(total_steps=total_steps)
+                    # 3. vae decoder
+                    self.gen_video = self.run_vae_decoder(latents)
+                    # 4. default do nothing
+                    self.end_run_segment(segment_idx)
         self.end_run()
 
     @ProfilingContext4DebugL1("Run VAE Decoder")
+    @MetricsProfilingContext(monitor_cli.lightx2v_run_vae_decode_duration, labels=["DefaultRunner"])
     def run_vae_decoder(self, latents):
         if self.config.get("lazy_load", False) or self.config.get("unload_modules", False):
             self.vae_decoder = self.load_vae_decoder()
@@ -309,7 +316,9 @@ def process_images_after_vae_decoder(self, save_video=True):
             return {"video": self.gen_video}
         return {"video": None}
 
+    @MetricsProfilingContext(monitor_cli.lightx2v_worker_request_duration, labels=["DefaultRunner"])
     def run_pipeline(self, save_video=True):
+        monitor_cli.lightx2v_worker_request_count.inc()
         if self.config["use_prompt_enhancer"]:
             self.config["prompt_enhanced"] = self.post_prompt_enhancer()
 
@@ -321,4 +330,5 @@ def run_pipeline(self, save_video=True):
         torch.cuda.empty_cache()
         gc.collect()
 
+        monitor_cli.lightx2v_worker_request_success.inc()
         return gen_video
diff --git a/lightx2v/models/runners/hunyuan/hunyuan_runner.py b/lightx2v/models/runners/hunyuan/hunyuan_runner.py
@@ -15,6 +15,8 @@
 from lightx2v.utils.envs import *
 from lightx2v.utils.registry_factory import RUNNER_REGISTER
 from lightx2v.utils.utils import save_videos_grid
+from lightx2v.utils.metrics_profiler import MetricsProfilingContext
+from lightx2v.server.metrics import monitor_cli
 
 
 @RUNNER_REGISTER("hunyuan")
@@ -56,7 +58,9 @@ def init_scheduler(self):
             raise NotImplementedError(f"Unsupported feature_caching type: {self.config.feature_caching}")
         self.model.set_scheduler(scheduler)
 
+    @MetricsProfilingContext(monitor_cli.lightx2v_run_text_encode_duration, labels=["HunyuanRunner"])
     def run_text_encoder(self, text, img):
+        monitor_cli.lightx2v_input_prompt_len.observe(len(text))
         text_encoder_output = {}
         for i, encoder in enumerate(self.text_encoders):
             if self.config.task == "i2v" and i == 0:
@@ -101,6 +105,7 @@ def generate_crop_size_list(base_size=256, patch_size=32, max_ratio=4.0):
     def run_image_encoder(self, img):
         return None
 
+    @MetricsProfilingContext(monitor_cli.lightx2v_run_vae_encode_duration, labels=["HunyuanRunner"])
     def run_vae_encoder(self, img):
         kwargs = {}
         if self.config.i2v_resolution == "720p":
diff --git a/lightx2v/models/runners/qwen_image/qwen_image_runner.py b/lightx2v/models/runners/qwen_image/qwen_image_runner.py
@@ -11,7 +11,9 @@
 from lightx2v.models.schedulers.qwen_image.scheduler import QwenImageScheduler
 from lightx2v.models.video_encoders.hf.qwen_image.vae import AutoencoderKLQwenImageVAE
 from lightx2v.utils.profiler import *
+from lightx2v.utils.metrics_profiler import MetricsProfilingContext
 from lightx2v.utils.registry_factory import RUNNER_REGISTER
+from lightx2v.server.metrics import monitor_cli
 
 
 def calculate_dimensions(target_area, ratio):
@@ -106,7 +108,9 @@ def _run_input_encoder_local_i2i(self):
             "image_encoder_output": image_encoder_output,
         }
 
+    @MetricsProfilingContext(monitor_cli.lightx2v_run_text_encode_duration, labels=["QwenImageRunner"])
     def run_text_encoder(self, text, image=None):
+        monitor_cli.lightx2v_input_prompt_len.observe(len(text))
         text_encoder_output = {}
         if self.config["task"] == "t2i":
             prompt_embeds, prompt_embeds_mask, _, _ = self.text_encoders[0].infer([text])
@@ -120,6 +124,7 @@ def run_text_encoder(self, text, image=None):
             text_encoder_output["image_info"] = image_info
         return text_encoder_output
 
+    @MetricsProfilingContext(monitor_cli.lightx2v_run_vae_encode_duration, labels=["QwenImageRunner"])
     def run_vae_encoder(self, image):
         image_latents = self.vae.encode_vae_image(image)
         return {"image_latents": image_latents}
diff --git a/lightx2v/models/runners/wan/wan_animate_runner.py b/lightx2v/models/runners/wan/wan_animate_runner.py
@@ -13,8 +13,10 @@
 from lightx2v.models.runners.wan.wan_runner import WanRunner
 from lightx2v.utils.envs import *
 from lightx2v.utils.profiler import *
+from lightx2v.utils.metrics_profiler import MetricsProfilingContext
 from lightx2v.utils.registry_factory import RUNNER_REGISTER
 from lightx2v.utils.utils import load_weights, remove_substrings_from_keys
+from lightx2v.server.metrics import monitor_cli
 
 
 @RUNNER_REGISTER("wan2.2_animate")
@@ -143,6 +145,7 @@ def run_image_encoders(
         )
         return {"image_encoder_output": {"clip_encoder_out": clip_encoder_out, "vae_encoder_out": vae_encoder_out, "pose_latents": pose_latents, "face_pixel_values": face_pixel_values}}
 
+    @MetricsProfilingContext(monitor_cli.lightx2v_run_vae_encode_duration, labels=["WanAnimateRunner"])
     def run_vae_encoder(
         self,
         conditioning_pixel_values,
@@ -259,6 +262,7 @@ def init_run(self):
         super().init_run()
 
     @ProfilingContext4DebugL1("Run VAE Decoder")
+    @MetricsProfilingContext(monitor_cli.lightx2v_run_vae_decode_duration, labels=["WanAnimateRunner"])
     def run_vae_decoder(self, latents):
         if self.config.get("lazy_load", False) or self.config.get("unload_modules", False):
             self.vae_decoder = self.load_vae_decoder()
@@ -347,6 +351,7 @@ def set_target_shape(self):
         self.config.lat_t = self.config.target_video_length // 4 + 1
         self.config.target_shape = [16, self.config.lat_t + 1, self.config.lat_h, self.config.lat_w]
 
+    @MetricsProfilingContext(monitor_cli.lightx2v_run_img_encode_duration, labels=["WanAnimateRunner"])
     def run_image_encoder(self, img):  # CHW
         if self.config.get("lazy_load", False) or self.config.get("unload_modules", False):
             self.image_encoder = self.load_image_encoder()
diff --git a/lightx2v/models/runners/wan/wan_audio_runner.py b/lightx2v/models/runners/wan/wan_audio_runner.py
@@ -26,8 +26,10 @@
 from lightx2v.models.runners.wan.wan_runner import WanRunner
 from lightx2v.models.schedulers.wan.audio.scheduler import EulerScheduler
 from lightx2v.models.video_encoders.hf.wan.vae_2_2 import Wan2_2_VAE
+from lightx2v.server.metrics import monitor_cli
 from lightx2v.utils.envs import *
 from lightx2v.utils.profiler import *
+from lightx2v.utils.metrics_profiler import MetricsProfilingContext
 from lightx2v.utils.registry_factory import RUNNER_REGISTER
 from lightx2v.utils.utils import find_torch_model_path, load_weights, vae_to_comfyui_image_inplace
 
@@ -359,6 +361,8 @@ def read_audio_input(self):
 
         video_duration = self.config.get("video_duration", 5)
         audio_len = int(audio_array.shape[1] / audio_sr * target_fps)
+        monitor_cli.lightx2v_input_audio_len.observe(audio_len)
+
         expected_frames = min(max(1, int(video_duration * target_fps)), audio_len)
 
         # Segment audio
@@ -472,6 +476,7 @@ def read_image_input(self, img_path):
         ref_img = torch.nn.functional.interpolate(ref_img, size=(self.config.tgt_h, self.config.tgt_w), mode="bicubic")
         return ref_img
 
+    @MetricsProfilingContext(monitor_cli.lightx2v_run_img_encode_duration, labels=["WanAudioRunner"])
     def run_image_encoder(self, first_frame, last_frame=None):
         if self.config.get("lazy_load", False) or self.config.get("unload_modules", False):
             self.image_encoder = self.load_image_encoder()
@@ -482,6 +487,7 @@ def run_image_encoder(self, first_frame, last_frame=None):
             gc.collect()
         return clip_encoder_out
 
+    @MetricsProfilingContext(monitor_cli.lightx2v_run_vae_encode_duration, labels=["WanAudioRunner"])
     def run_vae_encoder(self, img):
         if self.config.get("lazy_load", False) or self.config.get("unload_modules", False):
             self.vae_encoder = self.load_vae_encoder()
diff --git a/lightx2v/models/runners/wan/wan_runner.py b/lightx2v/models/runners/wan/wan_runner.py
@@ -26,9 +26,11 @@
 from lightx2v.models.video_encoders.hf.wan.vae_tiny import Wan2_2_VAE_tiny, WanVAE_tiny
 from lightx2v.utils.envs import *
 from lightx2v.utils.profiler import *
+from lightx2v.utils.metrics_profiler import MetricsProfilingContext
 from lightx2v.utils.registry_factory import RUNNER_REGISTER
 from lightx2v.utils.utils import *
 from lightx2v.utils.utils import best_output_size, cache_video
+from lightx2v.server.metrics import monitor_cli
 
 
 @RUNNER_REGISTER("wan2.1")
@@ -206,7 +208,9 @@ def init_scheduler(self):
         else:
             self.scheduler = scheduler_class(self.config)
 
+    @MetricsProfilingContext(monitor_cli.lightx2v_run_text_encode_duration, labels=["WanRunner"])
     def run_text_encoder(self, text, img=None):
+        monitor_cli.lightx2v_input_prompt_len.observe(len(text))
         if self.config.get("lazy_load", False) or self.config.get("unload_modules", False):
             self.text_encoders = self.load_text_encoder()
         n_prompt = self.config.get("negative_prompt", "")
@@ -239,6 +243,7 @@ def run_text_encoder(self, text, img=None):
 
         return text_encoder_output
 
+    @MetricsProfilingContext(monitor_cli.lightx2v_run_img_encode_duration, labels=["WanRunner"])
     def run_image_encoder(self, first_frame, last_frame=None):
         if self.config.get("lazy_load", False) or self.config.get("unload_modules", False):
             self.image_encoder = self.load_image_encoder()
@@ -252,6 +257,7 @@ def run_image_encoder(self, first_frame, last_frame=None):
             gc.collect()
         return clip_encoder_out
 
+    @MetricsProfilingContext(monitor_cli.lightx2v_run_vae_encode_duration, labels=["WanRunner"])
     def run_vae_encoder(self, first_frame, last_frame=None):
         h, w = first_frame.shape[2:]
         aspect_ratio = h / w
@@ -477,6 +483,7 @@ def __init__(self, config):
         self.vae_name = "Wan2.2_VAE.pth"
         self.tiny_vae_name = "taew2_2.pth"
 
+    @MetricsProfilingContext(monitor_cli.lightx2v_run_vae_encode_duration, labels=["Wan22DenseRunner"])
     def run_vae_encoder(self, img):
         max_area = self.config.target_height * self.config.target_width
         ih, iw = img.height, img.width
diff --git a/lightx2v/models/runners/wan/wan_skyreels_v2_df_runner.py b/lightx2v/models/runners/wan/wan_skyreels_v2_df_runner.py
@@ -11,7 +11,9 @@
 from lightx2v.models.schedulers.wan.df.skyreels_v2_df_scheduler import WanSkyreelsV2DFScheduler
 from lightx2v.utils.envs import *
 from lightx2v.utils.profiler import *
+from lightx2v.utils.metrics_profiler import MetricsProfilingContext
 from lightx2v.utils.registry_factory import RUNNER_REGISTER
+from lightx2v.server.metrics import monitor_cli
 
 
 @RUNNER_REGISTER("wan2.1_skyreels_v2_df")
@@ -22,6 +24,7 @@ def __init__(self, config):
     def init_scheduler(self):
         self.scheduler = WanSkyreelsV2DFScheduler(self.config)
 
+    @MetricsProfilingContext(monitor_cli.lightx2v_run_img_encode_duration, labels=["WanSkyreelsV2DFRunner"])
     def run_image_encoder(self, config, image_encoder, vae_model):
         img = Image.open(config.image_path).convert("RGB")
         img = TF.to_tensor(img).sub_(0.5).div_(0.5).cuda()
diff --git a/lightx2v/models/runners/wan/wan_vace_runner.py b/lightx2v/models/runners/wan/wan_vace_runner.py
@@ -10,7 +10,9 @@
 from lightx2v.models.runners.wan.wan_runner import WanRunner
 from lightx2v.utils.envs import *
 from lightx2v.utils.profiler import *
+from lightx2v.utils.metrics_profiler import MetricsProfilingContext
 from lightx2v.utils.registry_factory import RUNNER_REGISTER
+from lightx2v.server.metrics import monitor_cli
 
 
 @RUNNER_REGISTER("wan2.1_vace")
@@ -88,6 +90,7 @@ def prepare_source(self, src_video, src_mask, src_ref_images, image_size, device
                         src_ref_images[i][j] = ref_img.to(device)
         return src_video, src_mask, src_ref_images
 
+    @MetricsProfilingContext(monitor_cli.lightx2v_run_vae_encode_duration, labels=["WanVaceRunner"])
     def run_vae_encoder(self, frames, ref_images, masks):
         if self.config.get("lazy_load", False) or self.config.get("unload_modules", False):
             self.vae_encoder = self.load_vae_encoder()
@@ -160,6 +163,7 @@ def set_target_shape(self):
         self.config.target_shape = target_shape
 
     @ProfilingContext4DebugL1("Run VAE Decoder")
+    @MetricsProfilingContext(monitor_cli.lightx2v_run_vae_decode_duration, labels=["WanVaceRunner"])
     def run_vae_decoder(self, latents):
         if self.config.get("lazy_load", False) or self.config.get("unload_modules", False):
             self.vae_decoder = self.load_vae_decoder()
diff --git a/lightx2v/server/metrics/__init__.py b/lightx2v/server/metrics/__init__.py
@@ -0,0 +1,7 @@
+# -*-coding=utf-8-*-
+
+from .metrics import server_process
+
+from .monitor import Monitor
+
+monitor_cli = Monitor()
diff --git a/lightx2v/server/metrics/metrics.py b/lightx2v/server/metrics/metrics.py
diff --git a/lightx2v/server/metrics/monitor.py b/lightx2v/server/metrics/monitor.py
diff --git a/lightx2v/utils/metrics_profiler.py b/lightx2v/utils/metrics_profiler.py