Skip to content

Commit e96fc15

Browse files
author
yihuiwen
committed
add worker metrics
1 parent 131c8a4 commit e96fc15

File tree

14 files changed

+251
-9
lines changed

14 files changed

+251
-9
lines changed

lightx2v/deploy/worker/__main__.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
from lightx2v.deploy.data_manager import LocalDataManager, S3DataManager
1616
from lightx2v.deploy.task_manager import TaskStatus
1717
from lightx2v.deploy.worker.hub import DiTWorker, ImageEncoderWorker, PipelineWorker, SegmentDiTWorker, TextEncoderWorker, VaeDecoderWorker, VaeEncoderWorker
18+
from lightx2v.server.metrics import metrics
1819

1920
RUNNER_MAP = {
2021
"pipeline": PipelineWorker,
@@ -205,6 +206,8 @@ async def main(args):
205206
args.task_name = args.task
206207
worker_keys = [args.task_name, args.model_name, args.stage, args.worker]
207208

209+
metrics.server_process(args.metric_port)
210+
208211
data_manager = None
209212
if args.data_url.startswith("/"):
210213
data_manager = LocalDataManager(args.data_url, None)
@@ -312,6 +315,8 @@ def force_exit():
312315
parser.add_argument("--timeout", type=int, default=300)
313316
parser.add_argument("--ping_interval", type=int, default=10)
314317

318+
parser.add_argument("--metric_port", type=int, default=8001)
319+
315320
parser.add_argument("--model_path", type=str, required=True)
316321
parser.add_argument("--config_json", type=str, required=True)
317322

lightx2v/models/runners/cogvideox/cogvidex_runner.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,8 @@
77
from lightx2v.models.schedulers.cogvideox.scheduler import CogvideoxXDPMScheduler
88
from lightx2v.models.video_encoders.hf.cogvideox.model import CogvideoxVAE
99
from lightx2v.utils.registry_factory import RUNNER_REGISTER
10+
from lightx2v.server.metrics import monitor_cli
11+
from lightx2v.utils.metrics_profiler import MetricsProfilingContext
1012

1113

1214
@RUNNER_REGISTER("cogvideox")
@@ -33,7 +35,9 @@ def load_vae(self):
3335
def init_scheduler(self):
3436
self.scheduler = CogvideoxXDPMScheduler(self.config)
3537

38+
@MetricsProfilingContext(monitor_cli.lightx2v_run_text_encode_duration, labels=["CogvideoxRunner"])
3639
def run_text_encoder(self, text, img):
40+
monitor_cli.lightx2v_input_prompt_len.observe(len(text))
3741
text_encoder_output = {}
3842
n_prompt = self.config.get("negative_prompt", "")
3943
context = self.text_encoders[0].infer([text], self.config)

lightx2v/models/runners/default_runner.py

Lines changed: 19 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -8,10 +8,12 @@
88
from loguru import logger
99
from requests.exceptions import RequestException
1010

11+
from lightx2v.server.metrics import monitor_cli
1112
from lightx2v.utils.envs import *
1213
from lightx2v.utils.generate_task_id import generate_task_id
1314
from lightx2v.utils.memory_profiler import peak_memory_decorator
1415
from lightx2v.utils.profiler import *
16+
from lightx2v.utils.metrics_profiler import MetricsProfilingContext
1517
from lightx2v.utils.utils import save_to_video, vae_to_comfyui_image
1618

1719
from .base_runner import BaseRunner
@@ -161,6 +163,8 @@ def read_image_input(self, img_path):
161163
img_ori = img_path
162164
else:
163165
img_ori = Image.open(img_path).convert("RGB")
166+
width, height = img_ori.size
167+
monitor_cli.lightx2v_input_image_len.observe(width*height)
164168
img = TF.to_tensor(img_ori).sub_(0.5).div_(0.5).unsqueeze(0).cuda()
165169
return img, img_ori
166170

@@ -243,18 +247,21 @@ def run_main(self, total_steps=None):
243247
for segment_idx in range(self.video_segment_num):
244248
logger.info(f"🔄 start segment {segment_idx + 1}/{self.video_segment_num}")
245249
with ProfilingContext4DebugL1(f"segment end2end {segment_idx + 1}/{self.video_segment_num}"):
246-
self.check_stop()
247-
# 1. default do nothing
248-
self.init_run_segment(segment_idx)
249-
# 2. main inference loop
250-
latents = self.run_segment(total_steps=total_steps)
251-
# 3. vae decoder
252-
self.gen_video = self.run_vae_decoder(latents)
253-
# 4. default do nothing
254-
self.end_run_segment(segment_idx)
250+
with MetricsProfilingContext(monitor_cli.lightx2v_run_pre_step_dit_duration, labels=[segment_idx+1,
251+
self.video_segment_num]):
252+
self.check_stop()
253+
# 1. default do nothing
254+
self.init_run_segment(segment_idx)
255+
# 2. main inference loop
256+
latents = self.run_segment(total_steps=total_steps)
257+
# 3. vae decoder
258+
self.gen_video = self.run_vae_decoder(latents)
259+
# 4. default do nothing
260+
self.end_run_segment(segment_idx)
255261
self.end_run()
256262

257263
@ProfilingContext4DebugL1("Run VAE Decoder")
264+
@MetricsProfilingContext(monitor_cli.lightx2v_run_vae_decode_duration, labels=["DefaultRunner"])
258265
def run_vae_decoder(self, latents):
259266
if self.config.get("lazy_load", False) or self.config.get("unload_modules", False):
260267
self.vae_decoder = self.load_vae_decoder()
@@ -309,7 +316,9 @@ def process_images_after_vae_decoder(self, save_video=True):
309316
return {"video": self.gen_video}
310317
return {"video": None}
311318

319+
@MetricsProfilingContext(monitor_cli.lightx2v_worker_request_duration, labels=["DefaultRunner"])
312320
def run_pipeline(self, save_video=True):
321+
monitor_cli.lightx2v_worker_request_count.inc()
313322
if self.config["use_prompt_enhancer"]:
314323
self.config["prompt_enhanced"] = self.post_prompt_enhancer()
315324

@@ -321,4 +330,5 @@ def run_pipeline(self, save_video=True):
321330
torch.cuda.empty_cache()
322331
gc.collect()
323332

333+
monitor_cli.lightx2v_worker_request_success.inc()
324334
return gen_video

lightx2v/models/runners/hunyuan/hunyuan_runner.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,8 @@
1515
from lightx2v.utils.envs import *
1616
from lightx2v.utils.registry_factory import RUNNER_REGISTER
1717
from lightx2v.utils.utils import save_videos_grid
18+
from lightx2v.utils.metrics_profiler import MetricsProfilingContext
19+
from lightx2v.server.metrics import monitor_cli
1820

1921

2022
@RUNNER_REGISTER("hunyuan")
@@ -56,7 +58,9 @@ def init_scheduler(self):
5658
raise NotImplementedError(f"Unsupported feature_caching type: {self.config.feature_caching}")
5759
self.model.set_scheduler(scheduler)
5860

61+
@MetricsProfilingContext(monitor_cli.lightx2v_run_text_encode_duration, labels=["HunyuanRunner"])
5962
def run_text_encoder(self, text, img):
63+
monitor_cli.lightx2v_input_prompt_len.observe(len(text))
6064
text_encoder_output = {}
6165
for i, encoder in enumerate(self.text_encoders):
6266
if self.config.task == "i2v" and i == 0:
@@ -101,6 +105,7 @@ def generate_crop_size_list(base_size=256, patch_size=32, max_ratio=4.0):
101105
def run_image_encoder(self, img):
102106
return None
103107

108+
@MetricsProfilingContext(monitor_cli.lightx2v_run_vae_encode_duration, labels=["HunyuanRunner"])
104109
def run_vae_encoder(self, img):
105110
kwargs = {}
106111
if self.config.i2v_resolution == "720p":

lightx2v/models/runners/qwen_image/qwen_image_runner.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,9 @@
1111
from lightx2v.models.schedulers.qwen_image.scheduler import QwenImageScheduler
1212
from lightx2v.models.video_encoders.hf.qwen_image.vae import AutoencoderKLQwenImageVAE
1313
from lightx2v.utils.profiler import *
14+
from lightx2v.utils.metrics_profiler import MetricsProfilingContext
1415
from lightx2v.utils.registry_factory import RUNNER_REGISTER
16+
from lightx2v.server.metrics import monitor_cli
1517

1618

1719
def calculate_dimensions(target_area, ratio):
@@ -106,7 +108,9 @@ def _run_input_encoder_local_i2i(self):
106108
"image_encoder_output": image_encoder_output,
107109
}
108110

111+
@MetricsProfilingContext(monitor_cli.lightx2v_run_text_encode_duration, labels=["QwenImageRunner"])
109112
def run_text_encoder(self, text, image=None):
113+
monitor_cli.lightx2v_input_prompt_len.observe(len(text))
110114
text_encoder_output = {}
111115
if self.config["task"] == "t2i":
112116
prompt_embeds, prompt_embeds_mask, _, _ = self.text_encoders[0].infer([text])
@@ -120,6 +124,7 @@ def run_text_encoder(self, text, image=None):
120124
text_encoder_output["image_info"] = image_info
121125
return text_encoder_output
122126

127+
@MetricsProfilingContext(monitor_cli.lightx2v_run_vae_encode_duration, labels=["QwenImageRunner"])
123128
def run_vae_encoder(self, image):
124129
image_latents = self.vae.encode_vae_image(image)
125130
return {"image_latents": image_latents}

lightx2v/models/runners/wan/wan_animate_runner.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,8 +13,10 @@
1313
from lightx2v.models.runners.wan.wan_runner import WanRunner
1414
from lightx2v.utils.envs import *
1515
from lightx2v.utils.profiler import *
16+
from lightx2v.utils.metrics_profiler import MetricsProfilingContext
1617
from lightx2v.utils.registry_factory import RUNNER_REGISTER
1718
from lightx2v.utils.utils import load_weights, remove_substrings_from_keys
19+
from lightx2v.server.metrics import monitor_cli
1820

1921

2022
@RUNNER_REGISTER("wan2.2_animate")
@@ -143,6 +145,7 @@ def run_image_encoders(
143145
)
144146
return {"image_encoder_output": {"clip_encoder_out": clip_encoder_out, "vae_encoder_out": vae_encoder_out, "pose_latents": pose_latents, "face_pixel_values": face_pixel_values}}
145147

148+
@MetricsProfilingContext(monitor_cli.lightx2v_run_vae_encode_duration, labels=["WanAnimateRunner"])
146149
def run_vae_encoder(
147150
self,
148151
conditioning_pixel_values,
@@ -259,6 +262,7 @@ def init_run(self):
259262
super().init_run()
260263

261264
@ProfilingContext4DebugL1("Run VAE Decoder")
265+
@MetricsProfilingContext(monitor_cli.lightx2v_run_vae_decode_duration, labels=["WanAnimateRunner"])
262266
def run_vae_decoder(self, latents):
263267
if self.config.get("lazy_load", False) or self.config.get("unload_modules", False):
264268
self.vae_decoder = self.load_vae_decoder()
@@ -347,6 +351,7 @@ def set_target_shape(self):
347351
self.config.lat_t = self.config.target_video_length // 4 + 1
348352
self.config.target_shape = [16, self.config.lat_t + 1, self.config.lat_h, self.config.lat_w]
349353

354+
@MetricsProfilingContext(monitor_cli.lightx2v_run_img_encode_duration, labels=["WanAnimateRunner"])
350355
def run_image_encoder(self, img): # CHW
351356
if self.config.get("lazy_load", False) or self.config.get("unload_modules", False):
352357
self.image_encoder = self.load_image_encoder()

lightx2v/models/runners/wan/wan_audio_runner.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,8 +26,10 @@
2626
from lightx2v.models.runners.wan.wan_runner import WanRunner
2727
from lightx2v.models.schedulers.wan.audio.scheduler import EulerScheduler
2828
from lightx2v.models.video_encoders.hf.wan.vae_2_2 import Wan2_2_VAE
29+
from lightx2v.server.metrics import monitor_cli
2930
from lightx2v.utils.envs import *
3031
from lightx2v.utils.profiler import *
32+
from lightx2v.utils.metrics_profiler import MetricsProfilingContext
3133
from lightx2v.utils.registry_factory import RUNNER_REGISTER
3234
from lightx2v.utils.utils import find_torch_model_path, load_weights, vae_to_comfyui_image_inplace
3335

@@ -359,6 +361,8 @@ def read_audio_input(self):
359361

360362
video_duration = self.config.get("video_duration", 5)
361363
audio_len = int(audio_array.shape[1] / audio_sr * target_fps)
364+
monitor_cli.lightx2v_input_audio_len.observe(audio_len)
365+
362366
expected_frames = min(max(1, int(video_duration * target_fps)), audio_len)
363367

364368
# Segment audio
@@ -472,6 +476,7 @@ def read_image_input(self, img_path):
472476
ref_img = torch.nn.functional.interpolate(ref_img, size=(self.config.tgt_h, self.config.tgt_w), mode="bicubic")
473477
return ref_img
474478

479+
@MetricsProfilingContext(monitor_cli.lightx2v_run_img_encode_duration, labels=["WanAudioRunner"])
475480
def run_image_encoder(self, first_frame, last_frame=None):
476481
if self.config.get("lazy_load", False) or self.config.get("unload_modules", False):
477482
self.image_encoder = self.load_image_encoder()
@@ -482,6 +487,7 @@ def run_image_encoder(self, first_frame, last_frame=None):
482487
gc.collect()
483488
return clip_encoder_out
484489

490+
@MetricsProfilingContext(monitor_cli.lightx2v_run_vae_encode_duration, labels=["WanAudioRunner"])
485491
def run_vae_encoder(self, img):
486492
if self.config.get("lazy_load", False) or self.config.get("unload_modules", False):
487493
self.vae_encoder = self.load_vae_encoder()

lightx2v/models/runners/wan/wan_runner.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,9 +26,11 @@
2626
from lightx2v.models.video_encoders.hf.wan.vae_tiny import Wan2_2_VAE_tiny, WanVAE_tiny
2727
from lightx2v.utils.envs import *
2828
from lightx2v.utils.profiler import *
29+
from lightx2v.utils.metrics_profiler import MetricsProfilingContext
2930
from lightx2v.utils.registry_factory import RUNNER_REGISTER
3031
from lightx2v.utils.utils import *
3132
from lightx2v.utils.utils import best_output_size, cache_video
33+
from lightx2v.server.metrics import monitor_cli
3234

3335

3436
@RUNNER_REGISTER("wan2.1")
@@ -206,7 +208,9 @@ def init_scheduler(self):
206208
else:
207209
self.scheduler = scheduler_class(self.config)
208210

211+
@MetricsProfilingContext(monitor_cli.lightx2v_run_text_encode_duration, labels=["WanRunner"])
209212
def run_text_encoder(self, text, img=None):
213+
monitor_cli.lightx2v_input_prompt_len.observe(len(text))
210214
if self.config.get("lazy_load", False) or self.config.get("unload_modules", False):
211215
self.text_encoders = self.load_text_encoder()
212216
n_prompt = self.config.get("negative_prompt", "")
@@ -239,6 +243,7 @@ def run_text_encoder(self, text, img=None):
239243

240244
return text_encoder_output
241245

246+
@MetricsProfilingContext(monitor_cli.lightx2v_run_img_encode_duration, labels=["WanRunner"])
242247
def run_image_encoder(self, first_frame, last_frame=None):
243248
if self.config.get("lazy_load", False) or self.config.get("unload_modules", False):
244249
self.image_encoder = self.load_image_encoder()
@@ -252,6 +257,7 @@ def run_image_encoder(self, first_frame, last_frame=None):
252257
gc.collect()
253258
return clip_encoder_out
254259

260+
@MetricsProfilingContext(monitor_cli.lightx2v_run_vae_encode_duration, labels=["WanRunner"])
255261
def run_vae_encoder(self, first_frame, last_frame=None):
256262
h, w = first_frame.shape[2:]
257263
aspect_ratio = h / w
@@ -477,6 +483,7 @@ def __init__(self, config):
477483
self.vae_name = "Wan2.2_VAE.pth"
478484
self.tiny_vae_name = "taew2_2.pth"
479485

486+
@MetricsProfilingContext(monitor_cli.lightx2v_run_vae_encode_duration, labels=["Wan22DenseRunner"])
480487
def run_vae_encoder(self, img):
481488
max_area = self.config.target_height * self.config.target_width
482489
ih, iw = img.height, img.width

lightx2v/models/runners/wan/wan_skyreels_v2_df_runner.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,9 @@
1111
from lightx2v.models.schedulers.wan.df.skyreels_v2_df_scheduler import WanSkyreelsV2DFScheduler
1212
from lightx2v.utils.envs import *
1313
from lightx2v.utils.profiler import *
14+
from lightx2v.utils.metrics_profiler import MetricsProfilingContext
1415
from lightx2v.utils.registry_factory import RUNNER_REGISTER
16+
from lightx2v.server.metrics import monitor_cli
1517

1618

1719
@RUNNER_REGISTER("wan2.1_skyreels_v2_df")
@@ -22,6 +24,7 @@ def __init__(self, config):
2224
def init_scheduler(self):
2325
self.scheduler = WanSkyreelsV2DFScheduler(self.config)
2426

27+
@MetricsProfilingContext(monitor_cli.lightx2v_run_img_encode_duration, labels=["WanSkyreelsV2DFRunner"])
2528
def run_image_encoder(self, config, image_encoder, vae_model):
2629
img = Image.open(config.image_path).convert("RGB")
2730
img = TF.to_tensor(img).sub_(0.5).div_(0.5).cuda()

lightx2v/models/runners/wan/wan_vace_runner.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,9 @@
1010
from lightx2v.models.runners.wan.wan_runner import WanRunner
1111
from lightx2v.utils.envs import *
1212
from lightx2v.utils.profiler import *
13+
from lightx2v.utils.metrics_profiler import MetricsProfilingContext
1314
from lightx2v.utils.registry_factory import RUNNER_REGISTER
15+
from lightx2v.server.metrics import monitor_cli
1416

1517

1618
@RUNNER_REGISTER("wan2.1_vace")
@@ -88,6 +90,7 @@ def prepare_source(self, src_video, src_mask, src_ref_images, image_size, device
8890
src_ref_images[i][j] = ref_img.to(device)
8991
return src_video, src_mask, src_ref_images
9092

93+
@MetricsProfilingContext(monitor_cli.lightx2v_run_vae_encode_duration, labels=["WanVaceRunner"])
9194
def run_vae_encoder(self, frames, ref_images, masks):
9295
if self.config.get("lazy_load", False) or self.config.get("unload_modules", False):
9396
self.vae_encoder = self.load_vae_encoder()
@@ -160,6 +163,7 @@ def set_target_shape(self):
160163
self.config.target_shape = target_shape
161164

162165
@ProfilingContext4DebugL1("Run VAE Decoder")
166+
@MetricsProfilingContext(monitor_cli.lightx2v_run_vae_decode_duration, labels=["WanVaceRunner"])
163167
def run_vae_decoder(self, latents):
164168
if self.config.get("lazy_load", False) or self.config.get("unload_modules", False):
165169
self.vae_decoder = self.load_vae_decoder()

0 commit comments

Comments
 (0)