From 0683c454aebe42466d5a7c8a344930ed16751980 Mon Sep 17 00:00:00 2001
From: John Liu BUAA
Date: Thu, 23 Oct 2025 15:25:50 +0800
Subject: [PATCH] [Merge]PR-25233 From Original vLLM repo by fake0fan

---
 .../disagg_encoder/disagg_encoder_flow.png    | Bin 0 -> 85914 bytes
 docs/features/disagg_encoder.md               |  75 ++
 .../disaggregated_encoder/README.md           |  13 +
 .../disagg_1e1p1d_example.sh                  | 193 ++++
 .../disagg_1e1pd_example.sh                   | 162 +++
 .../disaggregated_encoder/disagg_epd_proxy.py | 516 ++++++++++
 tests/v1/core/test_scheduler.py               | 931 +++++++++++++++++-
 tests/v1/core/utils.py                        |  68 +-
 .../unit/test_ec_shared_storage_connector.py  | 609 ++++++++++++
 vllm/config/__init__.py                       |   3 +
 vllm/config/ec_transfer.py                    | 110 +++
 vllm/config/vllm.py                           |   7 +
 vllm/distributed/ec_transfer/__init__.py      |  14 +
 .../ec_transfer/ec_connector/__init__.py      |   0
 .../ec_transfer/ec_connector/base.py          | 247 +++++
 .../ec_transfer/ec_connector/factory.py       |  88 ++
 .../ec_connector/shared_storage_connector.py  | 201 ++++
 .../ec_transfer/ec_transfer_state.py          |  46 +
 vllm/engine/arg_utils.py                      |   7 +
 vllm/model_executor/warmup/kernel_warmup.py   |  16 +-
 vllm/v1/core/sched/output.py                  |   5 +
 vllm/v1/core/sched/scheduler.py               |  46 +-
 .../worker/ec_connector_model_runner_mixin.py |  87 ++
 23 files changed, 3406 insertions(+), 38 deletions(-)
 create mode 100644 docs/assets/features/disagg_encoder/disagg_encoder_flow.png
 create mode 100644 docs/features/disagg_encoder.md
 create mode 100644 examples/online_serving/disaggregated_encoder/README.md
 create mode 100644 examples/online_serving/disaggregated_encoder/disagg_1e1p1d_example.sh
 create mode 100644 examples/online_serving/disaggregated_encoder/disagg_1e1pd_example.sh
 create mode 100644 examples/online_serving/disaggregated_encoder/disagg_epd_proxy.py
 create mode 100644 tests/v1/ec_connector/unit/test_ec_shared_storage_connector.py
 create mode 100644 vllm/config/ec_transfer.py
 create mode 100644 vllm/distributed/ec_transfer/__init__.py
 create mode 100644 vllm/distributed/ec_transfer/ec_connector/__init__.py
 create mode 100644 vllm/distributed/ec_transfer/ec_connector/base.py
 create mode 100644 vllm/distributed/ec_transfer/ec_connector/factory.py
 create mode 100644 vllm/distributed/ec_transfer/ec_connector/shared_storage_connector.py
 create mode 100644 vllm/distributed/ec_transfer/ec_transfer_state.py
 create mode 100644 vllm/v1/worker/ec_connector_model_runner_mixin.py

diff --git a/docs/assets/features/disagg_encoder/disagg_encoder_flow.png b/docs/assets/features/disagg_encoder/disagg_encoder_flow.png
new file mode 100644
index 0000000000000000000000000000000000000000..2951468c11d9a55f98af4c26eda60cebebb169f4
GIT binary patch
literal 85914
[85914 bytes of base85-encoded PNG data omitted -- binary image docs/assets/features/disagg_encoder/disagg_encoder_flow.png]
zsFBRYLdSk0Y7<0;6Y^PgDY#oX9p}gJVA4v}mefG>Z2~pU0Q4elNLV6T!uw3`F${1Q z>NE@O*qlj@%(jVhuV6%O;q{H;_!gr>Q6%9u zp@=erJc*WvI6=iw;j~O>CKY+(%Fy$oG0!VXl|2g89?a|#T5{$Go8k9?G;hjpA#Y|_ zloH9$Cfl=iyB@bNjiovcq0q-1=>BMhk6VK9v;~tNI*UrR&#QHhxrPpi_H$UpLNe7+ z`tgm=Sh=z2X&j_=KD%wsj5u#L%5AyNmb`W)#E~gU;^yYgFAHhrj8M&yI%~eg6QbwN zP&T|m(Z3Uiir*FAnxoFVMJjO{E6xWyH?uKy@#sjcCnE6eN}1p0bnnCE4FnjM(M?QF zt!E9O)oXxoymBUyJi@E_pTP%l+bwV+FpyhASd*A)%S0X0~$K*j1Vd%sohM$@1NMDkFju5zx%Y`sNVPP zYjIND&BIKV^V=L%Kz`AO^yv@=V-H-XVo#AJQdIn;K9OX0^ciGvaXh{e(3S-?g?$K5 zN~>IJo7PO`>t8V>zpR`5ih|8MC)D*J0Hj*Bk0P6X|AnhW(tonL!FNbs>AMgn3B9)p z-8)70lj2`bujEZnMsg%so5g}+DFsaU^|bo84dY(0OP);C0m+;SqKK$>xweY~$waki z`v6=N83{C{gsjm_+~8+Rt}vxuMkZ9F3@Oy=FYhW&$g|j@h@ZY>xQ_MU0*pF`)ze{b zO1596h8wJ)&tsLVC8%QkcAU}kP%5$wd>lBe9NHIt=pAPTT}FN#3U#4?UO~$P&DcPn zMMiy%93-{ENe`#P<$z}AFGn>_9^SCs^WJBJ9f`G`Vl|%BH@beLEwHbLb7mbh1Y$tP_G2v?=B zQx@-bP^cCm@2U@iR~Rm}z>E3M{YHB%DOB(IyIP#yD03>^7(y|Xw7ZKx_RoP-2k-yM zni-k|^)+tPu4$}=O2r$QQpds^Rs6G2=7jkbf2Tg}v8Qr4y3(V*u>31@JnZymf*Qg+ zN`x`N{l>#aeeTp7*0SVk-R#FW_r^XRtP1wF&y{&{NuzV3P5f0Nh)2-$(e6Q|uMfoR zgHoJKE15_lzFOxK@o-t>+Wr)_I=kK>K;eNKXT#I8O*z$zpAzcac5uNL=s22y39!{T zS^nsN(#OXeyXMKasNv_B4dL`jebG;g_VBa`GeZY=(MfWQD?c+0SvcHIcJjIG{DNFW zpRyY)x#YapQ(ect-F!u-`u?qc^$SRT?W*IVbQM|Sk7_oIZCv{8oN)KySA)kdP<%p+ z)lwiFrIOun84#tmvTOzED#HDTWvwmzx+?ohpI@gw)NX7*61W2Hf+0XU6CLDU<(k9&h8zM!^WImaUMEK4n?CJLoe2W zpeMvZ8I!t?7vVCab`lH5Nj;SFX8`-;d}DHJyQQA(3vLg^q>u|T=c@7s(J7wTzm3*8 z53ugPAn6rB^kB`3JP_vk+TWU#<#(SVQ61;~Lp5qD3Z@d~JLPo>R`FnWpS-5(RvbAWLm)-(v{o z9%_dARKwKsWSJ^@!s0ga%+1ciq4t5L*;S09hc2g^+z|J+VrjJPYib&fe00ybq-%GO zv!QgGGC3L>Kbe=~8_iyUni;Gg9cm&q6LyrA$akGyu{|e$vBFZB?dl{)MBYt&Lqsb6 zy5XP({Di5ToU^y|0z#zo%2d4v714{dmGjsu{S#sH|CBLdi5g$tAOFJRF=X)?uNWM+HZtJ< zn}289?aBw;zk9rusI=W~_6JeiztO&=ycF54e%%JLu>B0>dP+OR^{(OyV{j)0(4Fn{ z_?q`Kz_Q$ivO@W?1%Fv~Xy+%(f-d<p{`og~Q2O*aJ5}cW_O}iKxBs;Z8#!9=-MsQX3iX=geAtocbiozmPsRSA3RX zwu_8=y + +--- + +## 1 Motivation + +### 1. Independent, fine-grained scaling + +* Vision encoders are lightweight, while language models are orders of magnitude larger. +* The language model can be parallelised without affecting the encoder fleet. +* Encoder nodes can be added or removed independently. + +### 2. Lower time-to-first-token (TTFT) + +* Language-only requests bypass the vision encoder entirely. +* Encoder output is injected only at required attention layers, shortening the pre-fill critical path. + +### 3. Cross-process reuse and caching + +* In-process encoders confine reuse to a single worker. +* A remote, shared cache lets any worker retrieve existing embeddings, eliminating redundant computation. + +--- + +## 2 Usage Example + +The current reference pathway is **SharedStorageConnector**. +Below ready-to-run scripts shows the workflow: + +1 Encoder instance + 1 PD instance: +`examples/online_serving/disaggregated_encoder/shared_storage_connector/disagg_encoder_example.sh` + +1 Encoder instance + 1 Prefill instance + 1 Decode instance: +`examples/online_serving/disaggregated_encoder/shared_storage_connector/disagg_epd_example.sh` + +--- + +## 3 Test Script + +Please refer to the directories `tests/v1/ec_connector` + +## 4 Development + +Disaggregated encoding is implemented by running two parts: + +* **Encoder instance** – a vLLM instance to performs vision encoding. +* **Prefill/Decode (PD) instance(s)** – runs language pre-fill and decode. 
+    * PD can run either as a single instance (`disagg_1e1pd_example.sh`, E->PD) or as disaggregated Prefill and Decode instances (`disagg_1e1p1d_example.sh`, E->P->D).
+
+A connector transfers encoder-cache (EC) embeddings from the encoder instance to the PD instance.
+All related code is under `vllm/distributed/ec_transfer`.
+
+### Key abstractions
+
+* **ECConnector** – interface for retrieving EC caches produced by the encoder.
+    * *Scheduler role* – checks cache existence and schedules loads.
+    * *Worker role* – loads the embeddings into memory.
+
+Here is a figure illustrating the disaggregated encoder flow:
+
+![Disaggregated Encoder Flow](../assets/features/disagg_encoder/disagg_encoder_flow.png)
+
+For the PD-disaggregation part, the Prefill instance receives the encoder cache exactly as in the disaggregated encoder flow above. The Prefill instance then executes one step (prefill -> 1 output token) and transfers the KV cache to the Decode instance for the remaining execution. The KV transfer happens entirely after the Prefill instance finishes its execute step.
+
+`docs/features/disagg_prefill.md` gives a brief overview of disaggregated prefill (v0).
+
+The example setup uses the **NixlConnector** from `vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py` and follows `tests/v1/kv_connector/nixl_integration/toy_proxy_server.py` to facilitate the KV transfer between P and D.
diff --git a/examples/online_serving/disaggregated_encoder/README.md b/examples/online_serving/disaggregated_encoder/README.md
new file mode 100644
index 000000000000..60dfae24552f
--- /dev/null
+++ b/examples/online_serving/disaggregated_encoder/README.md
@@ -0,0 +1,13 @@
+# Disaggregated Encoder
+
+This example contains scripts that demonstrate the disaggregated encoder (EPD) features of vLLM.
+
+Please refer to [Disaggregated Encoder Feature](../../../docs/features/disagg_encoder.md) for a detailed explanation of the EPD features.
+
+## Files
+
+- `disagg_epd_proxy.py` - Proxy that demonstrates XeYpZd (X encode instances, Y prefill instances, Z decode instances); currently stable for 1e1p1d.
+- `disagg_1e1p1d_example.sh` - Set up 1e1p1d and run the VisionArena benchmark.
+- `disagg_1e1pd_example.sh` - Set up 1e1pd and run the VisionArena benchmark.
+
+Detailed explanations are commented in the scripts.
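+
+## Quick check
+
+Once an example script prints "All services are up!", you can send a request through the proxy yourself. The sketch below is not part of the scripts; it assumes the default `PROXY_PORT` (10001) and the default model from the scripts, and uses a placeholder image URL:
+
+```python
+# Hypothetical smoke test against the running proxy (assumed defaults above).
+import requests
+
+resp = requests.post(
+    "http://localhost:10001/v1/chat/completions",  # the proxy, not a worker
+    json={
+        "model": "Qwen/Qwen2.5-VL-3B-Instruct",
+        "messages": [
+            {
+                "role": "user",
+                "content": [
+                    # Each image/audio item is first fanned out to the
+                    # encode cluster before the full request hits decode.
+                    {"type": "image_url",
+                     "image_url": {"url": "https://example.com/cat.jpg"}},
+                    {"type": "text", "text": "Describe this image."},
+                ],
+            }
+        ],
+        "max_tokens": 64,
+    },
+    # The proxy propagates x-request-id, so every stage shares one request id.
+    headers={"x-request-id": "demo-request-1"},
+    timeout=600,
+)
+print(resp.json()["choices"][0]["message"]["content"])
+```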
diff --git a/examples/online_serving/disaggregated_encoder/disagg_1e1p1d_example.sh b/examples/online_serving/disaggregated_encoder/disagg_1e1p1d_example.sh new file mode 100644 index 000000000000..dc103b28bf51 --- /dev/null +++ b/examples/online_serving/disaggregated_encoder/disagg_1e1p1d_example.sh @@ -0,0 +1,193 @@ +#!/bin/bash +set -euo pipefail + +declare -a PIDS=() + +############################################################################### +# Configuration -- override via env before running +############################################################################### +MODEL="${MODEL:-Qwen/Qwen2.5-VL-3B-Instruct}" +LOG_PATH="${LOG_PATH:-./logs}" +mkdir -p $LOG_PATH + +ENCODE_PORT="${ENCODE_PORT:-19534}" +PREFILL_PORT="${PREFILL_PORT:-19535}" +DECODE_PORT="${DECODE_PORT:-19536}" +PROXY_PORT="${PROXY_PORT:-10001}" + +GPU_E="${GPU_E:-2}" +GPU_P="${GPU_P:-2}" +GPU_D="${GPU_D:-3}" + +EC_SHARED_STORAGE_PATH="${EC_SHARED_STORAGE_PATH:-/tmp/ec_cache}" +TIMEOUT_SECONDS="${TIMEOUT_SECONDS:-12000}" # wait_for_server timeout + +NUM_PROMPTS="${NUM_PROMPTS:-100}" # number of prompts to send in benchmark + +export UCX_TLS=all +export UCX_NET_DEVICES=all + +############################################################################### +# Helpers +############################################################################### +START_TIME=$(date +"%Y%m%d_%H%M%S") +ENC_LOG=$LOG_PATH/encoder_${START_TIME}.log +P_LOG=$LOG_PATH/p_${START_TIME}.log +D_LOG=$LOG_PATH/d_${START_TIME}.log +PROXY_LOG=$LOG_PATH/proxy_${START_TIME}.log + +wait_for_server() { + local port=$1 + timeout "$TIMEOUT_SECONDS" bash -c " + until curl -s localhost:$port/v1/chat/completions > /dev/null; do + sleep 1 + done" && return 0 || return 1 +} + +# Cleanup function +cleanup() { + echo "Stopping everything…" + trap - INT TERM USR1 # prevent re-entrancy + + # Kill all tracked PIDs + for pid in "${PIDS[@]}"; do + if kill -0 "$pid" 2>/dev/null; then + echo "Killing process $pid" + kill "$pid" 2>/dev/null + fi + done + + # Wait a moment for graceful shutdown + sleep 2 + + # Force kill any remaining processes + for pid in "${PIDS[@]}"; do + if kill -0 "$pid" 2>/dev/null; then + echo "Force killing process $pid" + kill -9 "$pid" 2>/dev/null + fi + done + + # Kill the entire process group as backup + kill -- -$$ 2>/dev/null + + echo "All processes stopped." + exit 0 +} + +trap cleanup INT +trap cleanup USR1 +trap cleanup TERM + +# clear previous cache +echo "remove previous ec cache folder" +rm -rf $EC_SHARED_STORAGE_PATH + +echo "make ec cache folder" +mkdir -p $EC_SHARED_STORAGE_PATH + +############################################################################### +# Encoder worker +############################################################################### +CUDA_VISIBLE_DEVICES="$GPU_E" vllm serve "$MODEL" \ + --gpu-memory-utilization 0.0 \ + --port "$ENCODE_PORT" \ + --enable-request-id-headers \ + --no-enable-prefix-caching \ + --max-num-seqs 128 \ + --max-num-batched-tokens 4096 \ + --ec-transfer-config '{ + "ec_connector": "ECSharedStorageConnector", + "ec_role": "ec_producer", + "ec_connector_extra_config": { + "shared_storage_path": "'"$EC_SHARED_STORAGE_PATH"'" + } + }' \ + >"${ENC_LOG}" 2>&1 & + +PIDS+=($!) 
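+
+# NOTE: the encoder worker above runs the EC connector in the "ec_producer"
+# role and writes encoder embeddings to EC_SHARED_STORAGE_PATH; the prefill
+# worker below reads them back as an "ec_consumer".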
+ +############################################################################### +# Prefill worker +############################################################################### +CUDA_VISIBLE_DEVICES="$GPU_P" \ +UCX_NET_DEVICES=all \ +VLLM_NIXL_SIDE_CHANNEL_PORT=5559 \ +vllm serve "$MODEL" \ + --gpu-memory-utilization 0.7 \ + --port "$PREFILL_PORT" \ + --enable-request-id-headers \ + --max-num-seqs 128 \ + --ec-transfer-config '{ + "ec_connector": "ECSharedStorageConnector", + "ec_role": "ec_consumer", + "ec_connector_extra_config": { + "shared_storage_path": "'"$EC_SHARED_STORAGE_PATH"'" + } + }' \ + --kv-transfer-config '{ + "kv_connector": "NixlConnector", + "kv_role": "kv_producer" + }' \ + >"${P_LOG}" 2>&1 & + +PIDS+=($!) + +############################################################################### +# Decode worker +############################################################################### +CUDA_VISIBLE_DEVICES="$GPU_D" \ +UCX_NET_DEVICES=all \ +VLLM_NIXL_SIDE_CHANNEL_PORT=6000 \ +vllm serve "$MODEL" \ + --gpu-memory-utilization 0.7 \ + --port "$DECODE_PORT" \ + --enable-request-id-headers \ + --max-num-seqs 128 \ + --kv-transfer-config '{ + "kv_connector": "NixlConnector", + "kv_role": "kv_consumer" + }' \ + >"${D_LOG}" 2>&1 & + +PIDS+=($!) + +# Wait for workers +wait_for_server $ENCODE_PORT +wait_for_server $PREFILL_PORT +wait_for_server $DECODE_PORT + +############################################################################### +# Proxy +############################################################################### +python disagg_epd_proxy.py \ + --host "0.0.0.0" \ + --port "$PROXY_PORT" \ + --encode-servers-urls "http://localhost:$ENCODE_PORT" \ + --prefill-servers-urls "http://localhost:$PREFILL_PORT" \ + --decode-servers-urls "http://localhost:$DECODE_PORT" \ + >"${PROXY_LOG}" 2>&1 & + +PIDS+=($!) + +wait_for_server $PROXY_PORT +echo "All services are up!" + +############################################################################### +# Benchmark +vllm bench serve \ + --model $MODEL \ + --backend openai-chat \ + --endpoint /v1/chat/completions \ + --dataset-name hf \ + --dataset-path lmarena-ai/VisionArena-Chat \ + --seed 0 \ + --num-prompts $NUM_PROMPTS \ + --port $PROXY_PORT + +PIDS+=($!) +############################################################################### + +# cleanup +echo "cleanup..." 
+cleanup \ No newline at end of file diff --git a/examples/online_serving/disaggregated_encoder/disagg_1e1pd_example.sh b/examples/online_serving/disaggregated_encoder/disagg_1e1pd_example.sh new file mode 100644 index 000000000000..ecc8f9b5a86c --- /dev/null +++ b/examples/online_serving/disaggregated_encoder/disagg_1e1pd_example.sh @@ -0,0 +1,162 @@ +#!/bin/bash +set -euo pipefail + +declare -a PIDS=() + +############################################################################### +# Configuration -- override via env before running +############################################################################### +MODEL="${MODEL:-Qwen/Qwen2.5-VL-3B-Instruct}" +LOG_PATH="${LOG_PATH:-./logs}" +mkdir -p $LOG_PATH + +ENCODE_PORT="${ENCODE_PORT:-19534}" +PREFILL_DECODE_PORT="${PREFILL_DECODE_PORT:-19535}" +PROXY_PORT="${PROXY_PORT:-10001}" + +GPU_E="${GPU_E:-0}" +GPU_PD="${GPU_PD:-1}" + +EC_SHARED_STORAGE_PATH="${EC_SHARED_STORAGE_PATH:-/tmp/ec_cache}" +TIMEOUT_SECONDS="${TIMEOUT_SECONDS:-12000}" # wait_for_server timeout + +NUM_PROMPTS="${NUM_PROMPTS:-100}" # number of prompts to send in benchmark + +############################################################################### +# Helpers +############################################################################### +START_TIME=$(date +"%Y%m%d_%H%M%S") +ENC_LOG=$LOG_PATH/encoder_${START_TIME}.log +PD_LOG=$LOG_PATH/pd_${START_TIME}.log +PROXY_LOG=$LOG_PATH/proxy_${START_TIME}.log + +wait_for_server() { + local port=$1 + timeout "$TIMEOUT_SECONDS" bash -c " + until curl -s localhost:$port/v1/chat/completions > /dev/null; do + sleep 1 + done" && return 0 || return 1 +} + +# Cleanup function +cleanup() { + echo "Stopping everything…" + trap - INT TERM USR1 # prevent re-entrancy + + # Kill all tracked PIDs + for pid in "${PIDS[@]}"; do + if kill -0 "$pid" 2>/dev/null; then + echo "Killing process $pid" + kill "$pid" 2>/dev/null + fi + done + + # Wait a moment for graceful shutdown + sleep 2 + + # Force kill any remaining processes + for pid in "${PIDS[@]}"; do + if kill -0 "$pid" 2>/dev/null; then + echo "Force killing process $pid" + kill -9 "$pid" 2>/dev/null + fi + done + + # Kill the entire process group as backup + kill -- -$$ 2>/dev/null + + echo "All processes stopped." + exit 0 +} + +trap cleanup INT +trap cleanup USR1 +trap cleanup TERM + +# clear previous cache +echo "remove previous ec cache folder" +rm -rf $EC_SHARED_STORAGE_PATH + +echo "make ec cache folder" +mkdir -p $EC_SHARED_STORAGE_PATH + +############################################################################### +# Encoder worker +############################################################################### +CUDA_VISIBLE_DEVICES="$GPU_E" vllm serve "$MODEL" \ + --gpu-memory-utilization 0.7 \ + --port "$ENCODE_PORT" \ + --enforce-eager \ + --enable-request-id-headers \ + --no-enable-prefix-caching \ + --max-num-batched-tokens 4096 \ + --max-num-seqs 128 \ + --ec-transfer-config '{ + "ec_connector": "ECSharedStorageConnector", + "ec_role": "ec_producer", + "ec_connector_extra_config": { + "shared_storage_path": "'"$EC_SHARED_STORAGE_PATH"'" + } + }' \ + >"${ENC_LOG}" 2>&1 & + +PIDS+=($!) 
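+
+# NOTE: in this 1E1PD setup a single worker (below) serves both prefill and
+# decode, so it only needs the EC consumer config and no --kv-transfer-config.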
+
+###############################################################################
+# Prefill+Decode worker
+###############################################################################
+CUDA_VISIBLE_DEVICES="$GPU_PD" VLLM_NIXL_SIDE_CHANNEL_PORT=6000 vllm serve "$MODEL" \
+    --gpu-memory-utilization 0.7 \
+    --port "$PREFILL_DECODE_PORT" \
+    --enforce-eager \
+    --enable-request-id-headers \
+    --max-num-seqs 128 \
+    --ec-transfer-config '{
+        "ec_connector": "ECSharedStorageConnector",
+        "ec_role": "ec_consumer",
+        "ec_connector_extra_config": {
+            "shared_storage_path": "'"$EC_SHARED_STORAGE_PATH"'"
+        }
+    }' \
+    >"${PD_LOG}" 2>&1 &

+PIDS+=($!)
+
+# Wait for workers
+wait_for_server $ENCODE_PORT
+wait_for_server $PREFILL_DECODE_PORT
+
+###############################################################################
+# Proxy
+###############################################################################
+python disagg_epd_proxy.py \
+    --host "0.0.0.0" \
+    --port "$PROXY_PORT" \
+    --encode-servers-urls "http://localhost:$ENCODE_PORT" \
+    --prefill-servers-urls "disable" \
+    --decode-servers-urls "http://localhost:$PREFILL_DECODE_PORT" \
+    >"${PROXY_LOG}" 2>&1 &
+
+PIDS+=($!)
+
+wait_for_server $PROXY_PORT
+echo "All services are up!"
+
+###############################################################################
+# Benchmark
+vllm bench serve \
+    --model $MODEL \
+    --backend openai-chat \
+    --endpoint /v1/chat/completions \
+    --dataset-name hf \
+    --dataset-path lmarena-ai/VisionArena-Chat \
+    --seed 0 \
+    --num-prompts $NUM_PROMPTS \
+    --port $PROXY_PORT
+
+PIDS+=($!)
+###############################################################################
+
+# cleanup
+echo "cleanup..."
+cleanup
\ No newline at end of file
diff --git a/examples/online_serving/disaggregated_encoder/disagg_epd_proxy.py b/examples/online_serving/disaggregated_encoder/disagg_epd_proxy.py
new file mode 100644
index 000000000000..70b9538a2f66
--- /dev/null
+++ b/examples/online_serving/disaggregated_encoder/disagg_epd_proxy.py
@@ -0,0 +1,516 @@
+#!/usr/bin/env python3
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""
+disagg_epd_proxy.py
+
+Proxy that routes OpenAI-compatible “/v1/chat/completions” requests across
+clusters:
+  • encode  (multimodal feature extraction)
+  • prefill (optional, language-model prefill only)
+  • decode  (language-model inference)
+
+For MM input we:
+  1. Extract *every* image/audio item.
+  2. Fire N concurrent requests to the encoder cluster
+     (one request per item, with **all text removed**).
+  3. Wait for all of them to succeed.
+  4. Forward the *original* request to a decode server.
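+  5. (If prefill servers are configured) run a one-token prefill request
+     first and attach the returned kv_transfer_params to the request that is
+     forwarded to the decode server (see maybe_prefill below).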
+
+Usage
+For E + PD setup:
+$ python disagg_epd_proxy.py \
+    --encode-servers-urls "http://e1:8001,http://e2:8002" \
+    --prefill-servers-urls "disable" \
+    --decode-servers-urls "http://pd1:8003,http://pd2:8004"
+
+For E + P + D setup:
+$ python disagg_epd_proxy.py \
+    --encode-servers-urls "http://e1:8001,http://e2:8001" \
+    --prefill-servers-urls "http://p1:8003,http://p2:8004" \
+    --decode-servers-urls "http://d1:8005,http://d2:8006"
+"""
+
+from __future__ import annotations
+
+import argparse
+import asyncio
+import logging
+import os
+import random
+import uuid
+from collections.abc import AsyncIterator
+
+import aiohttp
+import uvicorn
+from fastapi import FastAPI, HTTPException, Request
+from fastapi.responses import JSONResponse, StreamingResponse
+
+###############################################################################
+# FastAPI app & global state
+###############################################################################
+
+logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s: %(message)s")
+logger = logging.getLogger("proxy")
+
+app = FastAPI()
+encode_session: aiohttp.ClientSession | None = None
+prefill_session: aiohttp.ClientSession | None = None
+decode_session: aiohttp.ClientSession | None = None
+
+###############################################################################
+# Utils
+###############################################################################
+
+
+MM_TYPES = {"image_url", "audio_url", "input_audio"}
+
+
+def extract_mm_items(request_data: dict) -> list[dict]:
+    """
+    Return *all* image/audio items that appear anywhere in `messages`.
+
+    Each returned dict looks like:
+        { "type": "image_url", "image_url": {...} }
+    """
+    items: list[dict] = []
+    for msg in request_data.get("messages", []):
+        content = msg.get("content")
+        if not isinstance(content, list):
+            continue
+
+        for item in content:
+            if item.get("type") in MM_TYPES:
+                items.append(item)
+    return items
+
+
+async def fanout_encoder_primer(
+    orig_request: dict,
+    e_urls: list[str],
+    request_id: str,
+) -> None:
+    """
+    1. Build one request *per MM item* with all text removed.
+    2. Send them concurrently to the encode cluster.
+    3. Raise if any of them fails.
+ """ + mm_items = extract_mm_items(orig_request) + if not mm_items: + return # nothing to do + + tasks = [] + + # Round-robin over encode servers to distribute load a bit + url_cycle = (e_urls[i % len(e_urls)] for i in range(len(mm_items))) + + for idx, (item, target_url) in enumerate(zip(mm_items, url_cycle)): + # Derive a *child* request id: :: + child_req_id = f"{request_id}:{idx}:{uuid.uuid4().hex[:6]}" + headers = {"x-request-id": child_req_id} + + encoder_req = { + # You *may* need to keep additional fields + "model": orig_request.get("model"), + "messages": [ + {"role": "user", "content": [item]}, + ], + # Only need 1 token so the server actually runs the encoder path + "max_tokens": 1, + "stream": False, + } + tasks.append( + encode_session.post( + f"{target_url}/v1/chat/completions", + json=encoder_req, + headers=headers, + ) + ) + + results = await asyncio.gather(*tasks, return_exceptions=True) + + # Fail fast if any sub-request failed + for r in results: + if isinstance(r, Exception): + logger.error("Encoder request raised: %s", r) + raise HTTPException(status_code=502, detail=str(r)) + if r.status != 200: + try: + detail = await r.text() + except Exception: + detail = "" + logger.error("Encoder request returned %s: %s", r.status, detail) + raise HTTPException( + status_code=r.status, + detail=f"Encoder request failed: {detail}", + ) + + +async def maybe_prefill( + req_data: dict, + p_url: str, + req_id: str, +) -> dict: + """ + - Do prefill-only task if p_url exist; + - Return modified request data with kv transfer params (for nixl connector) + - Else, skip and return the original request data for decode + """ + if p_url: + prefill_response = await process_prefill_stage(req_data, p_url, req_id) + # for nixl connector to facilitate kv transfer... 
+ prefill_response_json = await prefill_response.json() + kv_transfer_params = prefill_response_json.get("kv_transfer_params", {}) + if kv_transfer_params: + req_data["kv_transfer_params"] = kv_transfer_params + logger.debug("kv_transfer_params: %s", kv_transfer_params) + + return req_data + else: + return req_data + + +async def process_prefill_stage( + req_data: dict, + p_url: str, + req_id: str, +) -> dict: + """Process request through Prefill stage and return kv_transfer_params""" + logger.debug("Processing through prefill for req_id: %s/ url: %s", req_id, p_url) + + prefill_request = req_data.copy() + prefill_request["kv_transfer_params"] = { + "do_remote_decode": True, + "do_remote_prefill": False, + "remote_engine_id": None, + "remote_block_ids": None, + "remote_host": None, + "remote_port": None, + } + prefill_request["stream"] = False + prefill_request["max_tokens"] = 1 + if "max_completion_tokens" in prefill_request: + prefill_request["max_completion_tokens"] = 1 + if "stream_options" in prefill_request: + del prefill_request["stream_options"] + + headers = {"x-request-id": req_id} + try: + prefill_response = await prefill_session.post( + f"{p_url}/v1/chat/completions", json=prefill_request, headers=headers + ) + prefill_response.raise_for_status() + + if prefill_response.status != 200: + error_text = await prefill_response.text() + raise HTTPException( + status_code=prefill_response.status, + detail={"error": "Prefill request failed", "message": error_text}, + ) + logger.debug("Prefill processing completed successfully for req_id: %s", req_id) + + return prefill_response + + except Exception as e: + logger.error("Prefill processing failed: %s", str(e)) + raise HTTPException( + status_code=500, + detail={"error": "Prefill processing error", "message": str(e)}, + ) from e + + +############################################################################### +# FastAPI lifecycle +############################################################################### + + +@app.on_event("startup") +async def on_startup() -> None: + global encode_session, prefill_session, decode_session + timeout = aiohttp.ClientTimeout(total=100_000) + connector = aiohttp.TCPConnector(limit=0, force_close=False) + encode_session = aiohttp.ClientSession(timeout=timeout, connector=connector) + if app.state.p_urls: + # only setup if prefill instance(s) exist + prefill_session = aiohttp.ClientSession(timeout=timeout, connector=connector) + decode_session = aiohttp.ClientSession(timeout=timeout, connector=connector) + + +@app.on_event("shutdown") +async def on_shutdown() -> None: + global encode_session, prefill_session, decode_session + if encode_session: + await encode_session.close() + if prefill_session: + await prefill_session.close() + if decode_session: + await decode_session.close() + + +############################################################################### +# Core forwarding +############################################################################### + + +async def forward_non_stream( + req_data: dict, req_id: str, e_urls: list[str], p_url: str, d_url: str +) -> dict: + # Step 1: Process through Encoder instance (if has MM input) + await fanout_encoder_primer(req_data, e_urls, req_id) + + # Step 2: Process through Prefill instance + req_data = await maybe_prefill(req_data, p_url, req_id) + + # Step 3: Process through Decode instance + logger.debug("Getting response from decode for req_id: %s/ url: %s", req_id, d_url) + headers = {"x-request-id": req_id} + + # Non-streaming response + async 
with decode_session.post( + f"{d_url}/v1/chat/completions", json=req_data, headers=headers + ) as resp: + resp.raise_for_status() + return await resp.json() + + +async def forward_stream( + req_data: dict, req_id: str, e_urls: list[str], p_url: str, d_url: str +) -> AsyncIterator[str]: + # Step 1: Process through Encoder instance (if has MM input) + await fanout_encoder_primer(req_data, e_urls, req_id) + + # Step 2: Process through Prefill instance + req_data = await maybe_prefill(req_data, p_url, req_id) + + # Step 3: Process through Decode instance + logger.debug( + "Streaming response from decode for req_id: %s/ url: %s", req_id, d_url + ) + headers = {"x-request-id": req_id} + + # Streaming response + async with decode_session.post( + f"{d_url}/v1/chat/completions", + json=req_data, + headers=headers, + ) as resp: + resp.raise_for_status() + async for chunk in resp.content.iter_chunked(1024): + if chunk: + yield chunk.decode("utf-8", errors="ignore") + + +############################################################################### +# Public routes +############################################################################### + + +@app.post("/v1/chat/completions") +async def chat_completions(request: Request): + req_data = await request.json() + req_id = request.headers.get("x-request-id", str(uuid.uuid4())) + + e_urls = app.state.e_urls # we want the full list for fan-out + p_url = random.choice(app.state.p_urls) if app.state.p_urls else None + d_url = random.choice(app.state.d_urls) + + is_streaming = req_data.get("stream", False) + + if is_streaming: + return StreamingResponse( + forward_stream(req_data, req_id, e_urls, p_url, d_url), + media_type="text/event-stream", + ) + result = await forward_non_stream(req_data, req_id, e_urls, p_url, d_url) + return JSONResponse(content=result) + + +@app.get("/v1/models") +async def list_models(): + async with decode_session.get(f"{app.state.d_urls[0]}/v1/models") as resp: + resp.raise_for_status() + return await resp.json() + + +@app.get("/health") +async def health_check(): + async def healthy(urls): + if not urls: + return "empty" + for u in urls: + try: + async with encode_session.get(f"{u}/health") as resp: + resp.raise_for_status() + except Exception: + return "unhealthy" + return "healthy" + + e_status, p_status, d_status = await asyncio.gather( + healthy(app.state.e_urls), healthy(app.state.p_urls), healthy(app.state.d_urls) + ) + + overall_healthy = all( + status != "unhealthy" for status in (e_status, p_status, d_status) + ) + + status_code = 200 if overall_healthy else 503 + + return JSONResponse( + { + "proxy": "healthy", + "encode_cluster": e_status, + "prefill_cluster": p_status, + "decode_cluster": d_status, + }, + status_code=status_code, + ) + + +############################################################################### +# Simple profiler fan-out (unchanged except for sessions) +############################################################################### + + +async def _post_if_available( + session: aiohttp.ClientSession, + url: str, + payload: dict, + headers: dict, +) -> dict | None: + """ + POST `payload` to `url`. + + Returns + ------- + • The decoded JSON body on success (2xx) + • None if the endpoint does not exist (404) + • Raises for anything else. 
+    """
+    try:
+        resp = await session.post(url, json=payload, headers=headers)
+        if resp.status == 404:  # profiling disabled on that server
+            logger.warning("Profiling endpoint missing on %s", url)
+            return None
+        resp.raise_for_status()
+        return await resp.json(content_type=None)
+    except aiohttp.ClientResponseError as exc:
+        # Pass 404 through the branch above, re-raise everything else
+        if exc.status == 404:
+            logger.warning("Profiling endpoint missing on %s", url)
+            return None
+        raise
+    except Exception:
+        # Network errors etc.: propagate
+        raise
+
+
+async def _profile_cmd(cmd: str, payload: dict, e_url: str, p_url: str, d_url: str):
+    """
+    Fire-and-forget to all clusters (encode / prefill / decode); tolerate 404.
+    """
+    headers = {"Authorization": f"Bearer {os.getenv('OPENAI_API_KEY', '')}"}
+
+    encode_task = _post_if_available(
+        encode_session, f"{e_url}/{cmd}_profile", payload, headers
+    )
+    prefill_task = (
+        _post_if_available(prefill_session, f"{p_url}/{cmd}_profile", payload, headers)
+        if p_url is not None
+        else asyncio.sleep(0)
+    )
+    decode_task = _post_if_available(
+        decode_session, f"{d_url}/{cmd}_profile", payload, headers
+    )
+
+    encode_res, prefill_res, decode_res = await asyncio.gather(
+        encode_task, prefill_task, decode_task
+    )
+
+    # If *all* clusters said “I don’t have that route”, surface an error
+    if encode_res is prefill_res is decode_res is None:
+        raise HTTPException(
+            status_code=503,
+            detail="Profiling endpoints are disabled on all clusters",
+        )
+
+    return {
+        "encode": encode_res,  # may be None
+        "prefill": prefill_res,  # may be None
+        "decode": decode_res,  # may be None
+    }
+
+
+@app.post("/start_profile")
+async def start_profile(request: Request):
+    body = await request.json()
+    # TODO: handle multi urls properly
+    e_url = random.choice(app.state.e_urls)
+    p_url = random.choice(app.state.p_urls) if app.state.p_urls else None
+    d_url = random.choice(app.state.d_urls)
+    return await _profile_cmd("start", body, e_url, p_url, d_url)
+
+
+@app.post("/stop_profile")
+async def stop_profile(request: Request):
+    body = await request.json()
+    # TODO: handle multi urls properly
+    e_url = random.choice(app.state.e_urls)
+    p_url = random.choice(app.state.p_urls) if app.state.p_urls else None
+    d_url = random.choice(app.state.d_urls)
+    return await _profile_cmd("stop", body, e_url, p_url, d_url)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--host", default="0.0.0.0")
+    parser.add_argument("--port", type=int, default=8000)
+    parser.add_argument(
+        "--encode-servers-urls",
+        required=True,
+        help='Comma-separated encode URLs ("http://e1:8001,http://e2:8001")',
+    )
+    parser.add_argument(
+        "--prefill-servers-urls",
+        required=True,
+        help=(
+            'Comma-separated prefill URLs ("http://p1:8003,http://p2:8004") '
+            'to enable E->P->D; set "disable" or "none" to enable E->PD'
+        ),
+    )
+    parser.add_argument(
+        "--decode-servers-urls",
+        required=True,
+        help='Comma-separated decode URLs ("http://d1:8005,http://d2:8006")',
+    )
+
+    args = parser.parse_args()
+    app.state.e_urls = [
+        u.strip() for u in args.encode_servers_urls.split(",") if u.strip()
+    ]
+    app.state.d_urls = [
+        u.strip() for u in args.decode_servers_urls.split(",") if u.strip()
+    ]
+    # handle prefill instances
+    if args.prefill_servers_urls.lower() in ("disable", "none", ""):
+        app.state.p_urls = []
+        logger.info(
+            "Disaggregated prefill phase explicitly disabled by user. Running E + PD..."
+        )
+    else:
+        app.state.p_urls = [
+            u.strip() for u in args.prefill_servers_urls.split(",") if u.strip()
+        ]
+        logger.info("Disaggregated prefill phase is enabled. Running E + P + D...")
+
+    logger.info("Proxy listening on %s:%s", args.host, args.port)
+    logger.info("Encode servers: %s", app.state.e_urls)
+    logger.info("Prefill servers: %s", app.state.p_urls)
+    logger.info("Decode servers: %s", app.state.d_urls)
+
+    uvicorn.run(
+        app,
+        host=args.host,
+        port=args.port,
+        log_level="info",
+        loop="uvloop",
+        access_log=False,
+    )
diff --git a/tests/v1/core/test_scheduler.py b/tests/v1/core/test_scheduler.py
index 76408fba2e16..aec3c88a0427 100644
--- a/tests/v1/core/test_scheduler.py
+++ b/tests/v1/core/test_scheduler.py
@@ -8,6 +8,7 @@
 from vllm.config import (
     CacheConfig,
+    ECTransferConfig,
     KVTransferConfig,
     ModelConfig,
     SchedulerConfig,
@@ -20,6 +21,8 @@
     PlaceholderRange,
 )
 from vllm.sampling_params import SamplingParams, StructuredOutputsParams
+from vllm.utils.hashing import sha256
+from vllm.v1.core.kv_cache_utils import get_request_block_hasher, init_none_hash
 from vllm.v1.core.sched.output import CachedRequestData, SchedulerOutput
 from vllm.v1.core.sched.scheduler import Scheduler
 from vllm.v1.kv_cache_interface import (
@@ -881,8 +884,11 @@ def _step_until_done(
     for _, num_scheduled_tokens in output.num_scheduled_tokens.items():
         # We should be in the decode phase now.
         assert num_scheduled_tokens == 1
-        assert len(output.kv_connector_metadata.requests) == 0
-        ecos = scheduler.update_from_output(output, model_runner_output)[0]
+        if scheduler.connector is not None:
+            assert len(output.kv_connector_metadata.requests) == 0
+        if scheduler.ec_connector is not None:
+            assert len(output.ec_connector_metadata.mm_datas) == 0
+        ecos = scheduler.update_from_output(output, model_runner_output)[0]
         all_done = True
         for eco in ecos.outputs:
             if eco.finish_reason is None:
@@ -1015,7 +1020,10 @@ def test_kv_connector_basic():
     )
 
 
-def test_kv_connector_unable_to_allocate():
+@pytest.mark.parametrize(
+    "use_ec_connector, ec_role", [(False, None), (True, "ec_consumer")]
+)
+def test_kv_connector_unable_to_allocate(use_ec_connector, ec_role):
     """
     Test whether scheduler with KVConnector is able to handle
     unable to allocate (run out of blocks in allocate_slots().
@@ -1029,6 +1037,9 @@ def test_kv_connector_unable_to_allocate():
         use_kv_connector=True,
         block_size=BLOCK_SIZE,
         num_blocks=NUM_BLOCKS,
+        # encoder connector should not affect test results
+        use_ec_connector=use_ec_connector,
+        ec_role=ec_role,
     )
     NUM_MATCHED_NEW_TOKENS = BLOCK_SIZE * 2
     scheduler.connector.get_num_new_matched_tokens = Mock(name="method")
@@ -1097,7 +1108,10 @@ def test_kv_connector_unable_to_allocate():
     assert len(scheduler.waiting) == 0
 
 
-def test_kv_connector_handles_preemption():
+@pytest.mark.parametrize(
+    "use_ec_connector, ec_role", [(False, None), (True, "ec_consumer")]
+)
+def test_kv_connector_handles_preemption(use_ec_connector, ec_role):
     """
     Test whether scheduler with KVConnector is able to handle
     unable to allocate (run out of blocks in allocate_slots().
@@ -1112,6 +1126,9 @@ def test_kv_connector_handles_preemption(): use_kv_connector=True, block_size=BLOCK_SIZE, num_blocks=NUM_BLOCKS, + # encoder connector should not affect test results + use_ec_connector=use_ec_connector, + ec_role=ec_role, ) NUM_MATCHED_NEW_TOKENS = BLOCK_SIZE @@ -1328,6 +1345,8 @@ def create_scheduler_with_priority( block_size: int = 16, max_model_len: int | None = None, num_speculative_tokens: int | None = None, + use_ec_connector: bool = False, + ec_role: str | None = None, ) -> Scheduler: """Create scheduler with priority policy enabled. @@ -1388,12 +1407,23 @@ def create_scheduler_with_priority( model="ngram", num_speculative_tokens=num_speculative_tokens ) + ec_transfer_config = ( + ECTransferConfig( + ec_connector="ECSharedStorageConnector", + ec_role=ec_role, + ec_connector_extra_config={"shared_storage_path": "/tmp/ec_test"}, + ) + if use_ec_connector + else None + ) + vllm_config = VllmConfig( scheduler_config=scheduler_config, model_config=model_config, cache_config=cache_config, kv_transfer_config=kv_transfer_config, speculative_config=speculative_config, + ec_transfer_config=ec_transfer_config, ) kv_cache_config = KVCacheConfig( num_blocks=num_blocks, # A large number of blocks to hold all requests @@ -1414,16 +1444,22 @@ def create_scheduler_with_priority( ) +_none_hash_initialized = False + + def create_requests_with_priority( num_requests: int, priorities: list[int], arrival_times: list[float] | None = None, num_tokens: int = 10, + mm_hashes_list: list[list[str]] | None = None, mm_positions: list[list[PlaceholderRange]] | None = None, max_tokens: int = 16, stop_token_ids: list[int] | None = None, prompt_logprobs: int | None = None, starting_idx: int = 0, + same_prompt: bool = False, + block_size: int = 16, ): """Create requests with specified priorities and arrival times.""" assert len(priorities) == num_requests @@ -1432,6 +1468,12 @@ def create_requests_with_priority( else: arrival_times = [float(i) for i in range(num_requests)] + global _none_hash_initialized + if not _none_hash_initialized: + init_none_hash(sha256) + _none_hash_initialized = True + + block_hasher = get_request_block_hasher(block_size, sha256) sampling_params = SamplingParams( ignore_eos=False, max_tokens=max_tokens, @@ -1439,29 +1481,64 @@ def create_requests_with_priority( prompt_logprobs=prompt_logprobs, ) requests = [] + + if mm_hashes_list is not None: + # NOTE: allow manual input; some mm items can have the same identifier + # no. 
of mm_hashes and mm_positions for each request should be identical + assert mm_positions is not None, ( + "mm_positions must be provided when mm_hashes_list is provided" + ) + assert len(mm_hashes_list) == len(mm_positions) == num_requests + assert [len(h) for h in mm_hashes_list] == [len(p) for p in mm_positions] + + # Since same identifier would imply they are identical encoder output + # Verify mm items with identical identifier are having mm_position.length + seen_hashes: dict[str, int] = {} + for i in range(num_requests): mm_features = [] - if mm_positions is not None: - mm_position = mm_positions[i] - for j, position in enumerate(mm_position): + for j, position in enumerate( + mm_positions[i] if mm_positions is not None else [] + ): + if mm_hashes_list is not None: + identifier = mm_hashes_list[i][j] + + # Verify if position length is identical + position_length = position.length + if identifier in seen_hashes: + assert seen_hashes[identifier] == position_length, ( + f"mm_hash '{identifier}' has inconsistent position lengths: " + f"previously {seen_hashes[identifier]}, now {position_length} " + f"at request {i}, position {j}" + ) + else: + seen_hashes[identifier] = position_length + else: + # Unique dummy hash for each mm item identifier = f"hash{i}_{j}" - mm_feature = MultiModalFeatureSpec( - data=MultiModalKwargsItem.dummy("dummy_m"), - mm_position=position, - identifier=identifier, - modality="image", - ) - mm_features.append(mm_feature) + mm_feature = MultiModalFeatureSpec( + data=MultiModalKwargsItem.dummy("dummy_m"), + mm_position=position, + identifier=identifier, + modality="image", + ) + mm_features.append(mm_feature) + prompt_token_ids = ( + [starting_idx] * num_tokens + if same_prompt + else [i + starting_idx] * num_tokens + ) request = Request( request_id=f"{i + starting_idx}", - prompt_token_ids=[i + starting_idx] * num_tokens, + prompt_token_ids=prompt_token_ids, sampling_params=sampling_params, pooling_params=None, mm_features=mm_features if mm_features else None, eos_token_id=EOS_TOKEN_ID, arrival_time=arrival_times[i], priority=priorities[i], + block_hasher=block_hasher, ) requests.append(request) return requests @@ -1950,7 +2027,12 @@ def test_schedule_skip_tokenizer_init_structured_output_request(): assert len(scheduler.waiting) == 1 -def test_priority_scheduling_preemption_and_resumption_when_out_of_kv(): +@pytest.mark.parametrize( + "use_ec_connector, ec_role", [(False, None), (True, "ec_consumer")] +) +def test_priority_scheduling_preemption_and_resumption_when_out_of_kv( + use_ec_connector, ec_role +): """Test that priority scheduling preempts lower priority requests when out of KV cache space.""" # Create scheduler with very limited memory to force preemption @@ -1960,6 +2042,9 @@ def test_priority_scheduling_preemption_and_resumption_when_out_of_kv(): num_blocks=5, # Can hold 64 tokens (first block is null) block_size=16, # Standard block size use_kv_connector=True, + # encoder connector should not affect test results + use_ec_connector=use_ec_connector, + ec_role=ec_role, ) # Create a request and schedule it @@ -2119,3 +2204,817 @@ def _validate_chunked_prefill_settings_for_encoder_decoder( assert scheduler_config.disable_chunked_mm_input is not expect_enabled if is_encoder_decoder and not expect_enabled: assert scheduler_config.long_prefill_token_threshold == 0 + + +# ============================================================================== +# EPD (Encoder-Prefill-Decode) Encoder-cache-specific tests start +# NOTE: In E->P->D disagg case, both KV and 
EC Connectors work in the P instance
+# Unless specified otherwise, the KV Connector should not affect any test results
+# ==============================================================================
+
+
+def _assert_right_encoder_cache_allocated(
+    scheduler: Scheduler,
+    requests: list[Request] | None = None,
+    expected_total_allocated: int | None = None,
+):
+    """Check whether encoder cache is allocated correctly."""
+    encoder_cache_manager = scheduler.encoder_cache_manager
+
+    # Verify encoder cache manager exists
+    assert encoder_cache_manager is not None, "Encoder cache manager should exist"
+
+    # Verify the number of cached entries
+    if expected_total_allocated is not None:
+        assert len(encoder_cache_manager.cached) == expected_total_allocated
+        if expected_total_allocated == 0:
+            return
+
+    # Verify each request with MM data is in cache
+    cached_hashes = set(encoder_cache_manager.cached.keys())
+
+    for req in requests if requests is not None else []:
+        if req.mm_features:
+            mm_hashes = [f.identifier for f in req.mm_features]
+            req_hashes = set(mm_hashes)  # unique hashes set
+            missed_hashes = req_hashes - cached_hashes
+            assert not missed_hashes, (
+                f"Missing hashes for request {req.request_id}: {missed_hashes} "
+                f"Existing encoder cache: {encoder_cache_manager.cached}"
+            )
+
+
+def _assert_right_ec_connector_metadata(
+    output: SchedulerOutput,
+    mm_features_list: list[MultiModalFeatureSpec],
+):
+    """Verify that ECConnector metadata EXACTLY matches the input MM data"""
+    # Get the connector metadata
+    metadata = output.ec_connector_metadata
+
+    # Create lookup dictionaries for efficient access
+    metadata_dict = {mm_data.mm_hash: mm_data for mm_data in metadata.mm_datas}
+
+    # Check all required identifiers exist in metadata, and no extra ones
+    # (in ECSharedStorageConnector format)
+    # NOTE: even with the same identifier, the mm_features can be different
+    # since their mm_position can be at different offsets, etc.
+    identifiers_dict = {f.identifier for f in mm_features_list}
+    assert set(metadata_dict.keys()) == identifiers_dict
+
+    # Verify the info matches
+    for i, mm_feature in enumerate(mm_features_list):
+        identifier = mm_feature.identifier
+        assert metadata_dict[identifier].mm_hash == identifier
+        assert metadata_dict[identifier].num_token == mm_feature.mm_position.length
+
+
+def _assert_right_encoder_inputs(
+    output: SchedulerOutput,
+    check_exist: bool | None = True,
+    requests: list[Request] | None = None,
+    expected_encoder_inputs: list[list[int]] | None = None,
+    expected_total_reqs: int | None = None,
+):
+    """Verify that requests/mm_hashes are (or are not) in scheduled encoder inputs.
+    If check_exist is False, the assertions pass only when the requests are
+    NOT in the scheduled encoder inputs."""
+
+    # Get the scheduled encoder inputs
+    # NOTE: scheduled_encoder_inputs is a dictionary with request id as key
+    scheduled_encoder_inputs = output.scheduled_encoder_inputs
+
+    # Check if scheduled_encoder_inputs is empty as expected
+    if expected_total_reqs is not None:
+        assert len(scheduled_encoder_inputs) == expected_total_reqs
+        if expected_total_reqs == 0:
+            return
+
+    # Number of expected encoder inputs should match number of requests
+    if expected_encoder_inputs:
+        assert check_exist and requests is not None  # only when inputs are expected
+        assert len(requests) == len(expected_encoder_inputs)
+
+    # Check requests do (not) exist as expected
+    for i, request in enumerate(requests if requests is not None else []):
+        assert (request.request_id in scheduled_encoder_inputs) is check_exist, (
+            f"Request 
{request.request_id} presence mismatch: expected {check_exist}, "
+            f"got {request.request_id in scheduled_encoder_inputs}"
+        )
+        if expected_encoder_inputs:
+            scheduled_encoder_input = scheduled_encoder_inputs[request.request_id]
+            assert scheduled_encoder_input == expected_encoder_inputs[i]
+
+
+def test_scheduler_no_ec_connector_by_default():
+    """Test scheduler doesn't have EC connector by default."""
+    scheduler = create_scheduler()
+    assert scheduler.ec_connector is None
+
+
+@pytest.mark.parametrize("use_kv_connector", [False, True])
+def test_ec_connector_text_only_request(use_kv_connector):
+    """Test text-only requests don't allocate encoder cache."""
+    scheduler = create_scheduler(
+        model="llava-hf/llava-1.5-7b-hf",
+        use_kv_connector=use_kv_connector,
+        use_ec_connector=True,
+        ec_role="ec_consumer",
+    )
+
+    NUM_PROMPT_TOKENS = 100
+
+    # Create text-only request (no mm_positions)
+    requests = create_requests(
+        num_requests=1,
+        num_tokens=NUM_PROMPT_TOKENS,
+    )
+    assert not requests[0].mm_features  # No MM data
+
+    scheduler.add_request(requests[0])
+    output = scheduler.schedule()
+
+    # Should schedule
+    assert len(output.scheduled_new_reqs) == 1
+
+    # Scheduled tokens should equal prompt tokens exactly
+    scheduled = output.num_scheduled_tokens[requests[0].request_id]
+    assert scheduled == NUM_PROMPT_TOKENS, (
+        f"Text-only should schedule {NUM_PROMPT_TOKENS}, got {scheduled}"
+    )
+
+    # Encoder cache should be empty
+    _assert_right_encoder_cache_allocated(scheduler, expected_total_allocated=0)
+
+    # ECConnector should carry no metadata
+    _assert_right_ec_connector_metadata(output, mm_features_list=[])
+
+    # Scheduled encoder input should be empty; no mm to compute
+    _assert_right_encoder_inputs(output, expected_total_reqs=0)
+
+
+@pytest.mark.parametrize("use_kv_connector", [False, True])
+def test_ec_connector_cache_hit_external_load(use_kv_connector):
+    """Test ec_consumer loads from external cache when hit. 
+ A normal basic operation for EPD disaggrgation""" + scheduler = create_scheduler( + model="llava-hf/llava-1.5-7b-hf", + enable_prefix_caching=True, + # kv connector should not effect test results + use_kv_connector=use_kv_connector, + use_ec_connector=True, + ec_role="ec_consumer", + ) + + # Create MM request + NUM_TOKENS = 200 # NOTE: includes mm tokens + NUM_ENCODER_TOKENS = 100 + mm_hashes_list = [["hash_test1"]] + mm_positions = [[PlaceholderRange(offset=0, length=NUM_ENCODER_TOKENS)]] + + request = create_requests( + num_requests=1, + num_tokens=NUM_TOKENS, + mm_hashes_list=mm_hashes_list, + mm_positions=mm_positions, + )[0] + + # Mock cache hit - encoder cache exists externally + scheduler.ec_connector.has_caches = Mock(return_value=[True]) + scheduler.ec_connector.update_state_after_alloc = Mock( + wraps=scheduler.ec_connector.update_state_after_alloc + ) + + scheduler.add_request(request) + output = scheduler.schedule() + + # Should schedule prompt tokens + scheduled_tokens = output.num_scheduled_tokens[request.request_id] + assert scheduled_tokens == NUM_TOKENS + + # Should called update_state_after_alloc for external load + scheduler.ec_connector.update_state_after_alloc.assert_called_with(request, 0) + + # Encoder cache should contain mm items from request + _assert_right_encoder_cache_allocated(scheduler, requests=[request]) + + # ECConnector should carry metadata of request + _assert_right_ec_connector_metadata(output, mm_features_list=request.mm_features) + + # Scheduled encoder input should be empty; no mm to compute + _assert_right_encoder_inputs(output, expected_total_reqs=0) + + +@pytest.mark.parametrize("use_kv_connector", [False, True]) +def test_ec_connector_cache_miss_computes_locally(use_kv_connector): + """Test consumer can compute encoder locally when cache miss (fallback).""" + # encoder cache itself if it doesn't receive it from external storage + + scheduler = create_scheduler( + model="llava-hf/llava-1.5-7b-hf", + enable_prefix_caching=True, + use_kv_connector=use_kv_connector, + use_ec_connector=True, + ec_role="ec_consumer", + ) + + # Verify consumer role + assert scheduler.ec_connector is not None + assert not scheduler.ec_connector.is_producer + + # Create MM request + request_mm_missed = create_requests( + num_requests=1, + num_tokens=200, # Total (including 100 MM) + mm_positions=[[PlaceholderRange(offset=0, length=100)]], # 100 MM tokens + )[0] + + # Mock cache miss - encoder cache doesn't exist externally + scheduler.ec_connector.has_caches = Mock(return_value=[False]) + + scheduler.add_request(request_mm_missed) + output = scheduler.schedule() + + # SCHEDULER should decide to compute encoder locally (fallback) + assert len(output.scheduled_new_reqs) == 1 + + # Should schedule full prompt tokens + scheduled_tokens = output.num_scheduled_tokens[request_mm_missed.request_id] + assert scheduled_tokens == 200, ( + f"Expected 200 tokens on cache miss, got {scheduled_tokens}" + ) + + # Encoder cache should contain mm items from request + _assert_right_encoder_cache_allocated(scheduler, requests=[request_mm_missed]) + + # ECConnector should carry no metadata (missed cache) + _assert_right_ec_connector_metadata(output, mm_features_list=[]) + + # Scheduled encoder input contain mm for request_mm_missed + _assert_right_encoder_inputs( + output, + requests=[request_mm_missed], + expected_encoder_inputs=[[0]], # index 0 of the mm item + expected_total_reqs=1, + ) + + # Then MODEL_RUNNER will execute the encoder and cache the result + + 
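+
+# NOTE: the EPD tests above and below all rely on the same lookup order when the
+# scheduler resolves a multimodal item:
+#   local encoder cache -> EC connector (external storage) -> local encoder compute
+# A minimal sketch of that policy; `in_local_cache` and `in_connector` are
+# hypothetical stand-ins for the EncoderCacheManager lookup and
+# ECConnector.has_caches(), respectively:
+#
+#     def resolve_mm_item(mm_hash, in_local_cache, in_connector) -> str:
+#         if in_local_cache(mm_hash):
+#             return "reuse_local"      # nothing loaded, nothing recomputed
+#         if in_connector(mm_hash):
+#             return "load_external"    # update_state_after_alloc + EC metadata
+#         return "compute_locally"      # appears in scheduled_encoder_inputs
+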
+@pytest.mark.parametrize("use_kv_connector", [False, True]) +def test_ec_connector_with_partial_cache_hit_multi_round(use_kv_connector): + """Test consumer with partial cache hit (local & connector) with 2 requests.""" + scheduler = create_scheduler( + model="llava-hf/llava-1.5-7b-hf", + enable_prefix_caching=True, + use_kv_connector=use_kv_connector, + use_ec_connector=True, + ec_role="ec_consumer", + ) + + # Create MM request + NUM_TOKENS_1 = 300 # NOTE: includes mm tokens + NUM_ENCODER_TOKENS_1 = 50 + mm_hashes_list_1 = [["hash1_A", "hash1_B", "hash1_A", "hash1_F"]] + mm_positions_1 = [ + [ + PlaceholderRange(offset=0, length=NUM_ENCODER_TOKENS_1), + PlaceholderRange(offset=100, length=NUM_ENCODER_TOKENS_1), + PlaceholderRange(offset=200, length=NUM_ENCODER_TOKENS_1), + PlaceholderRange(offset=250, length=NUM_ENCODER_TOKENS_1), + ] + ] + + # Create request with 4 MM items, with 2 identical items + request1 = create_requests( + num_requests=1, + num_tokens=NUM_TOKENS_1, + mm_hashes_list=mm_hashes_list_1, + mm_positions=mm_positions_1, + max_tokens=1, # For simplicity + )[0] + + # Mock partial cache hit: 1st and 3rd missing, 2nd and 4th exist + scheduler.ec_connector.has_caches = Mock(return_value=[False, True, False, True]) + scheduler.ec_connector.update_state_after_alloc = Mock( + wraps=scheduler.ec_connector.update_state_after_alloc + ) + + scheduler.add_request(request1) + output = scheduler.schedule() + + # Should schedule all tokens + scheduled_tokens = output.num_scheduled_tokens[request1.request_id] + assert scheduled_tokens == NUM_TOKENS_1 + + # Encoder cache should contain all mm items from request + _assert_right_encoder_cache_allocated(scheduler, requests=[request1]) + + # Should call update_state_after_alloc for external load + scheduler.ec_connector.update_state_after_alloc.assert_called() + scheduler.ec_connector.update_state_after_alloc.reset_mock() + + # ECConnector should carry metadata for 2nd and 4th mm item + _assert_right_ec_connector_metadata( + output, mm_features_list=[request1.mm_features[1], request1.mm_features[3]] + ) + + # Should schedule ONLY 1 encoder input (index 0), no repeat for identical items + _assert_right_encoder_inputs( + output, + requests=[request1], + expected_encoder_inputs=[[0]], # index 0 of the mm item ONLY + expected_total_reqs=1, + ) + + # Simulate model execution 1 step + model_output = ModelRunnerOutput( + req_ids=[request1.request_id], + req_id_to_index={request1.request_id: 0}, + sampled_token_ids=[[100]], + # spec_token_ids=None, + logprobs=None, + prompt_logprobs_dict={}, + pooler_output=[], + ) + scheduler.update_from_output(output, model_output) + + # request1 is finished after outputing 1 token + # Finish request + scheduler.finish_requests(request1.request_id, RequestStatus.FINISHED_LENGTH_CAPPED) + + # Create another request with 4 MM items + NUM_TOKENS_2 = 400 + NUM_ENCODER_TOKENS_2 = 50 + mm_hashes_list_2 = [["hash1_C", "hash1_D", "hash1_E", "hash1_A"]] + mm_positions_2 = [ + [ + PlaceholderRange(offset=0, length=NUM_ENCODER_TOKENS_2), + PlaceholderRange(offset=100, length=NUM_ENCODER_TOKENS_2), + PlaceholderRange(offset=200, length=NUM_ENCODER_TOKENS_2), + PlaceholderRange(offset=250, length=NUM_ENCODER_TOKENS_2), + ] + ] + + request2 = create_requests( + num_requests=1, + num_tokens=NUM_TOKENS_2, + mm_hashes_list=mm_hashes_list_2, + mm_positions=mm_positions_2, + max_tokens=1, # For simplicity + )[0] + + # Mock partial cache hit: only hash1_A and hash1_C exist in connector + scheduler.ec_connector.has_caches = 
Mock(return_value=[True, False, False, True]) + + scheduler.add_request(request2) + output = scheduler.schedule() + + # Check + # Should schedule all tokens + scheduled_tokens = output.num_scheduled_tokens[request2.request_id] + assert scheduled_tokens == 400 + + # Encoder cache should contain all mm items from request2 + _assert_right_encoder_cache_allocated(scheduler, requests=[request2]) + + # Should call update_state_after_alloc for hash1_C, ONLY + # hash1_A should not be loaded from connector + # since it's computed in last request & exist in local cache + # Order of getting encoder cache should be: local cache -> connector-> compute + scheduler.ec_connector.update_state_after_alloc.assert_called_with(request2, 0) + scheduler.ec_connector.update_state_after_alloc.assert_called_once() + + scheduler.ec_connector.update_state_after_alloc.reset_mock() + + # ECConnector should carry metadata for hash1_C only (index 0) + _assert_right_ec_connector_metadata( + output, mm_features_list=[request2.mm_features[0]] + ) + + # Should schedule 2 encoder input hash1_D and hash1_E (index 1, 2) + _assert_right_encoder_inputs( + output, + requests=[request2], + expected_encoder_inputs=[[1, 2]], + expected_total_reqs=1, + ) + + +@pytest.mark.parametrize("cache_exist", ["local", "connector_only", "no_where"]) +@pytest.mark.parametrize("use_kv_connector", [False, True]) +def test_ec_connector_schedule_multiple_requests(cache_exist, use_kv_connector): + scheduler = create_scheduler( + model="llava-hf/llava-1.5-7b-hf", + max_num_seqs=10, # allow multiple requests + max_num_batched_tokens=2048, + enable_prefix_caching=True, + use_kv_connector=use_kv_connector, + use_ec_connector=True, + ec_role="ec_consumer", + ) + mm_hashes_list = [[f"hash_{i}"] for i in range(10)] + mm_positions = [[PlaceholderRange(offset=i, length=100)] for i in range(10)] + requests = create_requests( + num_requests=10, + num_tokens=200, + mm_hashes_list=mm_hashes_list, + mm_positions=mm_positions, + ) + for request in requests: + scheduler.add_request(request) + + # Set up to test different encoder cache exsistence scenario after preemption + # Order of getting encoder cache should be: local cache -> connector-> compute + scheduler.ec_connector.update_state_after_alloc = Mock( + wraps=scheduler.ec_connector.update_state_after_alloc + ) + + if cache_exist == "local": + # Allocate cache to cache manager manually to mimick + for req in requests: + scheduler.encoder_cache_manager.allocate(req, 0) + else: + # Make sure local encoder cache empty + scheduler.encoder_cache_manager.cached = {} + + if cache_exist == "connector_only": + # Cache exist in ec_connector + scheduler.ec_connector.has_caches = Mock(return_value=[True]) + elif cache_exist == "no_where": + scheduler.ec_connector.has_caches = Mock(return_value=[False]) + + output = scheduler.schedule() + assert len(output.scheduled_new_reqs) == len(requests) + assert output.scheduled_cached_reqs.num_reqs == 0 + assert len(output.finished_req_ids) == 0 + for req_id, num_tokens in output.num_scheduled_tokens.items(): + assert num_tokens == len(requests[int(req_id)].prompt_token_ids) + + ## Encoder-cache-specific checks: + # mm_hashes of requests exist in cache after scheduling for all scenario + _assert_right_encoder_cache_allocated(scheduler, requests) + + # Should only call update_state_after_alloc when loaded externally + if cache_exist == "connector_only": + scheduler.ec_connector.update_state_after_alloc.assert_called_with( + requests[-1], 0 + ) + + # Concat mm_features for the 10 
requests together + mm_features_list = [feature for req in requests for feature in req.mm_features] + + # Check metadata should contain mm data for all 10 requests + _assert_right_ec_connector_metadata(output, mm_features_list=mm_features_list) + else: + scheduler.ec_connector.update_state_after_alloc.assert_not_called() + # ECConnector should carry no metadata + _assert_right_ec_connector_metadata(output, mm_features_list=[]) + + scheduler.ec_connector.update_state_after_alloc.reset_mock() + + # Should only schedule encoder input when cache is not found anywhere + if cache_exist == "no_where": + _assert_right_encoder_inputs( + output, + requests=requests, + expected_encoder_inputs=[[0] for _ in range(10)], + expected_total_reqs=10, + ) + else: + _assert_right_encoder_inputs(output, expected_total_reqs=0) + + +@pytest.mark.parametrize("use_kv_connector", [False, True]) +def test_ec_connector_unable_to_allocate(use_kv_connector): + """ + Test whether scheduler with ECConnector is able to handle + unable to allocate (run out of blocks). + """ + + # Setup Scheduler With Mock External Cache Hit. + BLOCK_SIZE = 4 + NUM_BLOCKS = 10 + scheduler = create_scheduler( + model="llava-hf/llava-1.5-7b-hf", + enable_prefix_caching=True, + use_kv_connector=use_kv_connector, + block_size=BLOCK_SIZE, + num_blocks=NUM_BLOCKS, + use_ec_connector=True, + ec_role="ec_consumer", + ) + + # Mock ec_connector load external cache behavior + scheduler.ec_connector.has_caches = Mock(return_value=[True]) + scheduler.ec_connector.update_state_after_alloc = Mock( + wraps=scheduler.ec_connector.update_state_after_alloc + ) + + # Create two requests. The second request will not be able to + # allocate slots because it will not have enough blocks. + NUM_REQUESTS = 2 + NUM_TOKENS = (NUM_BLOCKS // 2 + 1) * BLOCK_SIZE + MAX_TOKENS = 2 + requests = create_requests( + num_requests=NUM_REQUESTS, + num_tokens=NUM_TOKENS, + mm_hashes_list=[["hash_1"], ["hash_2"]], + mm_positions=[ + [PlaceholderRange(offset=1, length=10)] for _ in range(NUM_REQUESTS) + ], + max_tokens=MAX_TOKENS, + block_size=BLOCK_SIZE, + ) + req_ids = [] + req_to_index = {} + for i, request in enumerate(requests): + scheduler.add_request(request) + req_ids.append(request.request_id) + req_to_index[request.request_id] = i + + # Setup MODEL_RUNNER_OUTPUT to be run in _step_until_done later + MODEL_RUNNER_OUTPUT = ModelRunnerOutput( + req_ids=req_ids, + req_id_to_index=req_to_index, + sampled_token_ids=[[1000]] * len(req_ids), + logprobs=None, + prompt_logprobs_dict={}, + pooler_output=[], + ) + + # Just one request should be running. + output = scheduler.schedule() + scheduled_tokens = output.num_scheduled_tokens[scheduler.running[0].request_id] + assert scheduled_tokens == NUM_TOKENS + assert len(scheduler.running) == 1 + assert len(scheduler.waiting) == 1 + + # Should call update_state_after_alloc for external load + scheduler.ec_connector.update_state_after_alloc.assert_called_with( + scheduler.running[0], 0 + ) + scheduler.ec_connector.update_state_after_alloc.reset_mock() + + # All memory should be freed, with one request waiting. + _step_until_done(scheduler, output, MODEL_RUNNER_OUTPUT) + assert scheduler.kv_cache_manager.block_pool.get_num_free_blocks() == NUM_BLOCKS - 1 + assert len(scheduler.running) == 0 + assert len(scheduler.waiting) == 1 + + # Just one request should be running. 
+ output = scheduler.schedule() + scheduled_tokens = output.num_scheduled_tokens[scheduler.running[0].request_id] + assert scheduled_tokens == NUM_TOKENS + assert len(scheduler.running) == 1 + assert len(scheduler.waiting) == 0 + + # update_state_after_alloc should be called for loading external cache + scheduler.ec_connector.update_state_after_alloc.assert_called_with( + scheduler.running[0], 0 + ) + scheduler.ec_connector.update_state_after_alloc.reset_mock() + + # All memory should be freed, with no requests waiting / running. + _step_until_done(scheduler, output, MODEL_RUNNER_OUTPUT) + assert scheduler.kv_cache_manager.block_pool.get_num_free_blocks() == NUM_BLOCKS - 1 + assert len(scheduler.running) == 0 + assert len(scheduler.waiting) == 0 + + +@pytest.mark.parametrize("cache_exist", ["local", "connector_only", "no_where"]) +@pytest.mark.parametrize("use_kv_connector", [False, True]) +def test_priority_scheduling_ec_connector_preemption_and_resumption( + cache_exist, use_kv_connector +): + """Test that priority scheduling preempts lower priority requests + when out of KV cache space.""" + # Create scheduler with very limited memory to force preemption + scheduler = create_scheduler_with_priority( + model="llava-hf/llava-1.5-7b-hf", + enable_prefix_caching=True, + max_num_seqs=2, # allow multiple requests + # kv connector should not effect test results + use_kv_connector=use_kv_connector, + num_blocks=15, # can hold 244 tokens with 14 blocks (first block is null) + block_size=16, # standard block size + use_ec_connector=True, + ec_role="ec_consumer", + ) + + # Mock cache hit: Both cache exist in connector (at E->PD initially) + scheduler.ec_connector.has_caches = Mock(return_value=[True]) + scheduler.ec_connector.update_state_after_alloc = Mock( + wraps=scheduler.ec_connector.update_state_after_alloc + ) + + # Create a request and schedule it (and to be preempted) + request_low = create_requests_with_priority( + num_requests=1, + priorities=[1], + arrival_times=[0.0], + num_tokens=94, + mm_hashes_list=[["hash_low"]], + # NOTE: this test only preempt the last block. 
+ # Setting mm_position at the last block can force to recompute encoding + mm_positions=[[PlaceholderRange(offset=82, length=10)]], + starting_idx=0, + )[0] + scheduler.add_request(request_low) + # 1st schedule + output = scheduler.schedule() + + assert len(output.scheduled_new_reqs) == 1 + scheduled_tokens = output.num_scheduled_tokens[request_low.request_id] + assert scheduled_tokens == 94 + assert len(scheduler.waiting) == 0 + assert len(scheduler.running) == 1 + + ## Encoder-cache-specific checks: + # Encoder cache should contain mm items from request + _assert_right_encoder_cache_allocated(scheduler, requests=[request_low]) + + # Verify update_state_after_alloc called (external load) + scheduler.ec_connector.update_state_after_alloc.assert_called_with(request_low, 0) + scheduler.ec_connector.update_state_after_alloc.reset_mock() + + # ECConnector should carry metadata of request + _assert_right_ec_connector_metadata( + output, mm_features_list=request_low.mm_features + ) + + # Scheduled encoder input should be empty; no mm to compute + _assert_right_encoder_inputs(output, expected_total_reqs=0) + + # Simulate model execution - 1st decode + model_output = ModelRunnerOutput( + req_ids=[request_low.request_id], + req_id_to_index={request_low.request_id: 0}, + sampled_token_ids=[[100]], + # spec_token_ids=None, + logprobs=None, + prompt_logprobs_dict={}, + pooler_output=[], + ) + scheduler.update_from_output(output, model_output) + + # Create a high priority request and schedule it + request_high = create_requests_with_priority( + num_requests=1, + priorities=[0], + arrival_times=[1.0], + num_tokens=128, + mm_hashes_list=[["hash_high"]], + mm_positions=[[PlaceholderRange(offset=1, length=10)]], + max_tokens=2, + starting_idx=1, + )[0] + scheduler.add_request(request_high) + # 2nd schedule + output = scheduler.schedule() + + # KV cache should be full at this point + assert scheduler.kv_cache_manager.block_pool.get_num_free_blocks() == 0 + assert len(output.scheduled_new_reqs) == 1 + assert output.scheduled_cached_reqs.num_reqs == 1 + assert len(scheduler.waiting) == 0 + assert len(scheduler.running) == 2 + + ## Encoder-cache-specific checks: + # Encoder cache should contain mm items from request + _assert_right_encoder_cache_allocated(scheduler, requests=[request_high]) + + # Verify update_state_after_alloc called (external load) + scheduler.ec_connector.update_state_after_alloc.assert_called_with(request_high, 0) + scheduler.ec_connector.update_state_after_alloc.reset_mock() + + # ECConnector should carry metadata of request + _assert_right_ec_connector_metadata( + output, mm_features_list=request_high.mm_features + ) + + # Scheduled encoder input should be empty; no mm to compute + _assert_right_encoder_inputs(output, expected_total_reqs=0) + + # Simulate model execution - 2nd decode + requests = [request_low, request_high] + model_output = ModelRunnerOutput( + req_ids=[req.request_id for req in requests], + req_id_to_index={req.request_id: i for i, req in enumerate(requests)}, + sampled_token_ids=[[100] for _ in requests], + # spec_token_ids=None, + logprobs=None, + prompt_logprobs_dict={}, + pooler_output=[], + ) + scheduler.update_from_output(output, model_output) + + # 3rd schedule - - this should trigger preemption + # req_low needs 96 tokens = 6 blocks + # req_high needs 129 tokens = 9 blocks + # so doesn't fit in 14 blocks. 
+ output = scheduler.schedule() + + # Should have preempted req_low + assert len(output.scheduled_new_reqs) == 0 + assert output.scheduled_cached_reqs.num_reqs == 1 + assert output.scheduled_cached_reqs.req_ids[0] == request_high.request_id + assert scheduler.requests[request_low.request_id].status == RequestStatus.PREEMPTED + assert len(scheduler.waiting) == 1 + assert len(scheduler.running) == 1 + + ## Encoder-cache-specific checks: + # request_high is in decode phase now + # ECConnector should carry no metadata + _assert_right_ec_connector_metadata(output, mm_features_list=[]) + + # Scheduled encoder input should be empty; no mm to compute + _assert_right_encoder_inputs(output, expected_total_reqs=0) + + # Simulate model execution - 3rd decode, after req_low was preempted + requests = [request_low, request_high] + model_output = ModelRunnerOutput( + req_ids=[req.request_id for req in requests], + req_id_to_index={req.request_id: i for i, req in enumerate(requests)}, + sampled_token_ids=[[100], [100, 200]], + # spec_token_ids=None, + logprobs=None, + prompt_logprobs_dict={}, + pooler_output=[], + ) + # Finish the requests to make room for the preempted requests to resume + # req_high is finished after outputing 2 tokens + scheduler.update_from_output(output, model_output) + scheduler.finish_requests( + request_high.request_id, RequestStatus.FINISHED_LENGTH_CAPPED + ) + + # Set up to test different encoder cache exsistence scenario after preemption + # Order of getting encoder cache should be: local cache -> connector-> compute + # By default, the cache should still exist in local in this test case + if cache_exist != "local": + # Make local encoder cache empty + scheduler.encoder_cache_manager.cached = {} + + if cache_exist == "connector_only": + # Cache exist in ec_connector + scheduler.ec_connector.has_caches = Mock(return_value=[True]) + elif cache_exist == "no_where": + scheduler.ec_connector.has_caches = Mock(return_value=[False]) + + # 4th Schedule - this should trigger req_low resumption from waiting + output = scheduler.schedule() + scheduled_cached_reqs = output.scheduled_cached_reqs + resumed_from_preemption = scheduled_cached_reqs.resumed_from_preemption + + assert len(output.scheduled_new_reqs) == 0 + assert scheduled_cached_reqs.num_reqs == 1 + assert len(scheduler.waiting) == 0 + assert len(scheduler.running) == 1 + + # Preempted request resumed in scheduled_cached_reqs + assert len(resumed_from_preemption) == 1 + assert len(scheduled_cached_reqs.resumed_req_token_ids) == 1 + assert resumed_from_preemption[0] + assert scheduled_cached_reqs.req_ids[0] == request_low.request_id + assert scheduled_cached_reqs.resumed_req_token_ids[0] is not None + ## Resumed tokens include 94 prompt tokens and 2 decoded tokens + assert len(scheduled_cached_reqs.resumed_req_token_ids[0]) == 96 + assert scheduled_cached_reqs.resumed_req_token_ids[0][95] == 100 + assert scheduler.running[0].request_id == request_low.request_id + assert request_high.request_id in output.finished_req_ids + + ## Encoder-cache-specific checks: + # mm_hash of request_low exists in cache after scheduling for all scenario + _assert_right_encoder_cache_allocated(scheduler, [request_low]) + + # Should only call update_state_after_alloc when loaded externally + if cache_exist == "connector_only": + scheduler.ec_connector.update_state_after_alloc.assert_called_with( + request_low, 0 + ) + _assert_right_ec_connector_metadata( + output, mm_features_list=request_low.mm_features + ) + else: + 
scheduler.ec_connector.update_state_after_alloc.assert_not_called() + # ECConnector should carry no metadata + _assert_right_ec_connector_metadata(output, mm_features_list=[]) + + scheduler.ec_connector.update_state_after_alloc.reset_mock() + + # Should only schedule encoder input when cache is not found anywhere + if cache_exist == "no_where": + _assert_right_encoder_inputs( + output, + requests=[request_low], + expected_encoder_inputs=[[0]], + expected_total_reqs=1, + ) + else: + _assert_right_encoder_inputs(output, expected_total_reqs=0) + + +# ============================================================================== +# EPD (Encoder-Prefill-Decode) Encoder-cache-specific tests end +# ============================================================================== diff --git a/tests/v1/core/utils.py b/tests/v1/core/utils.py index c7df43359381..812442783a80 100644 --- a/tests/v1/core/utils.py +++ b/tests/v1/core/utils.py @@ -5,6 +5,7 @@ from vllm.config import ( CacheConfig, + ECTransferConfig, KVTransferConfig, ModelConfig, SchedulerConfig, @@ -17,7 +18,6 @@ PlaceholderRange, ) from vllm.sampling_params import SamplingParams -from vllm.utils import sha256 from vllm.v1.core.kv_cache_utils import get_request_block_hasher, init_none_hash from vllm.v1.core.sched.async_scheduler import AsyncScheduler from vllm.v1.core.sched.scheduler import Scheduler @@ -46,6 +46,8 @@ def create_scheduler( num_speculative_tokens: int | None = None, skip_tokenizer_init: bool = False, async_scheduling: bool = False, + use_ec_connector: bool = False, + ec_role: str | None = None, ) -> Scheduler | AsyncScheduler: """Create scheduler under test. @@ -107,12 +109,23 @@ def create_scheduler( model="ngram", num_speculative_tokens=num_speculative_tokens ) + ec_transfer_config = ( + ECTransferConfig( + ec_connector="ECSharedStorageConnector", + ec_role=ec_role, + ec_connector_extra_config={"shared_storage_path": "/tmp/ec_test"}, + ) + if use_ec_connector + else None + ) + vllm_config = VllmConfig( scheduler_config=scheduler_config, model_config=model_config, cache_config=cache_config, kv_transfer_config=kv_transfer_config, speculative_config=speculative_config, + ec_transfer_config=ec_transfer_config, ) kv_cache_config = KVCacheConfig( num_blocks=num_blocks, # A large number of blocks to hold all requests @@ -140,6 +153,7 @@ def create_scheduler( def create_requests( num_requests: int, num_tokens: int = 10, + mm_hashes_list: list[list[str]] | None = None, mm_positions: list[list[PlaceholderRange]] | None = None, max_tokens: int = 16, stop_token_ids: list[int] | None = None, @@ -160,21 +174,49 @@ def create_requests( prompt_logprobs=prompt_logprobs, ) requests = [] + + if mm_hashes_list is not None: + # NOTE: allow manual input; some mm items can have the same identifier + # no. 
of mm_hashes and mm_positions for each request should be identical + assert mm_positions is not None, ( + "mm_positions must be provided when mm_hashes_list is provided" + ) + assert len(mm_hashes_list) == len(mm_positions) == num_requests + assert [len(h) for h in mm_hashes_list] == [len(p) for p in mm_positions] + + # Since same identifier would imply they are identical encoder output + # Verify mm items with identical identifier are having mm_position.length + seen_hashes: dict[str, int] = {} + for i in range(num_requests): mm_features = [] - if mm_positions is not None: - mm_position = mm_positions[i] - for j, position in enumerate(mm_position): - # Dummy hash for each mm item should be unique - # since encoder cache tracks entries by hash + + for j, position in enumerate( + mm_positions[i] if mm_positions is not None else [] + ): + if mm_hashes_list is not None: + identifier = mm_hashes_list[i][j] + + # Verify if position length is identical + position_length = position.length + if identifier in seen_hashes: + assert seen_hashes[identifier] == position_length, ( + f"mm_hash '{identifier}' has inconsistent position lengths: " + f"previously {seen_hashes[identifier]}, now {position_length} " + f"at request {i}, position {j}" + ) + else: + seen_hashes[identifier] = position_length + else: + # Unique dummy hash for each mm item identifier = f"hash{i}_{j}" - mm_feature = MultiModalFeatureSpec( - data=MultiModalKwargsItem.dummy("dummy_m"), - mm_position=position, - identifier=identifier, - modality="image", - ) - mm_features.append(mm_feature) + mm_feature = MultiModalFeatureSpec( + data=MultiModalKwargsItem.dummy("dummy_m"), + mm_position=position, + identifier=identifier, + modality="image", + ) + mm_features.append(mm_feature) prompt_token_ids = [0] * num_tokens if same_prompt else [i] * num_tokens request = Request( diff --git a/tests/v1/ec_connector/unit/test_ec_shared_storage_connector.py b/tests/v1/ec_connector/unit/test_ec_shared_storage_connector.py new file mode 100644 index 000000000000..a58daa2628e2 --- /dev/null +++ b/tests/v1/ec_connector/unit/test_ec_shared_storage_connector.py @@ -0,0 +1,609 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +""" +Unit tests for ECSharedStorageConnector. 
+""" + +import os +from unittest.mock import Mock, patch + +import pytest +import safetensors +import torch + +from vllm.config import VllmConfig +from vllm.distributed.ec_transfer.ec_connector.base import ECConnectorRole +from vllm.distributed.ec_transfer.ec_connector.shared_storage_connector import ( + ECSharedStorageConnector, + ECSharedStorageConnectorMetadata, + MMMeta, +) +from vllm.multimodal.inputs import MultiModalFeatureSpec, PlaceholderRange +from vllm.v1.core.sched.output import SchedulerOutput + + +# ------------------ Mock Classes ------------------ # +class MockRequest: + def __init__(self, request_id, mm_hashes: list[str], token_counts: list[int]): + assert len(mm_hashes) == len(token_counts) + self.request_id = request_id + self._token_counts = token_counts + self.mm_features = [] + for i, mm_hash in enumerate(mm_hashes): + feature = MultiModalFeatureSpec( + data=None, + modality="image", + identifier=mm_hash, + mm_position=PlaceholderRange(offset=0, length=self._token_counts[i]), + ) + self.mm_features.append(feature) + + def get_num_encoder_tokens(self, input_id: int) -> int: + assert input_id < len(self._token_counts) + return self._token_counts[input_id] + + +@pytest.fixture +def temp_storage(tmp_path): + """Fixture providing temporary storage path.""" + return str(tmp_path) + + +@pytest.fixture +def mock_vllm_config_producer(temp_storage): + """Fixture providing mock VllmConfig for producer role.""" + config = Mock(spec=VllmConfig) + config.ec_transfer_config = Mock() + config.ec_transfer_config.get_from_extra_config = Mock(return_value=temp_storage) + config.ec_transfer_config.is_ec_producer = True + return config + + +@pytest.fixture +def mock_vllm_config_consumer(temp_storage): + """Fixture providing mock VllmConfig for consumer role.""" + config = Mock(spec=VllmConfig) + config.ec_transfer_config = Mock() + config.ec_transfer_config.get_from_extra_config = Mock(return_value=temp_storage) + config.ec_transfer_config.is_ec_producer = False + return config + + +@pytest.fixture +def mock_request_with_3_mm(): + """Fixture providing mock Request with 3 multimodal items.""" + request_id = "test_req_123" + mm_hashes = ["img_hash_1", "img_hash_2", "img_hash_3"] + token_counts = [100, 150, 200] + + request = MockRequest(request_id, mm_hashes, token_counts) + return request + + +# ------------------ Unit Tests ------------------ # +class TestECSharedStorageConnectorBasics: + """Test basic EC connector functionality.""" + + def test_initialization_producer(self, mock_vllm_config_producer, temp_storage): + """Test connector initializes correctly as producer.""" + connector = ECSharedStorageConnector( + vllm_config=mock_vllm_config_producer, + role=ECConnectorRole.SCHEDULER, + ) + + assert connector.role == ECConnectorRole.SCHEDULER + assert connector.is_producer + assert connector._storage_path == temp_storage + assert connector._mm_datas_need_loads == {} + + def test_initialization_consumer(self, mock_vllm_config_consumer, temp_storage): + """Test connector initializes correctly as consumer.""" + connector = ECSharedStorageConnector( + vllm_config=mock_vllm_config_consumer, + role=ECConnectorRole.WORKER, + ) + + assert connector.role == ECConnectorRole.WORKER + assert not connector.is_producer + assert connector._storage_path == temp_storage + + def test_role_assignment(self, mock_vllm_config_producer): + """Test role is correctly assigned.""" + scheduler_connector = ECSharedStorageConnector( + vllm_config=mock_vllm_config_producer, + role=ECConnectorRole.SCHEDULER, + ) + 
worker_connector = ECSharedStorageConnector( + vllm_config=mock_vllm_config_producer, + role=ECConnectorRole.WORKER, + ) + + assert scheduler_connector.role == ECConnectorRole.SCHEDULER + assert worker_connector.role == ECConnectorRole.WORKER + + +class TestCacheExistence: + """Test cache existence checking using has_caches() API.""" + + def test_has_caches_all_exist_3_items( + self, + mock_vllm_config_producer, + mock_vllm_config_consumer, + mock_request_with_3_mm, + ): + """Test has_caches returns True when all 3 caches exist.""" + # Test for producer first + producer = ECSharedStorageConnector( + vllm_config=mock_vllm_config_producer, + role=ECConnectorRole.SCHEDULER, + ) + + # Create cache files using save_caches (proper way) + encoder_cache: dict[str, torch.Tensor] = {} + + for mm_feature in mock_request_with_3_mm.mm_features: + mm_hash = mm_feature.identifier + encoder_cache[mm_hash] = torch.randn(10, 768) + producer.save_caches(encoder_cache, mm_hash) + + # Test using has_caches API + producer_result = producer.has_caches(mock_request_with_3_mm) + + # Assert + assert len(producer_result) == 3 + assert all(producer_result), f"Expected all True, got {producer_result}" + + # Also test consumer can check if cache exists + consumer = ECSharedStorageConnector( + vllm_config=mock_vllm_config_consumer, + role=ECConnectorRole.SCHEDULER, + ) + + # Test using has_caches API + consumer_result = consumer.has_caches(mock_request_with_3_mm) + + # Assert + assert len(consumer_result) == 3 + assert all(consumer_result), f"Expected all True, got {consumer_result}" + + def test_has_caches_none_exist( + self, mock_vllm_config_producer, mock_request_with_3_mm + ): + """Test has_caches returns False when no caches exist.""" + connector = ECSharedStorageConnector( + vllm_config=mock_vllm_config_producer, + role=ECConnectorRole.SCHEDULER, + ) + + # Test without creating any files + result = connector.has_caches(mock_request_with_3_mm) + + # Assert + assert len(result) == 3 + assert not any(result), f"Expected all False, got {result}" + + def test_has_caches_partial_exist( + self, mock_vllm_config_producer, mock_request_with_3_mm + ): + """Test has_caches with some caches existing (1 of 3).""" + connector = ECSharedStorageConnector( + vllm_config=mock_vllm_config_producer, + role=ECConnectorRole.SCHEDULER, + ) + + # Create only the second cache file + mm_hash_second = mock_request_with_3_mm.mm_features[1].identifier + encoder_cache = {mm_hash_second: torch.randn(10, 768)} + connector.save_caches(encoder_cache, mm_hash_second) + + # Test + result = connector.has_caches(mock_request_with_3_mm) + + # Assert + assert len(result) == 3 + assert not result[0] # First doesn't exist + assert result[1] # Second exists + assert not result[2] # Third doesn't exist + + +class TestStateManagement: + """Test connector state management.""" + + def test_update_state_after_alloc_3_items( + self, mock_vllm_config_producer, mock_request_with_3_mm + ): + """Test state update after allocation for 3 MM items.""" + connector = ECSharedStorageConnector( + vllm_config=mock_vllm_config_producer, + role=ECConnectorRole.SCHEDULER, + ) + + # Initial state should be empty + assert len(connector._mm_datas_need_loads) == 0 + + # Update state for all 3 items + for i in range(3): + connector.update_state_after_alloc(mock_request_with_3_mm, index=i) + + # Check state updated for all 3 + assert len(connector._mm_datas_need_loads) == 3 + assert "img_hash_1" in connector._mm_datas_need_loads + assert "img_hash_2" in 
connector._mm_datas_need_loads + assert "img_hash_3" in connector._mm_datas_need_loads + assert connector._mm_datas_need_loads["img_hash_1"] == 100 + assert connector._mm_datas_need_loads["img_hash_2"] == 150 + assert connector._mm_datas_need_loads["img_hash_3"] == 200 + + def test_build_connector_meta_3_items( + self, mock_vllm_config_producer, mock_request_with_3_mm + ): + """Test metadata building for 3 MM items.""" + connector = ECSharedStorageConnector( + vllm_config=mock_vllm_config_producer, + role=ECConnectorRole.SCHEDULER, + ) + + # Setup state for all 3 items + for i in range(3): + connector.update_state_after_alloc(mock_request_with_3_mm, index=i) + + # Build metadata + scheduler_output = Mock(spec=SchedulerOutput) + metadata = connector.build_connector_meta(scheduler_output) + + # Assert + assert isinstance(metadata, ECSharedStorageConnectorMetadata) + assert len(metadata.mm_datas) == 3 + assert metadata.mm_datas[0].mm_hash == "img_hash_1" + assert metadata.mm_datas[0].num_token == 100 + assert metadata.mm_datas[1].mm_hash == "img_hash_2" + assert metadata.mm_datas[1].num_token == 150 + assert metadata.mm_datas[2].mm_hash == "img_hash_3" + assert metadata.mm_datas[2].num_token == 200 + + # State should be cleared after building + assert len(connector._mm_datas_need_loads) == 0 + + def test_build_connector_meta_empty(self, mock_vllm_config_producer): + """Test metadata building with empty state.""" + connector = ECSharedStorageConnector( + vllm_config=mock_vllm_config_producer, + role=ECConnectorRole.SCHEDULER, + ) + + scheduler_output = Mock(spec=SchedulerOutput) + metadata = connector.build_connector_meta(scheduler_output) + + assert isinstance(metadata, ECSharedStorageConnectorMetadata) + assert len(metadata.mm_datas) == 0 + + def test_state_cleared_after_metadata_build( + self, mock_vllm_config_producer, mock_request_with_3_mm + ): + """Test that state is properly cleared after building metadata.""" + connector = ECSharedStorageConnector( + vllm_config=mock_vllm_config_producer, + role=ECConnectorRole.SCHEDULER, + ) + + # Add state + for i in range(3): + connector.update_state_after_alloc(mock_request_with_3_mm, index=i) + assert len(connector._mm_datas_need_loads) == 3 + + # Build metadata (should clear state) + scheduler_output = Mock(spec=SchedulerOutput) + connector.build_connector_meta(scheduler_output) + + # State should be empty + assert len(connector._mm_datas_need_loads) == 0 + + # Build again should return empty metadata + metadata2 = connector.build_connector_meta(scheduler_output) + assert len(metadata2.mm_datas) == 0 + + +class TestCacheSaving: + """Test encoder cache saving (producer only).""" + + def test_save_caches_producer_3_items( + self, mock_vllm_config_producer, mock_request_with_3_mm, temp_storage + ): + """Test cache saving as producer for 3 different MM items.""" + connector = ECSharedStorageConnector( + vllm_config=mock_vllm_config_producer, + role=ECConnectorRole.WORKER, + ) + + # Create and save 3 different caches + mm_hashes = [f.identifier for f in mock_request_with_3_mm.mm_features] + encoder_cache: dict[str, torch.Tensor] = {} + + for mm_hash in mm_hashes: + encoder_cache[mm_hash] = torch.randn(10, 768) + connector.save_caches(encoder_cache, mm_hash) + + # Verify all files exist using has_caches + result = connector.has_caches(mock_request_with_3_mm) + assert all(result), f"Not all caches were saved: {result}" + + # Verify each file's content + for mm_hash in mm_hashes: + filename = connector._generate_filename_debug(mm_hash) + loaded = 
safetensors.torch.load_file(filename) + assert "ec_cache" in loaded + assert torch.allclose(loaded["ec_cache"], encoder_cache[mm_hash].cpu()) + + def test_save_caches_consumer_skips(self, mock_vllm_config_consumer): + """Test cache saving is skipped for consumer.""" + connector = ECSharedStorageConnector( + vllm_config=mock_vllm_config_consumer, + role=ECConnectorRole.WORKER, + ) + + mm_hash = "test_hash_consumer" + encoder_cache = {mm_hash: torch.randn(10, 768)} + + # Save should not raise but also not create file + connector.save_caches(encoder_cache, mm_hash) + + # Verify file doesn't exist using has_caches + mock_request = MockRequest("req_consumer", [mm_hash], [10]) + result = connector.has_caches(mock_request) + assert not result[0], "Consumer should not save caches" + + +class TestCacheLoading: + """Test encoder cache loading (consumer).""" + + @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available") + def test_start_load_caches_consumer_3_items( + self, + mock_vllm_config_producer, + mock_vllm_config_consumer, + mock_request_with_3_mm, + temp_storage, + ): + """Test consumer loads 3 caches from storage.""" + # First, create producer to save caches + producer = ECSharedStorageConnector( + vllm_config=mock_vllm_config_producer, + role=ECConnectorRole.WORKER, + ) + + # Producer saves 3 caches + mm_hashes = [f.identifier for f in mock_request_with_3_mm.mm_features] + saved_caches = {} + for mm_hash in mm_hashes: + saved_caches[mm_hash] = torch.randn(10, 768) + producer.save_caches(saved_caches, mm_hash) + + # Now consumer loads + consumer = ECSharedStorageConnector( + vllm_config=mock_vllm_config_consumer, + role=ECConnectorRole.WORKER, + ) + + # Setup metadata for all 3 + metadata = ECSharedStorageConnectorMetadata() + for mm_hash in mm_hashes: + metadata.add_mm_data(MMMeta.make_meta(mm_hash, 100)) + consumer.bind_connector_metadata(metadata) + + # Load + encoder_cache: dict[str, torch.Tensor] = {} + consumer.start_load_caches(encoder_cache=encoder_cache) + + # Verify all 3 loaded + assert len(encoder_cache) == 3 + for mm_hash in mm_hashes: + assert mm_hash in encoder_cache, f"{mm_hash} missing in encoder_cache" + assert encoder_cache[mm_hash].is_cuda, ( + f"{mm_hash} cache is in {encoder_cache[mm_hash].device}" + ) + assert torch.allclose( + encoder_cache[mm_hash].cpu(), saved_caches[mm_hash] + ), f"{mm_hash} cache saved and loaded tesnor are not the same" + + def test_start_load_caches_skip_existing( + self, mock_vllm_config_producer, mock_vllm_config_consumer, temp_storage + ): + """Test cache loading skips already cached items.""" + # Setup: producer saves cache + producer = ECSharedStorageConnector( + vllm_config=mock_vllm_config_producer, + role=ECConnectorRole.WORKER, + ) + + mm_hash = "existing_hash" + saved_cache = torch.randn(10, 768) + producer.save_caches({mm_hash: saved_cache}, mm_hash) + + # Consumer setup + consumer = ECSharedStorageConnector( + vllm_config=mock_vllm_config_consumer, + role=ECConnectorRole.WORKER, + ) + + metadata = ECSharedStorageConnectorMetadata() + metadata.add_mm_data(MMMeta.make_meta(mm_hash, 100)) + consumer.bind_connector_metadata(metadata) + + # Pre-populate encoder_cache with different value + existing_cache = torch.randn(5, 512) + encoder_cache = {mm_hash: existing_cache} + + # Load (should skip since already exists) + with patch("safetensors.torch.load_file") as mock_load: + consumer.start_load_caches(encoder_cache=encoder_cache) + # Should not call load_file since cache exists + mock_load.assert_not_called() + + # 
Verify original cache unchanged + assert torch.equal(encoder_cache[mm_hash], existing_cache) + + def test_start_load_caches_empty_metadata(self, mock_vllm_config_consumer): + """Test loading with empty metadata does nothing.""" + consumer = ECSharedStorageConnector( + vllm_config=mock_vllm_config_consumer, + role=ECConnectorRole.WORKER, + ) + + # Setup empty metadata + metadata = ECSharedStorageConnectorMetadata() + consumer.bind_connector_metadata(metadata) + + # Load (should not raise) + encoder_cache: dict[str, torch.Tensor] = {} + consumer.start_load_caches(encoder_cache=encoder_cache) + + # Cache should remain empty + assert len(encoder_cache) == 0 + + +class TestFilenameGeneration: + """Test filename and path generation.""" + + def test_generate_foldername(self, mock_vllm_config_producer, temp_storage): + """Test folder name generation.""" + connector = ECSharedStorageConnector( + vllm_config=mock_vllm_config_producer, + role=ECConnectorRole.WORKER, + ) + + mm_hash = "test_folder_hash" + folder = connector._generate_foldername_debug(mm_hash) + + assert folder == os.path.join(temp_storage, mm_hash) + assert os.path.isdir(folder) # Should be created + + def test_generate_filename(self, mock_vllm_config_producer, temp_storage): + """Test filename generation.""" + connector = ECSharedStorageConnector( + vllm_config=mock_vllm_config_producer, + role=ECConnectorRole.WORKER, + ) + + mm_hash = "test_file_hash" + filename = connector._generate_filename_debug(mm_hash) + + expected = os.path.join(temp_storage, mm_hash, "encoder_cache.safetensors") + assert filename == expected + assert os.path.isdir(os.path.dirname(filename)) # Folder created + + def test_generate_filename_consistency(self, mock_vllm_config_producer): + """Test filename generation is consistent.""" + connector = ECSharedStorageConnector( + vllm_config=mock_vllm_config_producer, + role=ECConnectorRole.WORKER, + ) + + mm_hash = "consistency_hash" + filename1 = connector._generate_filename_debug(mm_hash) + filename2 = connector._generate_filename_debug(mm_hash) + + assert filename1 == filename2 + + +class TestMetadataBindingLifecycle: + """Test metadata binding and clearing lifecycle.""" + + def test_bind_connector_metadata(self, mock_vllm_config_consumer): + """Test binding connector metadata.""" + connector = ECSharedStorageConnector( + vllm_config=mock_vllm_config_consumer, + role=ECConnectorRole.WORKER, + ) + + metadata = ECSharedStorageConnectorMetadata() + metadata.add_mm_data(MMMeta.make_meta("hash_1", 100)) + + connector.bind_connector_metadata(metadata) + + assert connector._connector_metadata is metadata + + def test_clear_connector_metadata(self, mock_vllm_config_consumer): + """Test clearing connector metadata.""" + connector = ECSharedStorageConnector( + vllm_config=mock_vllm_config_consumer, + role=ECConnectorRole.WORKER, + ) + + metadata = ECSharedStorageConnectorMetadata() + connector.bind_connector_metadata(metadata) + + connector.clear_connector_metadata() + + assert connector._connector_metadata is None + + def test_get_connector_metadata(self, mock_vllm_config_consumer): + """Test getting connector metadata.""" + connector = ECSharedStorageConnector( + vllm_config=mock_vllm_config_consumer, + role=ECConnectorRole.WORKER, + ) + + metadata = ECSharedStorageConnectorMetadata() + connector.bind_connector_metadata(metadata) + + retrieved = connector._get_connector_metadata() + + assert retrieved is metadata + + def test_get_connector_metadata_not_set(self, mock_vllm_config_consumer): + """Test getting metadata when 
not set raises."""
+        connector = ECSharedStorageConnector(
+            vllm_config=mock_vllm_config_consumer,
+            role=ECConnectorRole.WORKER,
+        )
+
+        with pytest.raises(AssertionError):
+            connector._get_connector_metadata()
+
+
+class TestEdgeCases:
+    """Test edge cases and error handling."""
+
+    def test_save_empty_cache(self, mock_vllm_config_producer):
+        """Test saving an empty tensor."""
+        connector = ECSharedStorageConnector(
+            vllm_config=mock_vllm_config_producer,
+            role=ECConnectorRole.WORKER,
+        )
+
+        mm_hash = "empty_hash"
+        encoder_cache = {mm_hash: torch.empty(0)}
+
+        # Should not raise
+        connector.save_caches(encoder_cache, mm_hash)
+
+    def test_load_nonexistent_cache(self, mock_vllm_config_consumer):
+        """Test that loading a cache that doesn't exist raises an error."""
+        connector = ECSharedStorageConnector(
+            vllm_config=mock_vllm_config_consumer,
+            role=ECConnectorRole.WORKER,
+        )
+
+        metadata = ECSharedStorageConnectorMetadata()
+        metadata.add_mm_data(MMMeta.make_meta("nonexistent_hash", 100))
+        connector.bind_connector_metadata(metadata)
+
+        encoder_cache: dict[str, torch.Tensor] = {}
+
+        # Should raise FileNotFoundError
+        with pytest.raises(FileNotFoundError):
+            connector.start_load_caches(encoder_cache=encoder_cache)
+
+    def test_has_caches_empty_request(self, mock_vllm_config_producer):
+        """Test has_caches with a request that has no MM data."""
+        connector = ECSharedStorageConnector(
+            vllm_config=mock_vllm_config_producer,
+            role=ECConnectorRole.SCHEDULER,
+        )
+
+        mock_request = MockRequest("req_empty", [], [])
+
+        result = connector.has_caches(mock_request)
+
+        assert len(result) == 0
+        assert result == []
diff --git a/vllm/config/__init__.py b/vllm/config/__init__.py
index 6a0197d044dc..b4c57436c95d 100644
--- a/vllm/config/__init__.py
+++ b/vllm/config/__init__.py
@@ -9,6 +9,7 @@
     PassConfig,
 )
 from vllm.config.device import DeviceConfig
+from vllm.config.ec_transfer import ECTransferConfig
 from vllm.config.kv_events import KVEventsConfig
 from vllm.config.kv_transfer import KVTransferConfig
 from vllm.config.load import LoadConfig
@@ -54,6 +55,8 @@
     "PassConfig",
     # From vllm.config.device
     "DeviceConfig",
+    # From vllm.config.ec_transfer
+    "ECTransferConfig",
     # From vllm.config.kv_events
     "KVEventsConfig",
     # From vllm.config.kv_transfer
diff --git a/vllm/config/ec_transfer.py b/vllm/config/ec_transfer.py
new file mode 100644
index 000000000000..d95236f818ab
--- /dev/null
+++ b/vllm/config/ec_transfer.py
@@ -0,0 +1,110 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import hashlib
+import uuid
+from dataclasses import field
+from typing import Any, Literal, get_args
+
+from pydantic.dataclasses import dataclass
+
+from vllm.config.utils import config
+
+ECProducer = Literal["ec_producer"]
+ECConsumer = Literal["ec_consumer"]
+ECRole = Literal[ECProducer, ECConsumer]
+
+
+@config
+@dataclass
+class ECTransferConfig:
+    """Configuration for distributed EC cache transfer."""
+
+    ec_connector: str | None = None
+    """The EC connector for vLLM to transmit EC caches between vLLM instances.
+    """
+
+    engine_id: str | None = None
+    """The engine id for EC transfers."""
+
+    ec_buffer_device: str | None = "cuda"
+    """The device used by the EC connector to buffer the EC cache.
+    Currently only 'cuda' is supported."""
+
+    ec_buffer_size: float = 1e9
+    """The buffer size for TorchDistributedConnector. Measured in number of
+    bytes. Recommended value: 1e9 (about 1GB)."""
+
+    ec_role: ECRole | None = None
+    """Whether this vLLM instance produces or consumes the EC cache. Choices
+    are 'ec_producer' and 'ec_consumer'."""
+
+    ec_rank: int | None = None
+    """The rank of this vLLM instance in the EC cache transfer. Typical values:
+    0 for the encoder instance, 1 for the PD instance.
+    Currently only one encoder and one PD instance are supported."""
+
+    ec_parallel_size: int = 1
+    """The number of parallel instances for EC cache transfer. For
+    PyNcclConnector, this should be 2."""
+
+    ec_ip: str = "127.0.0.1"
+    """The EC connector IP, used to build the distributed connection."""
+
+    ec_port: int = 14579
+    """The EC connector port, used to build the distributed connection."""
+
+    ec_connector_extra_config: dict[str, Any] = field(default_factory=dict)
+    """Any extra config that the connector may need."""
+
+    ec_connector_module_path: str | None = None
+    """The Python module path to dynamically load the EC connector from.
+    Only supported in V1."""
+
+    def compute_hash(self) -> str:
+        """
+        WARNING: Whenever a new field is added to this config,
+        ensure that it is included in the factors list if
+        it affects the computation graph.
+
+        Provide a hash that uniquely identifies all the configs
+        that affect the structure of the computation
+        graph from input ids/embeddings to the final hidden states,
+        excluding anything before input ids/embeddings and after
+        the final hidden states.
+        """
+        # no factors to consider.
+        # this config will not affect the computation graph.
+        factors: list[Any] = []
+        hash_str = hashlib.md5(str(factors).encode(), usedforsecurity=False).hexdigest()
+        return hash_str
+
+    def __post_init__(self) -> None:
+        if self.engine_id is None:
+            self.engine_id = str(uuid.uuid4())
+
+        if self.ec_role is not None and self.ec_role not in get_args(ECRole):
+            raise ValueError(
+                f"Unsupported ec_role: {self.ec_role}. "
+                f"Supported roles are {get_args(ECRole)}"
+            )
+
+        if self.ec_connector is not None and self.ec_role is None:
+            raise ValueError(
+                "Please specify ec_role when ec_connector "
+                f"is set, supported roles are {get_args(ECRole)}"
+            )
+
+    @property
+    def is_ec_transfer_instance(self) -> bool:
+        return self.ec_connector is not None and self.ec_role in get_args(ECRole)
+
+    @property
+    def is_ec_producer(self) -> bool:
+        return self.ec_connector is not None and self.ec_role in get_args(ECProducer)
+
+    @property
+    def is_ec_consumer(self) -> bool:
+        return self.ec_connector is not None and self.ec_role in get_args(ECConsumer)
+
+    def get_from_extra_config(self, key, default) -> Any:
+        return self.ec_connector_extra_config.get(key, default)
diff --git a/vllm/config/vllm.py b/vllm/config/vllm.py
index b15d122c9161..5eb41132dda3 100644
--- a/vllm/config/vllm.py
+++ b/vllm/config/vllm.py
@@ -24,6 +24,7 @@
 from .cache import CacheConfig
 from .compilation import CompilationConfig, CompilationLevel, CUDAGraphMode
 from .device import DeviceConfig
+from .ec_transfer import ECTransferConfig
 from .kv_events import KVEventsConfig
 from .kv_transfer import KVTransferConfig
 from .load import LoadConfig
@@ -100,6 +101,8 @@ class VllmConfig:
     """The configurations for distributed KV cache transfer."""
     kv_events_config: KVEventsConfig | None = None
     """The configurations for event publishing."""
+    ec_transfer_config: ECTransferConfig | None = None
+    """The configurations for distributed EC cache transfer."""
     # some opaque config, only used to provide additional information
     # for the hash computation, mainly used for testing, debugging or out of
     # tree config registration.
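For reference, here is a minimal sketch (not part of the patch itself) of how
the ECTransferConfig fields above fit together for an encoder (producer)
instance. The shared_storage_path key is the one ECSharedStorageConnector
reads via get_from_extra_config(); the path value is illustrative:

    from vllm.config import ECTransferConfig

    # Producer-side config for the shared-storage debug connector.
    ec_config = ECTransferConfig(
        ec_connector="ECSharedStorageConnector",
        ec_role="ec_producer",
        ec_connector_extra_config={"shared_storage_path": "/tmp/ec_cache"},
    )

    assert ec_config.is_ec_transfer_instance
    assert ec_config.is_ec_producer and not ec_config.is_ec_consumer
    # Falls back to the given default when the key is absent.
    storage = ec_config.get_from_extra_config("shared_storage_path", "/tmp")

The same dict of fields can also be passed as JSON through the new
--ec-transfer-config CLI flag added below in arg_utils.py.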
@@ -184,6 +187,10 @@ def compute_hash(self) -> str:
             vllm_factors.append(self.kv_transfer_config.compute_hash())
         else:
             vllm_factors.append("None")
+        if self.ec_transfer_config:
+            vllm_factors.append(self.ec_transfer_config.compute_hash())
+        else:
+            vllm_factors.append("None")
         if self.additional_config:
             if isinstance(additional_config := self.additional_config, dict):
                 additional_config_hash = hashlib.md5(
diff --git a/vllm/distributed/ec_transfer/__init__.py b/vllm/distributed/ec_transfer/__init__.py
new file mode 100644
index 000000000000..0decfd143e34
--- /dev/null
+++ b/vllm/distributed/ec_transfer/__init__.py
@@ -0,0 +1,14 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+from vllm.distributed.ec_transfer.ec_transfer_state import (
+    ensure_ec_transfer_initialized,
+    get_ec_transfer,
+    has_ec_transfer,
+)
+
+__all__ = [
+    "get_ec_transfer",
+    "ensure_ec_transfer_initialized",
+    "has_ec_transfer",
+]
diff --git a/vllm/distributed/ec_transfer/ec_connector/__init__.py b/vllm/distributed/ec_transfer/ec_connector/__init__.py
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/vllm/distributed/ec_transfer/ec_connector/base.py b/vllm/distributed/ec_transfer/ec_connector/base.py
new file mode 100644
index 000000000000..aa082539ae61
--- /dev/null
+++ b/vllm/distributed/ec_transfer/ec_connector/base.py
@@ -0,0 +1,247 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""
+ECConnectorBase class for distributed encoder cache and
+P2P encoder cache communication in V1.
+
+The class provides the following primitives:
+    Scheduler-side: runs in the scheduler, builds the metadata that the
+    worker-side uses to load/save the encoder cache.
+        has_caches() - checks whether an encoder cache exists for each
+        MM input of a request
+        update_state_after_alloc() - updates ECConnector state after
+        cache allocation; this decides which caches to load
+        build_connector_meta() - builds the connector metadata for this step
+        request_finished() - called when a request is finished, to
+        free the caches associated with the request
+
+    Worker-side: runs in each worker, loads/saves the encoder cache to/from
+    the connector based on the metadata.
+        start_load_caches() - starts loading all EC caches (maybe async)
+        save_caches() - saves the encoder cache for one MM input
+
+        get_finished() - called with ids of finished requests, returns
+        ids of requests that have completed async sending/recving.
+"""
+
+import enum
+from abc import ABC, abstractmethod
+from typing import TYPE_CHECKING, Any
+
+import torch
+
+from vllm.logger import init_logger
+from vllm.v1.core.sched.output import SchedulerOutput
+from vllm.v1.outputs import ECConnectorOutput
+
+if TYPE_CHECKING:
+    from vllm.config import VllmConfig
+    from vllm.v1.request import Request
+
+logger = init_logger(__name__)
+
+
+class ECConnectorRole(enum.Enum):
+    # Connector running in the scheduler process
+    SCHEDULER = 0
+
+    # Connector running in the worker process
+    WORKER = 1
+
+
+class ECConnectorMetadata(ABC):  # noqa: B024
+    """
+    Abstract metadata used to communicate between the
+    scheduler-side ECConnector and the worker-side ECConnector.
+    """
+
+    pass
+
+
+class ECConnectorBase(ABC):
+    def __init__(self, vllm_config: "VllmConfig", role: ECConnectorRole):
+        self._connector_metadata: ECConnectorMetadata | None = None
+        self._vllm_config = vllm_config
+        self._role = role
+        if vllm_config.ec_transfer_config is not None:
+            self._is_producer = vllm_config.ec_transfer_config.is_ec_producer
+        else:
+            raise ValueError("ec_transfer_config must be set for ECConnectorBase")
+
+    @property
+    def role(self) -> ECConnectorRole:
+        return self._role
+
+    @property
+    def is_producer(self) -> bool:
+        return self._is_producer
+
+    # ==============================
+    # Worker-side methods
+    # ==============================
+
+    def bind_connector_metadata(self, connector_metadata: ECConnectorMetadata) -> None:
+        """Set the connector metadata from the scheduler.
+
+        This function should be called by the model runner every time
+        before the model execution. The metadata will be used for runtime
+        EC cache loading.
+
+        Args:
+            connector_metadata (ECConnectorMetadata): the connector metadata.
+        """
+        self._connector_metadata = connector_metadata
+
+    def clear_connector_metadata(self) -> None:
+        """Clear the connector metadata.
+
+        This function should be called by the model runner every time
+        after the model execution.
+        """
+        self._connector_metadata = None
+
+    def _get_connector_metadata(self) -> ECConnectorMetadata:
+        """Get the connector metadata.
+
+        This function should only be called inside the connector.
+
+        Returns:
+            ECConnectorMetadata: the connector metadata.
+        """
+
+        # Should only be called while set to valid metadata.
+        assert self._connector_metadata is not None
+        return self._connector_metadata
+
+    def register_caches(
+        self,
+        ec_caches: dict[str, torch.Tensor],
+    ):
+        """
+        Initialize with the EC caches.
+        Args:
+            ec_caches: dictionary of encoder caches
+        """
+        # TODO: Implement this later for the P2P feature
+        return
+
+    @abstractmethod
+    def start_load_caches(
+        self, encoder_cache: dict[str, torch.Tensor], **kwargs
+    ) -> None:
+        """
+        Start loading the cache from the connector into vLLM's encoder cache.
+
+        This method loads the encoder cache based on metadata provided by the
+        scheduler. It is called before `_gather_mm_embeddings` for the EC
+        connector. Connector-specific extras are passed through `kwargs`.
+
+        Args:
+            encoder_cache (dict[str, torch.Tensor]): A dictionary mapping multimodal
+                data hashes (`mm_hash`) to encoder cache tensors.
+            kwargs (dict): Additional keyword arguments for the connector.
+        """
+        pass
+
+    @abstractmethod
+    def save_caches(
+        self, encoder_cache: dict[str, torch.Tensor], mm_hash: str, **kwargs
+    ) -> None:
+        """
+        Save the encoder cache to the connector.
+
+        This method saves the encoder cache from the worker's local storage
+        to shared storage or another external connector.
+
+        Args:
+            encoder_cache (dict[str, torch.Tensor]): A dictionary mapping multimodal
+                data hashes (`mm_hash`) to encoder cache tensors.
+            mm_hash (str): The hash of the multimodal data whose cache is being saved.
+            kwargs (dict): Additional keyword arguments for the connector.
+        """
+        pass
+
+    def get_finished(
+        self, finished_req_ids: set[str]
+    ) -> tuple[set[str] | None, set[str] | None]:
+        """
+        Notifies the worker-side connector of the ids of requests that have
+        finished generating tokens on the worker.
+        The scheduler process (via the Executors) will use this output
+        to track which workers are done.
+
+        Returns:
+            ids of requests that have finished asynchronous transfer
+            (requests that previously returned True from request_finished()),
+            as a tuple of (sending/saving ids, recving/loading ids).
+            The finished saves/sends req ids must belong to a set provided in a
+            call to this method (this call or a prior one).
+        """
+        return None, None
+
+    # ==============================
+    # Scheduler-side methods
+    # ==============================
+
+    @abstractmethod
+    def has_caches(
+        self,
+        request: "Request",
+    ) -> list[bool]:
+        """
+        Check whether an encoder cache exists for each mm data of the request.
+
+        Args:
+            request (Request): the request object.
+
+        Returns:
+            A list of bools where the ith value is True if a cache exists for
+            the ith mm_data of the request.
+        """
+        pass
+
+    @abstractmethod
+    def update_state_after_alloc(self, request: "Request", index: int):
+        """
+        Update ECConnector state after encoder cache allocation, to decide
+        which caches to load for the request.
+
+        Args:
+            request (Request): the request object.
+        """
+        pass
+
+    @abstractmethod
+    def build_connector_meta(
+        self, scheduler_output: SchedulerOutput
+    ) -> ECConnectorMetadata:
+        """
+        Build the connector metadata for this step.
+
+        This function should NOT modify fields in the scheduler_output.
+        Also, calling this function will reset the state of the connector.
+
+        Args:
+            scheduler_output (SchedulerOutput): the scheduler output object.
+        """
+        pass
+
+    def update_connector_output(self, connector_output: ECConnectorOutput):
+        """
+        Update ECConnector state from the worker-side connectors' output.
+
+        Args:
+            connector_output (ECConnectorOutput): the worker-side
+                connectors' output.
+        """
+        return
+
+    def request_finished(
+        self, request: "Request"
+    ) -> tuple[bool, dict[str, Any] | None]:
+        """
+        Called when a request has finished, before its encoder cache is freed.
+
+        Returns:
+            True if the request is being saved/sent asynchronously and the cache
+            should not be freed until the request_id is returned from
+            get_finished().
+ """ + return False, None diff --git a/vllm/distributed/ec_transfer/ec_connector/factory.py b/vllm/distributed/ec_transfer/ec_connector/factory.py new file mode 100644 index 000000000000..bfdf51d775bd --- /dev/null +++ b/vllm/distributed/ec_transfer/ec_connector/factory.py @@ -0,0 +1,88 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import importlib +from collections.abc import Callable +from typing import TYPE_CHECKING + +# yapf: disable +from vllm.distributed.ec_transfer.ec_connector.base import ( + ECConnectorBase, + ECConnectorRole, +) +from vllm.logger import init_logger + +# yapf: enable + +if TYPE_CHECKING: + from vllm.config import ECTransferConfig, VllmConfig + +logger = init_logger(__name__) + + +class ECConnectorFactory: + _registry: dict[str, Callable[[], type[ECConnectorBase]]] = {} + + @classmethod + def register_connector(cls, name: str, module_path: str, class_name: str) -> None: + """Register a connector with a lazy-loading module and class name.""" + if name in cls._registry: + raise ValueError(f"Connector '{name}' is already registered.") + + def loader() -> type[ECConnectorBase]: + module = importlib.import_module(module_path) + return getattr(module, class_name) + + cls._registry[name] = loader + + @classmethod + def create_connector( + cls, + config: "VllmConfig", + role: ECConnectorRole, + ) -> ECConnectorBase: + ec_transfer_config = config.ec_transfer_config + if ec_transfer_config is None: + raise ValueError("ec_transfer_config must be set to create a connector") + connector_cls = cls.get_connector_class(ec_transfer_config) + logger.info( + "Creating connector with name: %s and engine_id: %s", + connector_cls.__name__, + ec_transfer_config.engine_id, + ) + # Connector is explicitly separated into two roles. + # Scheduler connector: + # - Co-locate with scheduler process + # - Should only be used inside the Scheduler class + # Worker connector: + # - Co-locate with worker process + return connector_cls(config, role) + + @classmethod + def get_connector_class( + cls, ec_transfer_config: "ECTransferConfig" + ) -> type[ECConnectorBase]: + """Get the connector class by name.""" + connector_name = ec_transfer_config.ec_connector + if connector_name is None: + raise ValueError("EC connect must not be None") + elif connector_name in cls._registry: + connector_cls = cls._registry[connector_name]() + else: + connector_module_path = ec_transfer_config.ec_connector_module_path + if connector_module_path is None: + raise ValueError(f"Unsupported connector type: {connector_name}") + connector_module = importlib.import_module(connector_module_path) + connector_cls = getattr(connector_module, connector_name) + return connector_cls + + +# Register various connectors here. +# The registration should not be done in each individual file, as we want to +# only load the files corresponding to the current connector. 
+
+ECConnectorFactory.register_connector(
+    "ECSharedStorageConnector",
+    "vllm.distributed.ec_transfer.ec_connector.shared_storage_connector",
+    "ECSharedStorageConnector",
+)
diff --git a/vllm/distributed/ec_transfer/ec_connector/shared_storage_connector.py b/vllm/distributed/ec_transfer/ec_connector/shared_storage_connector.py
new file mode 100644
index 000000000000..0c2dcb1e72f0
--- /dev/null
+++ b/vllm/distributed/ec_transfer/ec_connector/shared_storage_connector.py
@@ -0,0 +1,201 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import os
+from dataclasses import dataclass
+from typing import TYPE_CHECKING
+
+import safetensors
+
+from vllm.config import VllmConfig
+from vllm.distributed.ec_transfer.ec_connector.base import (
+    ECConnectorBase,
+    ECConnectorMetadata,
+    ECConnectorRole,
+)
+from vllm.logger import init_logger
+from vllm.v1.core.sched.output import SchedulerOutput
+
+if TYPE_CHECKING:
+    from vllm.v1.request import Request
+
+logger = init_logger(__name__)
+
+
+@dataclass
+class MMMeta:
+    mm_hash: str
+    num_token: int
+
+    @staticmethod
+    def make_meta(mm_hash, num_token) -> "MMMeta":
+        return MMMeta(mm_hash=mm_hash, num_token=num_token)
+
+
+@dataclass
+class ECSharedStorageConnectorMetadata(ECConnectorMetadata):
+    mm_datas: list[MMMeta]
+
+    def __init__(self):
+        self.mm_datas = []
+
+    def add_mm_data(self, mm_data: MMMeta):
+        self.mm_datas.append(mm_data)
+
+
+class ECSharedStorageConnector(ECConnectorBase):
+    # NOTE: This is a simple debug implementation of the EC connector.
+    # It saves / loads the EC cache to / from the disk.
+
+    def __init__(self, vllm_config: "VllmConfig", role: ECConnectorRole):
+        super().__init__(vllm_config=vllm_config, role=role)
+        # mm_hash -> number of encoder tokens that need to be loaded
+        self._mm_datas_need_loads: dict[str, int] = {}
+        transfer_config = vllm_config.ec_transfer_config
+        if transfer_config is not None:
+            self._storage_path = transfer_config.get_from_extra_config(
+                "shared_storage_path", "/tmp"
+            )
+            logger.debug(transfer_config)
+            logger.debug("Shared storage path is %s", self._storage_path)
+        else:
+            raise ValueError("ec_transfer_config must be set for ECConnectorBase")
+
+    def start_load_caches(self, encoder_cache, **kwargs) -> None:
+        """
+        Start loading the cache from the connector into vLLM's encoder cache.
+
+        This method loads the encoder cache based on metadata provided by the
+        scheduler. It is called before `_gather_mm_embeddings` for the EC
+        connector.
+
+        Args:
+            encoder_cache (dict[str, torch.Tensor]): A dictionary mapping multimodal
+                data hashes (`mm_hash`) to encoder cache tensors.
+            kwargs (dict): Additional keyword arguments for the connector.
+ """ + + # Get the metadata + metadata: ECConnectorMetadata = self._get_connector_metadata() + assert isinstance(metadata, ECSharedStorageConnectorMetadata) + assert encoder_cache is not None + if metadata is None: + logger.warning( + ( + "In connector.start_load_caches, ", + "but the connector metadata is None", + ) + ) + return + # Load the EC for each mm data + for mm_data in metadata.mm_datas: + if mm_data.mm_hash in encoder_cache: + continue + filename = self._generate_filename_debug(mm_data.mm_hash) + ec_cache = safetensors.torch.load_file(filename)["ec_cache"].cuda() + encoder_cache[mm_data.mm_hash] = ec_cache + logger.debug("Success load encoder cache for hash %s", mm_data.mm_hash) + + def save_caches(self, encoder_cache, mm_hash, **kwargs) -> None: + """ + Save the encoder cache to the connector. + + This method saves the encoder cache from the worker's local storage + to shared storage or another external connector. + + Args: + encoder_cache (dict[str, torch.Tensor]): A dictionary mapping multimodal + data hashes (`mm_hash`) to encoder cache tensors. + mm_hash (str): The hash of the multimodal data whose cache is being saved. + kwargs (dict): Additional keyword arguments for the connector. + """ + # Return if it is PD Instance + if not self.is_producer: + return + filename = self._generate_filename_debug(mm_hash) + ec_cache = encoder_cache[mm_hash] + tensors = {"ec_cache": ec_cache.detach().cpu()} + safetensors.torch.save_file(tensors, filename) + logger.debug("Save cache successful for mm_hash %s", mm_hash) + + def has_caches( + self, + request: "Request", + ) -> list[bool]: + """ + Check if cache exist externally for each mm_data of request + + Args: + request (Request): the request object. + + Returns: + List of bool indicate that ith mm_data exist in cache or not + """ + result = [] + for feature in request.mm_features: + result.append(self._found_match_for_mm_data(feature.identifier)) + return result + + def update_state_after_alloc( + self, + request: "Request", + index: int, + ) -> None: + """ + Update ECConnector state after encoder cache allocation. + """ + mm_hash = request.mm_features[index].identifier + num_encoder_token = request.get_num_encoder_tokens(index) + # Insert mm_hash only if this block has not been recorded yet. + self._mm_datas_need_loads[mm_hash] = num_encoder_token + + def build_connector_meta( + self, + scheduler_output: SchedulerOutput, + ) -> ECConnectorMetadata: + """Build the connector metadata for this step. + + This function should NOT modify any fields in the scheduler_output. + Also, calling this function will reset the state of the connector. + This only build for load mm_data only + Args: + scheduler_output (SchedulerOutput): the scheduler output object. + """ + meta = ECSharedStorageConnectorMetadata() + for mm_hash, num_encoder_token in self._mm_datas_need_loads.items(): + meta.add_mm_data(MMMeta.make_meta(mm_hash, num_encoder_token)) + self._mm_datas_need_loads.clear() + return meta + + # ============================== + # Helper functions + # ============================== + + def _found_match_for_mm_data(self, mm_hash) -> bool: + """Check if the cache is hit for the request.""" + filename = self._generate_filename_debug(mm_hash) + return os.path.exists(filename) + + def _generate_foldername_debug( + self, + mm_hash: str, + create_folder: bool = True, # <- now defaults to True + ) -> str: + """ + Return the folder in which the cache for this mm_hash lives. 
+        If `create_folder` is True (the default), the directory is created
+        recursively the first time it is needed.
+        """
+        foldername = os.path.join(self._storage_path, mm_hash)
+        if create_folder:
+            os.makedirs(foldername, exist_ok=True)
+        return foldername
+
+    def _generate_filename_debug(self, mm_hash: str) -> str:
+        """
+        Return the full path of the safetensors file for this mm_hash.
+        Ensures the parent directory exists because
+        `_generate_foldername_debug` is called with its default
+        (`create_folder=True`).
+        """
+        foldername = self._generate_foldername_debug(mm_hash)
+        return os.path.join(foldername, "encoder_cache.safetensors")
diff --git a/vllm/distributed/ec_transfer/ec_transfer_state.py b/vllm/distributed/ec_transfer/ec_transfer_state.py
new file mode 100644
index 000000000000..95f516129e0c
--- /dev/null
+++ b/vllm/distributed/ec_transfer/ec_transfer_state.py
@@ -0,0 +1,46 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from typing import TYPE_CHECKING
+
+from vllm import envs
+from vllm.distributed.ec_transfer.ec_connector.base import (
+    ECConnectorBase,
+    ECConnectorRole,
+)
+from vllm.distributed.ec_transfer.ec_connector.factory import ECConnectorFactory
+
+if TYPE_CHECKING:
+    from vllm.config import VllmConfig
+
+_EC_CONNECTOR_AGENT: ECConnectorBase | None = None
+
+
+def get_ec_transfer() -> ECConnectorBase:
+    assert _EC_CONNECTOR_AGENT is not None, "disaggregated EC cache is not initialized"
+    return _EC_CONNECTOR_AGENT
+
+
+def has_ec_transfer() -> bool:
+    return _EC_CONNECTOR_AGENT is not None
+
+
+def ensure_ec_transfer_initialized(vllm_config: "VllmConfig") -> None:
+    """
+    Initialize the EC cache connector.
+    """
+
+    global _EC_CONNECTOR_AGENT
+
+    if vllm_config.ec_transfer_config is None:
+        return
+
+    if (
+        vllm_config.ec_transfer_config.is_ec_transfer_instance
+        and _EC_CONNECTOR_AGENT is None
+    ):
+        if envs.VLLM_USE_V1:
+            _EC_CONNECTOR_AGENT = ECConnectorFactory.create_connector(
+                config=vllm_config, role=ECConnectorRole.WORKER
+            )
+        else:
+            raise ValueError("V0 is no longer supported")
diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index 54a0539f4047..6653a4709436 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -37,6 +37,7 @@
     CompilationConfig,
     ConfigType,
     DeviceConfig,
+    ECTransferConfig,
     EPLBConfig,
     KVEventsConfig,
     KVTransferConfig,
@@ -498,6 +499,8 @@ class EngineArgs:
     kv_transfer_config: KVTransferConfig | None = None
     kv_events_config: KVEventsConfig | None = None
 
+    ec_transfer_config: ECTransferConfig | None = None
+
     generation_config: str = ModelConfig.generation_config
     enable_sleep_mode: bool = ModelConfig.enable_sleep_mode
     override_generation_config: dict[str, Any] = get_field(
@@ -1026,6 +1029,9 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
             "--kv-transfer-config", **vllm_kwargs["kv_transfer_config"]
         )
         vllm_group.add_argument("--kv-events-config", **vllm_kwargs["kv_events_config"])
+        vllm_group.add_argument(
+            "--ec-transfer-config", **vllm_kwargs["ec_transfer_config"]
+        )
         vllm_group.add_argument(
             "--compilation-config", "-O", **vllm_kwargs["compilation_config"]
         )
@@ -1577,6 +1583,7 @@ def create_engine_config(
             compilation_config=self.compilation_config,
             kv_transfer_config=self.kv_transfer_config,
             kv_events_config=self.kv_events_config,
+            ec_transfer_config=self.ec_transfer_config,
             additional_config=self.additional_config,
         )
 
diff --git a/vllm/model_executor/warmup/kernel_warmup.py b/vllm/model_executor/warmup/kernel_warmup.py
index 23227065ee95..b7a59d906414 100644
--- a/vllm/model_executor/warmup/kernel_warmup.py
+++ b/vllm/model_executor/warmup/kernel_warmup.py
@@ -49,10 +49,18 @@ def _is_flashinfer_backend(backend):
         except NotImplementedError:
             return False
 
-    if not worker.model_runner.is_pooling_model and all(
-        _is_flashinfer_backend(group.backend)
-        for groups in worker.model_runner.attn_groups
-        for group in groups
+    # NOTE: we add a check for empty attn_groups to avoid errors when
+    # deploying models such as E instances and encoder-only models.
+    # For those models, worker.model_runner.attn_groups is empty.
+    # This change was made during EPD feature development.
+    if (
+        not worker.model_runner.is_pooling_model
+        and worker.model_runner.attn_groups
+        and all(
+            _is_flashinfer_backend(group.backend)
+            for groups in worker.model_runner.attn_groups
+            for group in groups
+        )
     ):
         logger.info("Warming up FlashInfer attention.")
         # Warmup with mixed batch containing both prefill and decode tokens
diff --git a/vllm/v1/core/sched/output.py b/vllm/v1/core/sched/output.py
index bce15e1a476f..af64a2b06675 100644
--- a/vllm/v1/core/sched/output.py
+++ b/vllm/v1/core/sched/output.py
@@ -11,6 +11,7 @@
     import numpy.typing as npt
     import torch
 
+    from vllm.distributed.ec_transfer.ec_connector.base import ECConnectorMetadata
     from vllm.distributed.kv_transfer.kv_connector.v1.base import KVConnectorMetadata
     from vllm.lora.request import LoRARequest
     from vllm.multimodal.inputs import MultiModalFeatureSpec
@@ -18,6 +19,7 @@
     from vllm.sampling_params import SamplingParams
     from vllm.v1.request import Request
 else:
+    ECConnectorMetadata = object
     KVConnectorMetadata = object
     LoRARequest = object
     MultiModalFeatureSpec = object
@@ -173,3 +175,6 @@ class SchedulerOutput:
 
     # KV Cache Connector metadata.
     kv_connector_metadata: KVConnectorMetadata | None = None
+
+    # EC Cache Connector metadata.
+    ec_connector_metadata: ECConnectorMetadata | None = None
\ No newline at end of file
diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py
index 16808417766a..deaf4865f2a5 100644
--- a/vllm/v1/core/sched/scheduler.py
+++ b/vllm/v1/core/sched/scheduler.py
@@ -8,6 +8,8 @@
 from typing import Any
 
 from vllm.config import VllmConfig
+from vllm.distributed.ec_transfer.ec_connector.base import ECConnectorRole
+from vllm.distributed.ec_transfer.ec_connector.factory import ECConnectorFactory
 from vllm.distributed.kv_events import EventPublisherFactory, KVEventBatch
 from vllm.distributed.kv_transfer.kv_connector.factory import KVConnectorFactory
 from vllm.distributed.kv_transfer.kv_connector.v1 import (
@@ -96,7 +98,11 @@ def __init__(
             self.kv_events_config,
             self.parallel_config.data_parallel_rank,
         )
-
+        self.ec_connector = None
+        if self.vllm_config.ec_transfer_config is not None:
+            self.ec_connector = ECConnectorFactory.create_connector(
+                config=self.vllm_config, role=ECConnectorRole.SCHEDULER
+            )
         num_gpu_blocks = self.cache_config.num_gpu_blocks
         assert num_gpu_blocks is not None and num_gpu_blocks > 0
 
@@ -220,12 +226,14 @@ def schedule(self) -> SchedulerOutput:
 
             # Schedule encoder inputs.
            encoder_inputs_to_schedule = None
+            external_load_encoder_input: list[int] = []
             new_encoder_compute_budget = encoder_compute_budget
             if request.has_encoder_inputs:
                 (
                     encoder_inputs_to_schedule,
                     num_new_tokens,
                     new_encoder_compute_budget,
+                    external_load_encoder_input,
                 ) = self._try_schedule_encoder_inputs(
                     request,
                     request.num_computed_tokens,
@@ -325,6 +333,11 @@
                 for i in encoder_inputs_to_schedule:
                     self.encoder_cache_manager.allocate(request, i)
                 encoder_compute_budget = new_encoder_compute_budget
+                if external_load_encoder_input:
+                    for i in external_load_encoder_input:
+                        self.encoder_cache_manager.allocate(request, i)
+                        if self.ec_connector is not None:
+                            self.ec_connector.update_state_after_alloc(request, i)
 
         # Record the LoRAs in scheduled_running_reqs
         scheduled_loras: set[int] = set()
@@ -428,6 +441,7 @@
             num_computed_tokens = request.num_computed_tokens
 
             encoder_inputs_to_schedule = None
+            external_load_encoder_input = []
             new_encoder_compute_budget = encoder_compute_budget
 
             # KVTransfer: loading remote KV, do not allocate for new work.
@@ -468,6 +482,7 @@
                     encoder_inputs_to_schedule,
                     num_new_tokens,
                     new_encoder_compute_budget,
+                    external_load_encoder_input,
                 ) = self._try_schedule_encoder_inputs(
                     request,
                     num_computed_tokens,
@@ -568,7 +583,12 @@
             for i in encoder_inputs_to_schedule:
                 self.encoder_cache_manager.allocate(request, i)
             encoder_compute_budget = new_encoder_compute_budget
-
+            # Allocate for externally loaded encoder caches
+            if external_load_encoder_input:
+                for i in external_load_encoder_input:
+                    self.encoder_cache_manager.allocate(request, i)
+                    if self.ec_connector is not None:
+                        self.ec_connector.update_state_after_alloc(request, i)
         # Put back any skipped requests at the head of the waiting queue
         if skipped_waiting_requests:
             self.waiting.prepend_requests(skipped_waiting_requests)
@@ -576,6 +596,7 @@
         # Check if the scheduling constraints are satisfied.
         total_num_scheduled_tokens = sum(num_scheduled_tokens.values())
         assert total_num_scheduled_tokens <= self.max_num_scheduled_tokens
+        assert token_budget >= 0
         assert len(self.running) <= self.max_num_running_reqs
 
         # Since some requests in the RUNNING queue may not be scheduled in
@@ -658,6 +679,11 @@
         if events:
             batch = KVEventBatch(ts=time.time(), events=events)
             self.kv_event_publisher.publish(batch)
+
+        if self.ec_connector is not None:
+            meta = self.ec_connector.build_connector_meta(scheduler_output)
+            scheduler_output.ec_connector_metadata = meta
+
         self._update_after_schedule(scheduler_output)
         return scheduler_output
 
@@ -758,7 +784,7 @@ def _try_schedule_encoder_inputs(
         num_computed_tokens: int,
         num_new_tokens: int,
         encoder_compute_budget: int,
-    ) -> tuple[list[int], int, int]:
+    ) -> tuple[list[int], int, int, list[int]]:
         """
         Determine which encoder inputs need to be scheduled in the current
         step, and update `num_new_tokens` and encoder token budget accordingly.
@@ -767,7 +793,7 @@ def _try_schedule_encoder_inputs(
         - Its output tokens overlap with the range of tokens being computed
         in this step, i.e., [num_computed_tokens, num_computed_tokens +
         num_new_tokens).
-        - It is not already computed and stored in the encoder cache.
+        - It does not already exist in the remote encoder cache (via ECConnector).
         - There is sufficient encoder token budget to process it.
         - The encoder cache has space to store it.
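To make the new control flow concrete, here is a condensed sketch (mirroring
the hunks above, not runnable on its own) of how schedule() consumes the
4-tuple now returned by _try_schedule_encoder_inputs(): inputs whose caches
already exist remotely skip encoder execution but still get encoder cache
space allocated, and the scheduler-side connector is told to load them:

    (
        encoder_inputs_to_schedule,
        num_new_tokens,
        new_encoder_compute_budget,
        external_load_encoder_input,
    ) = self._try_schedule_encoder_inputs(
        request, num_computed_tokens, num_new_tokens, encoder_compute_budget
    )
    for i in external_load_encoder_input:
        # Reserve encoder cache space without running the encoder.
        self.encoder_cache_manager.allocate(request, i)
        if self.ec_connector is not None:
            self.ec_connector.update_state_after_alloc(request, i)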
@@ -779,12 +805,16 @@ def _try_schedule_encoder_inputs(
         blocks and externally cached blocks (via KVConnector).
         """
         if num_new_tokens == 0 or not request.has_encoder_inputs:
-            return [], num_new_tokens, encoder_compute_budget
+            return [], num_new_tokens, encoder_compute_budget, []
         encoder_inputs_to_schedule: list[int] = []
         mm_features = request.mm_features
         assert mm_features is not None
         assert len(mm_features) > 0
 
+        external_load_encoder_input = []
+        # Check the remote cache first
+        if self.ec_connector is not None:
+            remote_cache_has_item = self.ec_connector.has_caches(request)
         # NOTE: since scheduler operates on the request level (possibly with
         # multiple encoder inputs per request), we need to create temporary
         # trackers for accounting at the encoder input level.
@@ -865,6 +895,11 @@ def _try_schedule_encoder_inputs(
                 num_new_tokens = 0
                 break
 
+            if self.ec_connector is not None and remote_cache_has_item[i]:
+                mm_hashes_to_schedule.add(request.mm_features[i].identifier)
+                external_load_encoder_input.append(i)
+                continue
+
             num_tokens_to_schedule += num_encoder_tokens
             encoder_compute_budget -= num_encoder_tokens
             mm_hashes_to_schedule.add(request.mm_features[i].identifier)
@@ -874,6 +909,7 @@ def _try_schedule_encoder_inputs(
             encoder_inputs_to_schedule,
             num_new_tokens,
             encoder_compute_budget,
+            external_load_encoder_input,
         )
 
     def get_grammar_bitmask(
diff --git a/vllm/v1/worker/ec_connector_model_runner_mixin.py b/vllm/v1/worker/ec_connector_model_runner_mixin.py
new file mode 100644
index 000000000000..00bc909df297
--- /dev/null
+++ b/vllm/v1/worker/ec_connector_model_runner_mixin.py
@@ -0,0 +1,87 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""
+Define the EC connector functionality mixin for model runners.
+"""
+
+from collections.abc import Generator
+from contextlib import AbstractContextManager, contextmanager, nullcontext
+from typing import (
+    TYPE_CHECKING,  # noqa: UP035
+)
+
+import torch
+
+from vllm.distributed.ec_transfer import get_ec_transfer, has_ec_transfer
+from vllm.distributed.ec_transfer.ec_connector.base import ECConnectorBase
+from vllm.logger import init_logger
+from vllm.v1.outputs import ECConnectorOutput
+
+if TYPE_CHECKING:
+    from vllm.v1.core.sched.output import SchedulerOutput
+
+logger = init_logger(__name__)
+
+
+# Defined as an EC connector functionality mixin for ModelRunner (GPU, TPU)
+class ECConnectorModelRunnerMixin:
+    @staticmethod
+    def maybe_save_ec_to_connector(
+        encoder_cache: dict[str, torch.Tensor],
+        mm_hash: str,
+    ):
+        if not has_ec_transfer():
+            logger.debug("No EC transfer agent is initialized; skipping save")
+            return
+        connector = get_ec_transfer()
+        connector.save_caches(encoder_cache=encoder_cache, mm_hash=mm_hash)
+
+    @staticmethod
+    def get_finished_ec_transfers(
+        scheduler_output: "SchedulerOutput",
+    ) -> tuple[set[str] | None, set[str] | None]:
+        if has_ec_transfer():
+            return get_ec_transfer().get_finished(scheduler_output.finished_req_ids)
+        return None, None
+
+    @staticmethod
+    def maybe_get_ec_connector_output(
+        scheduler_output: "SchedulerOutput",
+        encoder_cache: dict[str, torch.Tensor],
+        **kwargs,
+    ) -> AbstractContextManager[ECConnectorOutput | None]:
+        return (
+            ECConnectorModelRunnerMixin._get_ec_connector_output(
+                scheduler_output, encoder_cache, **kwargs
+            )
+            if has_ec_transfer()
+            else nullcontext()
+        )
+
+    # This context manager must be used within an active forward context.
+    # It encapsulates the entire EC connector lifecycle within execute_model.
+    @staticmethod
+    @contextmanager
+    def _get_ec_connector_output(
+        scheduler_output: "SchedulerOutput",
+        encoder_cache: dict[str, torch.Tensor],
+        **kwargs,
+    ) -> Generator[ECConnectorOutput, None, None]:
+        output = ECConnectorOutput()
+
+        ec_connector = get_ec_transfer()
+        assert isinstance(ec_connector, ECConnectorBase)
+        assert scheduler_output.ec_connector_metadata is not None
+        ec_connector.bind_connector_metadata(scheduler_output.ec_connector_metadata)
+
+        if not ec_connector.is_producer:
+            ec_connector.start_load_caches(encoder_cache, **kwargs)
+
+        try:
+            yield output
+        finally:
+            output.finished_sending, output.finished_recving = (
+                ec_connector.get_finished(scheduler_output.finished_req_ids)
+            )
+
+            ec_connector.clear_connector_metadata()
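As a usage note, the worker-side round trip exercised by the unit tests above
boils down to the following standalone sketch; the VllmConfig wiring is
elided ("...") since it mirrors the producer/consumer test fixtures, and the
hash and tensor shape are illustrative:

    import torch
    from vllm.distributed.ec_transfer.ec_connector.base import ECConnectorRole
    from vllm.distributed.ec_transfer.ec_connector.shared_storage_connector import (
        ECSharedStorageConnector,
        ECSharedStorageConnectorMetadata,
        MMMeta,
    )

    # Producer (E instance) writes the cache to shared storage.
    producer = ECSharedStorageConnector(vllm_config=..., role=ECConnectorRole.WORKER)
    producer.save_caches({"img_hash": torch.randn(10, 768)}, "img_hash")

    # Consumer (PD instance) loads it back, driven by scheduler-built metadata.
    consumer = ECSharedStorageConnector(vllm_config=..., role=ECConnectorRole.WORKER)
    meta = ECSharedStorageConnectorMetadata()
    meta.add_mm_data(MMMeta.make_meta("img_hash", 100))
    consumer.bind_connector_metadata(meta)

    cache: dict[str, torch.Tensor] = {}
    consumer.start_load_caches(encoder_cache=cache)  # loaded tensor lands on CUDA
    consumer.clear_connector_metadata()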