From 2d3edbc34232f803d0bfd8e2ec489edb72fb8a1c Mon Sep 17 00:00:00 2001 From: ningwenchang Date: Thu, 25 Sep 2025 08:30:11 +0000 Subject: [PATCH 1/2] fix: Handle encoding errors in selectolax by switching to BeautifulSoup --- .../main_html_parser/simplify_html/simplify_html.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/llm_web_kit/main_html_parser/simplify_html/simplify_html.py b/llm_web_kit/main_html_parser/simplify_html/simplify_html.py index 3c37da8e..05b8352e 100644 --- a/llm_web_kit/main_html_parser/simplify_html/simplify_html.py +++ b/llm_web_kit/main_html_parser/simplify_html/simplify_html.py @@ -3,6 +3,7 @@ import uuid from typing import Dict, List, Tuple +from bs4 import BeautifulSoup from lxml import etree, html from selectolax.parser import HTMLParser @@ -858,8 +859,12 @@ def simplify_html(html_str) -> etree.Element: _xpath_mapping: xpath映射 """ # 使用selectolax的HTMLParser来修复html - soup = HTMLParser(html_str) - fixed_html = soup.html + try: + soup = HTMLParser(html_str) + fixed_html = soup.html + except Exception: + soup = BeautifulSoup(html_str, 'html.parser') + fixed_html = str(soup) preprocessed_html = remove_xml_declaration(fixed_html) # 注释通过lxml的HTMLParser的remove_comments参数处理 From adb2185a49178fbcb8f934e6019cb95f4d684432 Mon Sep 17 00:00:00 2001 From: ningwenchang Date: Tue, 14 Oct 2025 19:30:01 +0800 Subject: [PATCH 2/2] [fix]: add unittest case for unusual encoding html --- .../simplify_cases/unusual_encoding.html | Bin 0 -> 27980 bytes .../parser/test_tag_simplifier.py | 18 ++++++++++++++++++ 2 files changed, 18 insertions(+) create mode 100644 tests/llm_web_kit/main_html_parser/parser/assets/test_html_data/simplify_cases/unusual_encoding.html diff --git a/tests/llm_web_kit/main_html_parser/parser/assets/test_html_data/simplify_cases/unusual_encoding.html b/tests/llm_web_kit/main_html_parser/parser/assets/test_html_data/simplify_cases/unusual_encoding.html new file mode 100644 index 0000000000000000000000000000000000000000..969074b3d98553b2cb117f22e3e1e92648000413 GIT binary patch literal 27980 zcmeHwd2{1H#XE%^Q!G8rdsWqR74>E+Iyxp&QrZ^Cl3Ge( zT>U!2#(2jIc*7fBz{Uo=0LC_6zrp(u)5`6y{tEAR?#)c4WE%@GT}<~lsFHN^uIHX} z?%D3i^PgS%?Zs<9UcD^T%MDTC>JJybzH(7;dA#0#E4k?PUb=Qk`1jvj`{rxGU+fdE zCB;}OB1hseG3xbR{?;Y9>SZ}`&g*S#Y%FdpDUK)Wyw|?--lD$z)Y$y$ksU3I!*bYF zbN(}r=l79X;mT#9Quss7c{Y3~DyCA^u88CkZZ$+>_kNafpv!XuWFYp-IBco zg)fCry_if%a&=uWP~s1`1g|~&E0;0h{2tH#yXS|mJQw2)2~my&qtaQs)3aADS4-hK z>5EW(GTtCn`_G~$2TbIM9F=N*^?9JQG*I^CuRbsF`zn3(S?;ear_V}XIh6-W{S{U8 z=`XFU#3%NE{Vpl5C@J~!yqEXGCl(VikSi>uLdi&iaK*944;@Lny`9-^*ZP)}m5t2y zGo^o2XWUtaQro4HjcWk4fw((MXHm2Npsb@4X!wA`iawVd-WhBzf46 zFN8=8R4+z7sgM{27JVT!h_@mQ=?1HSle2Ik9+y)f^#nuwXe4$+NJ`OaS4wV*N~wBD z!ZKMWo_Tyuq7om+}U!zKdBiPH@-ir3NgKy8j46vfET&HKvH~_19X?T+ z9R+j$e*n6MfgJOU9e{|YY71t{|gLu5Y+B{5kWxrH7ao$@-KNGwJ`I3H=K)RA)b(8c$5W|*W194IQBT0mSr#wLN#>6fYKmh zapFxID%S6s^XaGsDQb&eqmi2PkkZ&$X=-D`G0t zDyL)_o#ET@+tug$+R&Ub z*y2t^VnxTaDl4xD){3FZs>(3X!Wg@g+1WU`m9L__LMjiJ*9OBujMe_a=-zbSiiq)| zliT_O;VLu?i2f>!sJE)a6IklGN#WeQ7ScFmF1_G2dIv z_L1=l?3!K3H<4|uWABSCY{|zW(g4!LP>x3>j~J41h!f@8d+w7eL@8KVRa#kwJui?N z=p6D|2x|5!bLu#W41hm^6PJsvc^_J+^?)tt&{HQB^M|~|0-wnV2&&L?gzs?(NXdMd zK{pAC#0NZBkPLt`j^N-Gb_JI}#6wswghg3&=yj0tsPA;b#h;|2aanALr(?tx%(4bC zB}>Uv0=f|xhtCJj!FoxFH>HG>tc@od#8?R8A&z%|r!bbphD~|>#n%(yN^gfdX;Qh% z3W-cd5mL2L9L}QbVXBOWb(%D%cMFp(Gru=gy8;#E*1Qj|L>7}+uw!i!bV5>4uf`jt zWJxi8N>^gPP0OyD@1;gO5k%p9Fj;d!k^vgw>qxC+qQwdGq@5xy@@+FMtF$3}>`cq- zT{wQG<-RiEn{-r;)TUz~9a8^=%TXyLC!tr+?7tReh-&77z7a$9K%uuTgMk`nay%OK zfPf(Hk*tZ6F9?hYK`mx@0l=44Iv`(FRzkFC@opkN4^S*6aVe2b2XJK0B~0dg7)@b` z2*;th$dGMA)<*t~ku~t4$QtCxno0*x#@CVnObDFLj>Up3Egp{rUxmZM^>`EH^0#hn zs#ZQsNdgx|vis^JsFKQ)Cs+uY3Qh5JQi#4M3B7fSW&X;OU>GO_hTlq!f}Dg=N{H7A zm#+LM)QSy}C}dk8Wh9goYvs4X_Gcs;#%IFuMkrn!m;S~iQ}UrCWAgVrk}1ieWJ#69 zEy3hV%1?*M|5{AdlL}1=?Qb9?3%^T?(TLpi4xHXe;g0DB{Eq3C7C_^LkkG$LQ5aq% z&^XtMYPbquvUXDo`V>%CZ<&4kRU$?$7qL0jS~H~vcO0!8E+KupGTzHXzA$j7)c4W^Xo-D4Pg%nm5$b|cl>73Shzev zx*1=MqY?=YQHg6_9^xF7A1Yh7^Rz=_Vn-QUbpJ6@q)v9ZaVixVV*{jW z!3@;)999>#b3l6rTXbK*=l3aZ)|I{;ZEg*gl+0eQyHSc1;gPm6s2)sbwjUeKmIGKXV@L4LR-IA`n#d`W04z~osldQ9QviQGh_9c*&9}No0TU$e`&t04z}2!diKCH z9qF&0lG&Y4{NAoI^q3~TGOG==>#x_9u3l~XfJ!>1?dcoi-I?89b$CvFzOAoMlWl6^ zj?y=de#{la5WhLEbWa!^{o4Lx<<5Q7(HEDM+hgt^01rbPTnfhQc4q%TUnNfhV4+Q{ z1LTn+y<;6M2|nA~Zk*ypYV#9pD}m+ih`T;57Af6}XhoBHa?gE3DpI?f89>a(>gJ|C z(WBn&pc?)A8Q_oAaIK%~;udrOJqOrqIMAj(UeaIFrtuXOLyt_us$W4fpk ze<(DbJy3@`Gus4t{8ZXr=fIJ?`z|r47sIsA-A~;y;01_nYuOmPtE_YxQ^eLXTfJw_u+fakhidz6 z?a{FQv|a5U)w}oAmhrzdH=_{!one1qMf&U@O=@#qA7u#C^&0L%W&RDgAnaGP?_dZA z&zu{|_>DTWZ!8aCN6ad_o8U&^8R!gX0=!oHp15x??CRXm?`&y9llVuF>K_0CW5CSb zB+yvv>u^VJ7O8!U>cA}69e5qq4V$7eaF26<%x;T1ziclZn5}PucXz{Prne9g_TKG$~QVO^*;N=R{MWy2uB3^pQi!d zNKdUqALulz`#ZX~%p}=<91mmZh9FE{7~9kWdLDaN^RbL8(@RV(yXafgdRkOqzuu z7utj*_^IJGirgTKbZp>}sAk)?3WZi>+1sIYH5m`5xjJ(gV3xhvF{@JpSUy_8{~Aw5 zWa&S++Hz{>m2ZYQ8Nd9MZwDFdhtcq1VDSvawza8SibW4zs0)I#OyhTqWF@{ zN^mhTHyg<*Jqu>t{d1M-0-C97S%>ddUt-$Q43f9Fw+GY3;7 z`SkDG53Np=!CA-u|Bgq&{f7?VLNC=9JJp$c>cdf`y^R*QIIFa6LRUjfoBJo|De7pCx-rYeen(5~ z^&S1ulz#9KLLwew&p2{8^lZepajYt4JfZKck$66C~GhG z5gDUCf2pj|E}=wU=vQZU*^F#IxcYEXeL;#>=FPObNh;!<0D0A}HxSD8#oNlrin=|b zj@^dAM1Q#mjRa~K+cl6Q3EfcjM(z-MFkR~;&cnp`);BELL22HFY|nK@2=Le_i>yf_*j}K1 zpKWNJ%g}4onN46+At-UA4m~kJx+ASDPlr|}U>G5*2!@3Ep?AM9+767i`zUcL(qL~; zn7JaY--ez)4g{rbWn9#OCA396kcV4ivbzf=>ELvky+K;l1hbfMTZ5(WE{wQg14kee zsfYTD0c90Imc(5%dk1_xn3I&pGg{Xi{;A#F?jRyZ`e!*H^*f#FK)*3{U!Q$s%yrNl z`v)@n6VQ8z=|1dnU_GfuITi(Vcn#XcHzAad@2Zt4dJdy4l_W&;TSXfL_o8GJ&U!0E0K>Z3_t0Z}1j&_;`^@mjeSk)Axbj%twHFH2$rt6~u>^L0U0 z|0slu8w)ubUXX6WRTEQxg{6drTf#SUqGFN@DaLL60SBHtFz@e+(=Z30gSI^%9pxRM zw3!dKI_sC?T#Rd+r*Nx;9CMB_dGB<4g4nS$W)}eoFt=Q9LY&f(fx4l117Zt_#Ft-->D z;}2+yE0CUA;jE# z+LQYwJ|7jfJy9R@DVCc`>*&_F5wR1R%#MV(+`-3pcK9Z#VwYi}M> zz&X5fee-Z5pj@5W*EYMMwCPU|l+BmgPCs{wsr~oKZP>qRdl>K?S9sK}HMFuU;r2sg zVZ>bW?c9oX7tg8AO~Zx_vMpfMkPM~&%lfVW#uK66;-FfjFjzRIEcYQK)YI3 z2b@>NgCP)+)AP?6jNZBhZ#*nzM1aRI812Ejx<<{N+(o_fL>qzE?g;LpA9Ns|fm?YL z2X|2>9~sNH)hTj;I=Ktnq}tN9F;9+hCwBn@RfkqHI}fP*Fn2lV1vkK><5&xv#Ed=B zXopw}c%rg)htKg-V=aZtkxrqIX;vRcEc`HC;^)%m7-K_}&ObUo0|-2UpMjtG{k}j+ zIm9};Z)1vP?W+?7*@~^nMH{Q(Gs&c(^!~!_0bmwH6Xn4vR{N zRWXLf@W{!@V4j0r)0iY%tCN$d?FiFo)}9lGb8<46GchO(Z79dI)fY2dL{qz;sl$(0 zY(WdjKE0a=4p6o>v?smVnlzM0zpFUe2{s=;xiAaTkt|s$lK#344T76kj`N34ho$- zB(v8K#{^U#lskC{BBc-m$D_awOGDbqsPS}^yA_TThz_QdwjDT8Xz9;}V1I|rpByn0 z>&DOkJU%qLfmVcEQaQO(ru8R}cqrNtJg9p%vqOHZPmKZn%+eF5a+CG_1DUfRWfXMJ z<3p_HpFFE1Q41{VZ_g}!WmO-6SweNxwpTS4(t4O#!U3pn&2i!7IDt#KH;I4hY%4Je zhiwWmoXQq?jSmZ4P#x9hv&sZVY*o2jq;qpNGz zn*zR{MO`Z^36xSe?CAJ28o7~p?&d$o1Z|u$}VX|$1xDPY0~~06%I2BZ3n?M zh{MR?65M8_K4cP{gG(3_n+RypACjFbN2*$Wq>hjp=VTGe>Kk=*M(?7yBqx_(w{$ik zg`xZiE}?BdQCj*ldwqEP_aPODrAIp&Den~XRpzR8$Ak&k!ccDQcW^EWDSqMpMubAK z+3|WB4n|8>GZQB;b&SfcTFVoDosLFXUZ8wcjcKtq7=)Yt+_l+H_^rC5)3_jbvOx&L zyM;UXjZzSK*AVI`BA}&yq^CR@w0Yu5@Vkl(WGSSe zs70c+4+bAc$oGU+K`OxML5tsB1k8{x3$6~J3f(q65z%C13lb4I{(*>QlI{nQX=x$m zrPBkL0MiA6lhvMffXlq-dJ@o56oP_;kVx}#S|H1EyT&r#d!ky|-xzWLm@%`gt&b{Q zPnFK*6LIZFF&Pt5p-3u)Jb2(v4 z!Tk;!3VF~P#tO%b&0 zrk>+DEm2~}aPBJ2r61J_P>E}T=+g>J<=}7(lEr|$D~vfPAk-sy3U2Y#=cVEJ5M(GwDtixP_}@OijRxxA zI<7S1@=1aC0OKF*__!6p{dUmewVvBB`t#c=h$%r%;xlI`p8m`kmXN{TR5HSkyuq~q z{YkS`i2a9dytb{ZZdpY&$79QXMZC!68XtlU1xcOng$;rC#)kaOMwA|laU^0T4)OP} z<<3L)@M*!$N@o?@4shtnyWCu03VJeLM>-Gdhzh<+u*=;PDr%Q~&2AhB7jT;)Tk#@g zNer<{xkA9@+yq29XFP6|A9cpt7dLV-`Iu9_AajR#&y#f#f-YCEsl&mQj}s^IqIeEf zV>|e*@OcnLTvCQ` z7wYUXQX52~=%>80s;cB){DE@HhfYtBi$s}fJ}1x%7kF|c>j^G8vBVvCYOfJq7h~cj za(N)ZNM?H&k-u~u6kKC+)J=Isk*Q=^d%RD|-25`CAsSY&Se%8~xx~klPJYr?p>PGK zvJkb-TIdm1zTSnEw2ft!o=RWd1o9SHF#=Ya z=kYrh3Fymvgsu9s9(06Dk+|w-hg>@@>>+9tS+Xd1%=%MU_i;-Pmr-#68+Uz`mOY*- zxH$JJKDXfILM6|DXC4vnlyc3yAszznFE|~x_ooEMGycj17jhkEQ(?0oJo`bl3+4QM z3-KV5${v-Y){!CEg(S6{=yFN#qbFk{>~P$22vk-EO9R1Rr7sXV>pNRyqqc|}6UKv= z*dc9>-FO{DW}LhF`%c%tM6`E*_j+El~~{E;+VF! zhftL>XISPmTx`&vzC3e=(xc&GE}sCtU=CjxI}gG8fJf?Jw~M2VOY42fZvpp@aCEY8 zQ*BwrrR5^v)z6nzE>fFBei)W;U2rx0l<*gesf2*^^(-YWNUhox!|ZIH7z0<-nAOHQAG2AUqPVH#5tG0(JJk+Q3NxqP{+Io32TLUP z;EzB1PP!S7ruiF6vU$y|D_-V%4C=&&AZ5XjS�R)hi#8;+Hb_Pn2;R4d-xsm|7G~a3Q%U>Xe9?#3|E)xFq%U|3SlR_|2 z9gc_6q{|eC;HZ!#B-b6Z8Uwl1v_DgfU!Y_aEXF-Z0sW+D5><8Vjx+I`5A*DDdM#5> z^JS&7>jy!Zi%(r>x+d0r3)bs)aWJ^d1*M7;A~HJ4-^RmIaU=#un0!IPU2@4COnf1v f2v~nEa?=NJoxRzLHWhCr3O51!=@n5m|KtAynMCd; literal 0 HcmV?d00001 diff --git a/tests/llm_web_kit/main_html_parser/parser/test_tag_simplifier.py b/tests/llm_web_kit/main_html_parser/parser/test_tag_simplifier.py index 45962b1e..e28cc0ce 100644 --- a/tests/llm_web_kit/main_html_parser/parser/test_tag_simplifier.py +++ b/tests/llm_web_kit/main_html_parser/parser/test_tag_simplifier.py @@ -472,6 +472,24 @@ def test_tag_simplifier_block_select(self): self.assertIsNotNone(p_element.get("_item_id")) self.assertIsNotNone(p_element.get("cc-select")) + def test_tag_simplifier_unusual_encoding(self): + file_path = base_dir / 'assets/test_html_data/simplify_cases/unusual_encoding.html' + with open(file_path, 'r') as file: + raw_html = file.read() + + data_dict = {PreDataJsonKey.TYPICAL_RAW_HTML: raw_html} + pre_data = PreDataJson(data_dict) + + pre_data_result = HtmlTagSimplifierParser({}).parse(pre_data) + + simplifier_raw_html = pre_data_result.get(PreDataJsonKey.TYPICAL_SIMPLIFIED_HTML, '') + simple_id_count = self.check_and_find_max_item_id(simplifier_raw_html) + self.assertEqual(simple_id_count, 102) + + raw_tag_html = pre_data_result.get(PreDataJsonKey.TYPICAL_RAW_TAG_HTML, '') + tag_id_count = self.check_and_find_max_item_id(raw_tag_html) + self.assertEqual(tag_id_count, simple_id_count) + if __name__ == '__main__': unittest.main()