From a4056db6850ce8ae6175ac75576cf6e368ee6a34 Mon Sep 17 00:00:00 2001 From: Kaival Parikh Date: Wed, 29 Jan 2025 10:54:21 +0000 Subject: [PATCH 01/17] Add a Faiss wrapper for KNN searches --- gradle/generation/extract-jdk-apis.gradle | 16 +- .../extract-jdk-apis/ExtractJdkApis.java | 3 +- gradle/generation/regenerate.gradle | 1 + gradle/java/core-mrjar.gradle | 5 +- lucene/sandbox/src/generated/jdk/jdk22.apijar | Bin 0 -> 17070 bytes lucene/sandbox/src/java/module-info.java | 4 + .../faiss/FaissKnnVectorsFormatProvider.java | 92 ++++++ .../sandbox/codecs/faiss/package-info.java | 21 ++ .../codecs/faiss/FaissKnnVectorsFormat.java | 75 +++++ .../codecs/faiss/FaissKnnVectorsReader.java | 197 +++++++++++++ .../codecs/faiss/FaissKnnVectorsWriter.java | 204 +++++++++++++ .../sandbox/codecs/faiss/LibFaissC.java | 268 ++++++++++++++++++ .../org.apache.lucene.codecs.KnnVectorsFormat | 16 ++ 13 files changed, 899 insertions(+), 3 deletions(-) create mode 100644 lucene/sandbox/src/generated/jdk/jdk22.apijar create mode 100644 lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/faiss/FaissKnnVectorsFormatProvider.java create mode 100644 lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/faiss/package-info.java create mode 100644 lucene/sandbox/src/java22/org/apache/lucene/sandbox/codecs/faiss/FaissKnnVectorsFormat.java create mode 100644 lucene/sandbox/src/java22/org/apache/lucene/sandbox/codecs/faiss/FaissKnnVectorsReader.java create mode 100644 lucene/sandbox/src/java22/org/apache/lucene/sandbox/codecs/faiss/FaissKnnVectorsWriter.java create mode 100644 lucene/sandbox/src/java22/org/apache/lucene/sandbox/codecs/faiss/LibFaissC.java create mode 100644 lucene/sandbox/src/resources/META-INF/services/org.apache.lucene.codecs.KnnVectorsFormat diff --git a/gradle/generation/extract-jdk-apis.gradle b/gradle/generation/extract-jdk-apis.gradle index 3adde87da838..2e181811a9b2 100644 --- a/gradle/generation/extract-jdk-apis.gradle +++ 
b/gradle/generation/extract-jdk-apis.gradle @@ -19,9 +19,23 @@ def resources = scriptResources(buildscript) configure(project(":lucene:core")) { ext { - apijars = layout.projectDirectory.dir("src/generated/jdk") mrjarJavaVersions = [ 21 ] } +} + +configure(project(":lucene:sandbox")) { + ext { + mrjarJavaVersions = [ 22 ] + } +} + +configure([ + project(":lucene:core"), + project(":lucene:sandbox") +]) { + ext { + apijars = layout.projectDirectory.dir("src/generated/jdk") + } configurations { apiextractor diff --git a/gradle/generation/extract-jdk-apis/ExtractJdkApis.java b/gradle/generation/extract-jdk-apis/ExtractJdkApis.java index c84c8f16996d..d2d60d2545aa 100644 --- a/gradle/generation/extract-jdk-apis/ExtractJdkApis.java +++ b/gradle/generation/extract-jdk-apis/ExtractJdkApis.java @@ -54,7 +54,8 @@ public final class ExtractJdkApis { private static final String PATTERN_VECTOR_VM_INTERNALS = "java.base/jdk/internal/vm/vector/VectorSupport{,$Vector,$VectorMask,$VectorPayload,$VectorShuffle}"; static final Map> CLASSFILE_PATTERNS = Map.of( - 21, List.of(PATTERN_PANAMA_FOREIGN, PATTERN_VECTOR_VM_INTERNALS, PATTERN_VECTOR_INCUBATOR) + 21, List.of(PATTERN_PANAMA_FOREIGN, PATTERN_VECTOR_VM_INTERNALS, PATTERN_VECTOR_INCUBATOR), + 22, List.of(PATTERN_PANAMA_FOREIGN) ); public static void main(String... args) throws IOException { diff --git a/gradle/generation/regenerate.gradle b/gradle/generation/regenerate.gradle index d23cfd7d54f0..8edaabc77d80 100644 --- a/gradle/generation/regenerate.gradle +++ b/gradle/generation/regenerate.gradle @@ -91,6 +91,7 @@ configure([ project(":lucene:queryparser"), project(":lucene:expressions"), project(":lucene:test-framework"), + project(":lucene:sandbox"), ]) { task regenerate() { description "Rerun any code or static data generation tasks." 
diff --git a/gradle/java/core-mrjar.gradle b/gradle/java/core-mrjar.gradle index b8b116800e01..cc3ac76fba88 100644 --- a/gradle/java/core-mrjar.gradle +++ b/gradle/java/core-mrjar.gradle @@ -17,7 +17,10 @@ // Produce an MR-JAR with Java 19+ foreign and vector implementations -configure(project(":lucene:core")) { +configure([ + project(":lucene:core"), + project(":lucene:sandbox") +]) { plugins.withType(JavaPlugin) { mrjarJavaVersions.each { jdkVersion -> sourceSets.create("main${jdkVersion}") { diff --git a/lucene/sandbox/src/generated/jdk/jdk22.apijar b/lucene/sandbox/src/generated/jdk/jdk22.apijar new file mode 100644 index 0000000000000000000000000000000000000000..ae4b063641ba5a89f9b74a85848424dd9eba73b3 GIT binary patch literal 17070 zcmbVzWmFx=(k&9)U4y&3ySux)yF-Encb5=?yNBQsIJi531b4TfK_ATh-ZvSB$=vsv zwa!|HA5~}d?%Gwot6D)C^c5-)BqSsd5D=ltL!dK*JKb9ys-CsRvv zJ90#a-#b2}Gk7mrYEA;4w7QhPihavd=@CO{7;e=S2*GWI@-}yf5FWUatyNt;M zLhOJ6FQ;r<_g}X{jzozv%6Z2S&VoI;_L-!X)d04|TG5xI8_D#DVfGH6ca5o=`j{?Y zW^!AbAW^iu;kGnh&OzCR>(251d%~5_*Q}oQ%7L#2K>#g)AK-_(;VAzpCKm3`);%c* zH5gV-zWs%XOU8idp?7v$bhd5KpW_(u&CrD<2N|gbv(|je-c^`b=!+L&we(A?5kWDr zr&Oraq%^+W@OtN#VLWy&-5`%i?v-q;8j0W)bUMb7%mOKxA|2-CHvmaZ7Z+Hb!{Ceu zR%l_dCXP<5#Bv%%o2*$M)O%2RCs!UT_|Op^xF71H-E@3dhqr{%r0;{w9vmh@PaQ2C z_nrE6OwMM}k7D#jH_vZD7uB0op?az|hxj&IEUIc<9%PZ2Qm{!bWApHk%WTtU zaZ~b%LJLXrn0fh|7*n6BTy_IoIMU2hFlpL}7&LP96^!FBH9)gw{ai74XB`|-(&+2} z-b(e7-Z6c);RrhSh80Wx;x&uew;((hLaUiT3T2%Y5pW*6t7J7gXUm!yOM|XA(cXuP z0>1D~74dt(yymWS55Nl25RmX)9mN;GKtS|hKtKxrc%ojve4?C8?F^rtBv-|LIS@fa z?{22f0wTJi1`=>&4y4fxGM&T*T#1%TSYG119k$8bu(nMA!f|dSD`*%YY{;&Om}MHLog^}Sn+?f|5*-O|t|bSBh9ajYnx1de$SHksz2=zl z^*dsjX5AnXc;g7?Tt2U$m$1+$eh)&H!`kE%2`KA)(y7cm*@n@SPp*()i5$2pSA{=Q zXT>8vQ^BiwG<4*mN(UO(F!KN`gUz=cTK$}Tv?E!2xO(9)##~AomRnx&_2WNbu{k8v z!|{lP9LVpmpn4GtF;_cd7fX9P5mRSlCrbwxd#7i>lq>SegD_(HghS>~K)%Zii|9lO z(Z-cwAS3As^*88h;7DqI7nIw2=0y@2eNPJE>&HK!5iRl#(J)Imv&B9$rv6~&vX|Ts 
zS`Kog$wk06COmo}Is7EL%3gTIg03452DjUrRuwsrfZLEwOLudd$Ptun;TF7mI)m4J`si?-Ww!z1(tdD{WXG)PNXFc+z-5V3@4yZh!)`Wd~tk4bQ$z_ zh!VeusJN58tHaA|9X~ABA%Ge*N`S5a0J*1QQDirr4%%lRb2T>Fm7ll*0*!W+@a=Oc zS((_>6Oq z<5M!qL1aUk)yk_gX4E({j#|6q0P~FkeQr}l>eOrM!)pj@kEwMo)f%fy!6F4K+Kd(# z4rq_~&@O|bENJk6MfGy2`T*LJi{g?jNmrOTpyDvMiq|f{lY|TX(_MBIwEIY%4g!UO zHm`A(o{%6@y`X1!M56Auv4!^{5;B%{)~3(nsaAQ|?lEO%*a{0x)a2b_gi{f|+UZ9y z2t-9gl%Nx>?FyRwo)IYf_4BwFyK1zKC){FA2%i9ci4kt$7?F58u>M!()8V&ELn9gb z=^4Q7KpBu89LdaB7LvIJ7BnSDU~xH6-_ap~TlL3tO(Kh}0Zo3bs+^tpd#{EkeD00PSNq~hjvODGGqFpwsak70+mXRqku^|4Y$dn+Ysf1=OFdG zw|wA*z+jF5yzSd%-en^wt-8avA|I%GrIBt(${A#Vt;hU+SPS8BRRND(LF>-G@>$$J zaCjRrsx9qX`LzwM-uhMs${k~iw_)_;^+hEBR3k(jhJ;_YZf=;3$E^8Nm3xXWyYFL8 zgn+c-zhK{bJUbSzeiy|rS6%-(J0$WBPvrcJlB1L)+x{e*3p6Th#j zHTaLwUQ)3r)z)Y$HxWlEA8Dak!S^uylq_MNoA`_i3LCVPhbs(wW81byWt%@vbVU$a z;6+S&5qIiZvgiT|pP~0QPsU_4NU=kGG27B{9jk6Kj8klT`1VWB1>6S zym9*NwK~>(jgOe-1{9S+IN>E9qX`O#=cRgg)%j1T#4m8%6TER3TxT&s7od_G!G`3A zPD9M74+T!B6jUq9Q`7ZY=}xYN7o($20#ZlXz;8AN={tg1rRw1)JY?HE&)-_D(nZpC ziv1*~Z0yO9<;WZI#6RcUh+xsoRV~jienk{+A_88>RNZSR;?Hb8F;8KgzZCjGPpFT{ zgduds7}1l)IOy>3Q#g?!9%UlFvNc*~!pv^Syml6($?}&DdvP5}Z9|A|#pijF|aOPldKQW6gCiiIE{Vj*MKN z2c5L~f~hS1=>7awb+vJ52}9UYFT>IZeduTfgEUbnnsKV7dFv4g6#6;D1yy#Xr;HwA54Wg`qNE6Hq>H@39;^pL>WdB z?w(>@N4Qc#U#kPgO7p8>krLD`8T0dbuNlt3)2AWsr{X0J3_xnQQnB!Mth7Gjbd!FpV!##2b(vb4hYKL8I-eu`-Zb{vV4)|Wr)Maa(M zuxmO#4%De$%=xCkPW;cRNY&>N(wrSEO%4l^u4t+o!mNG@Dq zyeks%)X*Q))^0i~c9p`N0JO<(WBm1s0`fIdS-YvUW6MaBuDu#?N{kYvj#lAB#@H zoS3-B`s{o|q5pA@t1}IVrbTM?o4pCYhA9B%g!BEU>D;;!G0bS+qa5QTz_zA$=Sn<} zT@xW2C>dph;xu4vI-7a@fng_nq+Q<>E-HrrP7oRz77+CCz%NN;@7?I2= zI#YEGv;=LspFvHCcLqhocN-!;Larv5HCUsuHM~f$ovBWr_}tm^w;Y;X)E*v0lco$Y ztfje>_;|s(=1}R@sx`Fh`b8Zjg?or%xOR16<`VW;Uo0`9XU%9%sFN!E) zMy};#O%WtW4iOO;^5VO2BJlAR#C|O7eZe5~aoK9g3Z27_g?!{!OQI-COKlOpSHQOl z_vs5C^5F7X^MA-^3@v_U9vvPTIlMbP()R@wz`ma{7)91LlZ^OwZZhoHs9B$|gdS}bE#UHUhyMFIJe5d$c`Ft6ky!|M@p-_E5jqPPDjT> zV{DOrh1f(}26kv|Q=Sw30B=KOXrkUZ#njtU0vdZx*g_u4AW30Cjqy)=iXK|fC=aY0 
zh-{==HG}A6SDJ8SH_6}-!s^|QG$Z>I`_lw$46LC;dQ|AFv)Zu5X{cW?O4W1^J+ULZ zQMiKb9lj1QAaovul7m3lF4e^6C);yrC~|C#^+6OyT&mga!p0Z2CYu^DGA#i}xzO~L zm5efs@4S`z2{{?6X^ux zDb=N;qb@ViWM{T2HD2JT!P1pH=m?5W0q-i2GDAc2)g)OH-5t*EO@TdKMEU_1SvW`1 zI&Lo-Gc{i&(b_;!zFhOXS+T0Lj`bYcC-HdA-izENSK#r!LZ<)Dk^pn3tnhT2&=^=9?aBS){pr zWE<KZ|y4R^|9O@=-nNPcDNhT|%Iq-P6Z>X(4$T@BJa{zmjA2gK_ zkJKU|AGSf{BS&Dt!G|Pl$bx?hj<L}`mJ!>FL)*arV_5Wuj>^=;|W2#xBh1FyFS$G~bmGPQoywO3(=AY9{i zEf=p01+q|vtJcRdJ+!hUGK)&ZihXyg$nK&SBk2O+M|C)laGbir`UaY(R4?TZC#n`q zvm)Q-$k4@XK;yCQmy2$L_OWj0o)G!14YT-fb!*I?2J50I6tl^e5dpV$9#iMix`HaH zpv9hLbaVtI-eU3wnOLV?s6n=PGX0?p;%KvgdXS#>&quSM#~30&({2Ui)l%Ol1_JxP@8GH+3MoPLQ8>E(@31-RgybN z1x*)j1>SrnKB^Lw-?oQMl8aYAM)C0$5L9jep7a4!{P@ofK%@(%J?F$7vYY<7}bm|}@f#4qqK!CO} z%nbPNwL_OvaByD^*T6-J+sslH6I zQW{tYsh^N4G_|aL#vGI(NBl)@&3p|0wvF+}&kq4xJ4mCg-p+np z#^e89{;*%+4qNI(NbJNHUq3Ir>5)2SUd6l(ZsYo{T!n@AK9tZHg(tiL?4TE^o{-PK z0o0Lvm9DE0F;j3pssZAJvug|KlJGwIo(R(v`=fUNUU&K3O|8V6tQVe;^)V0_W_!p7vUCV zNh-tuz#OPXA5kR~7`ksyuQpG9p5Fr-1VgRlB;&ldvMf3KR#!%qu;^$!SD#)Q9K}Bh z(`*Dx6zMw_B03y|5ejMkK!doAM7Q^YRJF(1I`6)dv#_i`EeRblaPfjjkb-^E3MI$F z>s&~#5ibyPKU0vwenNQldc~kLgkzTr6M|o)mkg5x6gQ+LLjGM(_TmHhQ$==o%1UzZ zsP5}Mrt*K*XkSj_3Wg>omUiYZUP+?jnxF2U!4IIMmQX2)hXjhj3|TY`uFyCX0{MLY zbE&Zg+gxqZ zxLB}k`SY#M@h+Txbarb?*gxXu_XQRJQ^TBJT7#H}Z`hx;MT2f5J}B6B35ui^%;Lc- z9Rukx_+hf}btL212xf#a)iyoVns=xBOFU1}phlqWAyf!pLT|71fE@gGjYI_Zx_WZ| z;HZ>=PSxELGt8qig%$={Qm&B?9Th9`oRf(HPOW-4Aj9HVcsfE_`(V0LLxZ8mV6%;y zwxK;MmX5rNj8lW5ST~+A;3rSYW<|v^H0YoV=FkS`N`aQcM?feZa?TV;_qeH+_&ACX zD;y~|`ux{v5@T=}Pde+8dX(j!5he<}Va&)=6}}ww61sp5r%a`Z1Q&FifT<5Tr5(b# z0*MGs=hiD(*3OiwLnztbSkb(4=s@+!q~-=7$#y+$N|ge#~lPCmJ0Ml*LTjHXGd6fK-W z3hJW{x>Pzor{$J-|A{{N0p5y%D|17h9YMM%l|f3Lfsj*EqmGh0l1j>1fV35)P`;&b zLtZq2N(pxd>FZsxMp7LmdDNK2YuFOasU5FeZs-*)I+)@SrS~;9Uu5=z`YDciZ@=K` z$6>y`ch7mSzL<2UCCQvc5S@GY+RJ;FPj1{sU3b^l`?(0$7nt~V>d?w5@d$J%X;&9prlB^&RlC8r1>Aite(lNp4n_N#O@ogO1LkKSLg;D0Cx_2Fo;F#+bl;&z7=P*aD(iF+X0ryPSI3~ z$PUu3!BBOiO2S2fbfK0o4;V-A6Zlo#t~a6Lnvp=aU{6spKK{_&`xqh4Fu!9UvVVvY 
zM^{rjW7C%@Na?9vgjzA=L3GXTEo!xmlPqkD0v3VL zeTwVMwxe<;8RVPE?>KI}zQ^S{B(9syyKqs6+!L z&Ws`trKH%Ry|PQTwTUmN`?GF8aqCB)2%Z;r4W-C*w}4~!_`Zcr$I5;f`Qnky2oOs3 zD(_82d7<5Or`RNtU^@WW5Lgm1`HWtg9BNPk6q~uPC$%%YN4<;)e+I0qTcgV>5{LNS ztZPu6#qRN2og2yiHtP?0?>}yOU`w(nB9AmS@LL*tS+r7iadI_w`3HE0$;fcezM|+wg8Rnv0g@{q>BicXI!ir|f!|WBy|ai=>ohfI=4M{nH(OfN z*8`rNCC3G~>7fA-je3Qm9KMNTjhEb%3fXWZV?_j{TOalSyQ4KraaC&B*%fzaSA21FTQx(^|S3T)&LdC>IQ29 zpQXwLW>A!_k#@?mk64lePlw&%BBd04Q7|kz*{#TW1Y57}@H%%L4q=T%pW>Uq)LE_X ztk)$ElsDy1lf)=pz(iu9Z3na#LCrMzr1q=VdwxQoIHYz)FbuXqbEMb|(o^H*S0BTV zRBtP!i1v$X`wrgzp%hBt3({jhZaU!qXVX1xjqGh??Cq^x9iG)U`O3O669TBMoz*HA zow{wh{K;tF;P_a3#3P~NF~dZM;;;B>>i64Sf`?=zudMm5KyMX$w6kF>SVcb6I3M-D ze=|DIe0Om-s19V&$r&)onCYnR#QMY7Rv|KxKb2gV7POpvX9~1u*`s?0p%;@2!3}fj z9oj9OYBq|N<2O}!*&$&YMvj{|2R+gVRInrJG}&fe?|cvzlA;%#(Iji4bo!#;K8dOz zTwto8zXh-C84t7|2{F-y**44>e(U!&e%_8LDN4GAiNYV48|?#(m1vwz1U|g^eHLN_lGUnJL!cOC;z%BYZ|sWSM$-HB?V&QA zXJVF9IlD@E9$Y~EFvww3p=SU;iiMfdDWvS7a%*FEMudTJHMp_l<=Bw}JqX@R5mX`Y#71W+uam}nXh^Oz8=9{xGp2$4 z=kvj_(sc?3!d~&iPzNK=Nm=$X`m7=CauYV=VzrmM8$Kn(*-r=Vf#Hu#0A6mwn zhX5a?pay`3<8?rFt9rbOp)6I_$v*Fw-=|Ec`^0Nxu3;ONiMGsroc_giS2f6+BdZCI zCgXzxZ2f?0gYdYn)+t?c4cj)Y^8Vc)=SgPhKKlIeJbn0WpW)?hp{m_uf8ifuxb?Bh z!{iaR#O{!n@{a=6c!$Z<{*{sj64fU!9Kt~x1sAta3Rgid${%7m8&M9DMU-}U_4A1P zpb~9m~Rx8x2jGmCFnh%%7O$A`)-6hP4thORRS;3C1xo+=i?@|?o+z+jO@SM|N zu=~gOr3q_Lvf=3qCKjAb%(s1qRwFR>^=1! 
zX7i&{$I*FK--pvJeIS>bSBL2u=4$=5Ex8lf__HH%>fb-lSJ-N7`PAnMLax&b4fTY1 z3mm!nzZYDEY2JX?;@u2wzk5K-WJehEsbBey&Eo$vPowiPn#BYRJ)USrcgYBN4a8Y8 zTQ$a?UD@Jm%$;AheIbR*!7Mb3OkSuC?@`rHN`Hv3xGzWFlM7dp+ZO-^t>0kx-J032 z-Ct#-v&c(F&wK(Mb-Hn^fDW~;r^NeITE3(MHwn^RqXTyYGv7i&P{ko)#_7j!5ZOKp zhK>mbaEKC7hDJLoo-m^Gx4}nHFW?zhIQv5NIYjk6G9(QgVWAu4)Eg28JZDG?BxDbs zK%yCrsT&nmEJ1&ybKeHDjW_7wylTT2RaR>6R?<}Ijf*ZtSm)e}%+QcE@Prs!ScLj3 z!JHgp1PUN1D^mvNlDu>czDjtQt8vuRddn~Ad)F$jbi0b)E|*Fx(UH6_P#m`gK}hgi zxj#QGx80qy{LxQKY$+4B`(g6}-abksS&MXnDO{~Qt|nGuwdZV*dFZ=X+K+X;vq5ck z8D3S--D61{^_uF&W-EuQZV^3?b`m)e@|HI3$*5hiawYWC-BH#kU6&cdlMD%o?2*Dl)?l>&toI4`0jB zFTLwhhsNN`2pmjCilxV`E2ix#y9Qd1`Ml3$Dq>~OHZ>zN6d3w1|Y5AlXhyP^Kk2c9 zy!FEe{;qx{`EBM(^PB!Lb*T%cE8I<&y#b{!D8vApA*koaD}`?}97-2((fy6lDEIgN zP%R+{4k1`a6Fw5!S&RD?=+0fIUG-rYq4$i5=CjE)&sD67aP=3VwkzH%)R zEh0Q(Ei$)YCn;T=tCgYZp_$0$p>2eGro0IvSWHeT{U~Uu3;At+G!JOo{il-ANLVd5 z^iOQyIPJVy^C*;y{!c@YNBSMi}2U?MLKVdl_Bdr~;7Go)`y-Bs8} zxv3Dhsbf1~cm{hIhytSLeC!hERbsVma1>($8-=if1LmhG&_A{3%a z7<%LxR)a>MxNuzeP*_`!nT2xRj{+su-@Tce={R?37B}@hScY(6S$E){D4IwaBTk5q0OayD33M3rp^X2N*g8UFr-e-y6G$F1j*iAnZ z|HUm~Sp&i;O`6~|fYRmLMjZVer_yzvXjZR+;;)#h;i5*NMA4Q;Gn9B;Y!U%{IhjxJ zHiCE+zj-8ksNW{nmxrsK!RzVrT$1{jF!&#Af%9}WHg(5CX$DGQT!`()qA*{fAYZD2 z)6^+XJ|P=~uq=!vFAW-uSZsIgKJxaVQKm3c{|Fx_jl-xY`b=Gy*_Ck$ysk4Ud+gBF zd1&ss3vbZM-GMVs2iUl8wEwU(lwS%=G5ANN512LVx#EK+bKrXP*f5sqb-Vm5TuE&i zhO&PLVrB)_R23F#9;VVM3v3v>FrTYSM0I-Uf#+x59ZaLyqT0@(B?E~!o)k>7`6by= zV58fu?6O8QbR@#EtxyN@UiTG~TkxHt4E_u@!*ds%E9OOIG|~!pM};C4YpM`JL*IOx z$d~$m)Y&(hawlM^YJLZRxL`XqfN2+yzH+aSS#WKXb}#AF^|Wn1Bb9ca{{&0v*~aeU zkq+V*3~7{gBh9bL)f3yzC6Fe(I0F*?CAkWC{X@;-8EXWszVqxwZQw@(U-r-=uz5by z=vBWA(Gs+5seTV+sJuABIvGcLH;LWzxT`;uvD99@NS8-YPk!6r`forLwlI7?;qxb` z`8r7v?xMui6s2@yQ4NCZMu;*J^9T$EpFXC!jU8Ljy#>M_@X*3$?6Gj2u(6j;8BE({LI^m%oY;Nc^HAMG_ zpLsXZoaT#ab00D7yL!Y_K7TJR3T(-PPdUdD$rz>>)9PB)#CPD*eIYxGH$4bNVv_MT zQIO*rHc5XFS>@p-xzF`#PAHV~r~KtQ_w|`M(l|mHq=&voOLCG(Q)#p8_xx;94EnvGx<5 zykod*AiCXVS)?{WNA;aoa(xF%n;aM0nt57`C5cjbu_EA_MuVuA#y5FB`2Frrq3R%$ 
zQ0)4zsM7owOe|vWYGm{GV0|2)Q$P$F1<&##TbA#e%FK;U?8;mr5!Bd zpxl!F|AuQJm_H)XasfH--R97c?ale!2~<|7cko*DML}^e*>|hK>c&O%-2^iiPt83| zT~8I;1+9f902)JZS7`ZH?iNC{D=a2oG71f&KKC^G5;~IlATwCXPS74h^Vv$1FWAti z!DE%3u^-q)0ZhU!JmF4@u0M6gWd<6DlEjH;^2#IJR`Ph3I8iZ)?2tY0swU?_@+Irf zn#euJ2?o}Mn9?^6F(e?Hp;VjuZG%ezctZ6m>{FjHzL9W$W)<9u$V69tJ;a4>^yA@u z<}Q3;%w4vKhiix|lLdLQ;*uBApmNxJi_Hefc!1coY zKKm1_qOEVIC~;wl10i#|%=+A0d6nFw$Nm^E=yv|P;q?eE@Naqe<=((wd05QG-thTk z!N&w?`9DS@LzNKmdEWA`QB&GAA*y$QrD3u{l93&XV^wi{Oxwv?scd{wTO;f9!x_a& zKV+PpHE%6)+v@9OW&lrBjZ{r;Ak4?yu?HyT>#zjLe>}UEr)>5~8N#r@#WvTv={iD0Zi()qqvAVVIR)0D7`XCFr~;gzS*xK#6n<_|v` z%&{rjY%KUy(!@P(Lo<^H!>D0>9F&^N%Y>YgsHKs>)VqgBc~|YF7;icr{Y(C$VSDl` z5LINl%Y2(v70T=w5(E&WG52Xif6fz7uatb4Y~C5n@uq#=vv`U>jJni=a&_e2(>$P}$Gr-fc8mGE1wK*9^Nfx=)z=ipK&Z#GNmrh;vCxjkUF zN(Pc!CX-0&kBI@B)jo$_B%aemU7J?#f9Q^R%*U`V(79t#C7cB9Z2O3sLg-~R)KCg; z`SKG=y{6@cGopK7=Gl;n=>4))w-#NX;<)n4c}%o@W7DWV4ROBp6YS_3p0_;QVZU~7 z?q#a)a)qr_e{;EEGc$M&EHz{Rp%lM5(NJs$wm_Pgm?b)AzGoTed#VL{3Ry#COKfea z)7cE3rT16OD0VSaAyx7-xaV5gawitJj3lunbpl_GUdLWLcYXOoRvcs>R&aiVbmzB_ z{x|tm#@^2Sna=fRe$6A$Yw%EKp^B*91z8BiamfV-bUk94jXw9EdiZ~c?LrVkLpOoJYVLj9> zC72R@&#g5Kir~EDgK1GyI$X#SQ9zaq`Aggm__(oIg3mlgvwDZK4C4$P(}D?4tx9{7 zcbtNmTRCkn^dz-4&aIe7Ze4?QVnc}5RGRSSG$11}h`Vv%8R;;>vrCq`F3drFwJ#b> zIAX{U>&#zKb~@Z*MQDCU2Q#vY%ivc^j|n!0CdEjj2U?&h{B02! 
zsxYk~Tpznd8{RGxo6uu|Eh4oMU8>K$lIwa*up}22v`Xc#R0j$^_Ys7UJ|SWUEkigh)4%*z7aWmo5I z_$kHQxwj7a6YZExC^h4|r)-Uk!5x*r*qc}bcO$umP~G_q2aKi=?e4QTR_e|fHgMPh z=-9AcQ?PI_usimGu>Ba)yz^=B=0)4*{VlSjwBx0TyKy5~LZ5sPHFL}2@miVo)b(J| zD!;$t)ESc^zji(IOT-_GeNBfo(n%V)f$d{vFj^&cGTQU%ru6dSeSiJ3CVwXL>P9 z8`H-!`02~DNR3jlm4BRF&B%wy&!-9}LKPI$s{rHajv*xp3f;*^BN9Wos-c~$c3NoO zw^hA`AiF2x|3j5-q)v3ZjP+^yW}flh)6DGh{`T%12Wa3pt^L;gI*9Atjoex!G@xbd z+ZNv39^Qliq|N(8hGn@%JSs_?5+c-uY7`2+qX=mv4aT)-OzuL7xmJalDMgS{vU6Hh za~Y>`?BxQeGSWO-g?t$&GDTtiH-oI$Q@6^=!B8MOgP{JOZ0Q=l6UJ7<;6rKnw7&u# zt+EAF*Qn`_+f7NUUu<4Cb8{l^o zko)AMG9GY>9oA&;fVmF_CH&wri=@wrEI2tRdbB$zrOCFZX)%wdH0Dw;#LK45ghVGV zZhJWWMRC_oWSfhLz=~lmXaj5FF{U|m89m3L+Q?uBFX-M#5I6@?74J&SyH~t}ZQL4X zFxgJ?=oJdO-jfObN)+h_Mu{2V*tZ(P7E{>lfKZOl0ChROmt_m=A99Rbj^u&(29I#! zRO3~_$|wJ!GvZWhY%^B5#n1s9MNeWY_zcXxmS7sp^g{40EE=mmWS*H=Qw_#zEmoGj zKI~agT@eYvHB>|0y&6ayZ1c%w8+a$O!PyxYxc4|B4(63(BRL<1Z{SS~_9jwp{^;MY zast>=9`#%OKc-cIK~RDId=vTW+qS3ah`+A?w-=HM(tox6>szg-*KvO?rN>|Mg6$XH z%00LF>uaH>ociZle9XUpxB0JczMeby^%2Yy2mHCJ9-sgH-N8RUL3(ca*C!HBk7)i} zT#vx~-SUf1YMvYYbrS!t__;lL_$Q+;zqNR7``0Pwr?%*y%j8l2`@8MmO!hxF{p-~2 z)0p(1%LM-4O#fzR`nj85hphg(JM@VE?B?Zx-siS|oj7_b1O8lv=>KZ_#i6a|9)6v> z`RlL0cpMu0`~LrKj_JAWU#AtGCU^c^N?0%1{+BtP=jMOywLgtK{JA`EUpD`n0f^_u zf9)y%^;}}%y=eTOoA=L6|Jq*stLbR`7ft_ji~70gUpr}kC6-0PmrVb2xAM8=Usd_P zT9zYz$?`7}!LuUfx#eF4;U`u9&!t2DFP8sN4&+Y$upZY5b4ov@U|AnjZa|gec^?%(Wah88~@X{sixrbltsK3S;A>03W c_%AOD3esSYqrX5v*pGj~k9&>G`Sk7o0Gg2#RsaA1 literal 0 HcmV?d00001 diff --git a/lucene/sandbox/src/java/module-info.java b/lucene/sandbox/src/java/module-info.java index f40a05af433a..5655829a7a1f 100644 --- a/lucene/sandbox/src/java/module-info.java +++ b/lucene/sandbox/src/java/module-info.java @@ -20,8 +20,10 @@ requires org.apache.lucene.core; requires org.apache.lucene.queries; requires org.apache.lucene.facet; + requires java.logging; exports org.apache.lucene.payloads; + exports org.apache.lucene.sandbox.codecs.faiss; exports org.apache.lucene.sandbox.codecs.idversion; exports 
org.apache.lucene.sandbox.codecs.quantization; exports org.apache.lucene.sandbox.document; @@ -37,4 +39,6 @@ provides org.apache.lucene.codecs.PostingsFormat with org.apache.lucene.sandbox.codecs.idversion.IDVersionPostingsFormat; + provides org.apache.lucene.codecs.KnnVectorsFormat with + org.apache.lucene.sandbox.codecs.faiss.FaissKnnVectorsFormatProvider; } diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/faiss/FaissKnnVectorsFormatProvider.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/faiss/FaissKnnVectorsFormatProvider.java new file mode 100644 index 000000000000..768d8fb0ed89 --- /dev/null +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/faiss/FaissKnnVectorsFormatProvider.java @@ -0,0 +1,92 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.lucene.sandbox.codecs.faiss; + +import java.io.IOException; +import java.lang.invoke.MethodHandles; +import java.lang.invoke.MethodType; +import java.util.Arrays; +import java.util.logging.Logger; +import org.apache.lucene.codecs.KnnVectorsFormat; +import org.apache.lucene.codecs.KnnVectorsReader; +import org.apache.lucene.codecs.KnnVectorsWriter; +import org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsFormat; +import org.apache.lucene.index.SegmentReadState; +import org.apache.lucene.index.SegmentWriteState; + +/** + * Wraps Faiss to create and search vector + * indexes. This class is mainly for backwards compatibility with older versions of Java (<22), use + * underlying format directly after upgrade. + * + * @lucene.experimental + */ +public class FaissKnnVectorsFormatProvider extends KnnVectorsFormat { + private final KnnVectorsFormat delegate; + + public FaissKnnVectorsFormatProvider() { + this(new Object[0]); + } + + public FaissKnnVectorsFormatProvider(Object... 
args) { + super(FaissKnnVectorsFormatProvider.class.getSimpleName()); + + KnnVectorsFormat delegate; + try { + Class cls = + MethodHandles.lookup() + .findClass("org.apache.lucene.sandbox.codecs.faiss.FaissKnnVectorsFormat"); + + MethodType methodType = + MethodType.methodType( + void.class, Arrays.stream(args).map(Object::getClass).toArray(Class[]::new)); + + delegate = + (KnnVectorsFormat) + MethodHandles.lookup().findConstructor(cls, methodType).invokeWithArguments(args); + + } catch ( + @SuppressWarnings("unused") + ClassNotFoundException e) { + + delegate = new Lucene99HnswVectorsFormat(); + Logger.getLogger(getClass().getName()) + .warning("FaissKnnVectorsFormat class missing, falling back to " + delegate); + + } catch (NoSuchMethodException | IllegalAccessException e) { + throw new LinkageError("FaissKnnVectorsFormat is missing correctly typed constructor", e); + } catch (Throwable t) { + throw new RuntimeException(t); + } + this.delegate = delegate; + } + + @Override + public KnnVectorsWriter fieldsWriter(SegmentWriteState state) throws IOException { + return delegate.fieldsWriter(state); + } + + @Override + public KnnVectorsReader fieldsReader(SegmentReadState state) throws IOException { + return delegate.fieldsReader(state); + } + + @Override + public int getMaxDimensions(String fieldName) { + return delegate.getMaxDimensions(fieldName); + } +} diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/faiss/package-info.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/faiss/package-info.java new file mode 100644 index 000000000000..e8938dfb3782 --- /dev/null +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/faiss/package-info.java @@ -0,0 +1,21 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/** + * Wraps Faiss to create and search vector + * indexes via {@link org.apache.lucene.sandbox.codecs.faiss.FaissKnnVectorsFormatProvider}. + */ +package org.apache.lucene.sandbox.codecs.faiss; diff --git a/lucene/sandbox/src/java22/org/apache/lucene/sandbox/codecs/faiss/FaissKnnVectorsFormat.java b/lucene/sandbox/src/java22/org/apache/lucene/sandbox/codecs/faiss/FaissKnnVectorsFormat.java new file mode 100644 index 000000000000..141f8b7d613c --- /dev/null +++ b/lucene/sandbox/src/java22/org/apache/lucene/sandbox/codecs/faiss/FaissKnnVectorsFormat.java @@ -0,0 +1,75 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.lucene.sandbox.codecs.faiss; + +import java.io.IOException; +import java.util.Locale; +import org.apache.lucene.codecs.KnnVectorsFormat; +import org.apache.lucene.codecs.KnnVectorsReader; +import org.apache.lucene.codecs.KnnVectorsWriter; +import org.apache.lucene.codecs.hnsw.FlatVectorScorerUtil; +import org.apache.lucene.codecs.lucene99.Lucene99FlatVectorsFormat; +import org.apache.lucene.index.SegmentReadState; +import org.apache.lucene.index.SegmentWriteState; + +public final class FaissKnnVectorsFormat extends KnnVectorsFormat { + public static final String NAME = FaissKnnVectorsFormat.class.getSimpleName(); + static final int VERSION_START = 0; + static final int VERSION_CURRENT = VERSION_START; + static final String META_CODEC_NAME = NAME + "Meta"; + static final String DATA_CODEC_NAME = NAME + "Data"; + static final String META_EXTENSION = "faissm"; + static final String DATA_EXTENSION = "faissd"; + + private final String description; + private final String indexParams; + private final KnnVectorsFormat rawVectorsFormat; + + public FaissKnnVectorsFormat() { + this("HNSW32", "efConstruction=200"); + } + + public FaissKnnVectorsFormat(String description, String indexParams) { + super(NAME); + this.description = description; + this.indexParams = indexParams; + this.rawVectorsFormat = + new Lucene99FlatVectorsFormat(FlatVectorScorerUtil.getLucene99FlatVectorsScorer()); + } + + @Override + public KnnVectorsWriter fieldsWriter(SegmentWriteState state) throws IOException { + return new FaissKnnVectorsWriter( + description, indexParams, state, rawVectorsFormat.fieldsWriter(state)); + } + + @Override + public KnnVectorsReader fieldsReader(SegmentReadState state) throws IOException { + return new FaissKnnVectorsReader(state, rawVectorsFormat.fieldsReader(state)); + } + + @Override + public int getMaxDimensions(String fieldName) { + return DEFAULT_MAX_DIMENSIONS; + } + + @Override + public String toString() { + return String.format( + 
Locale.ROOT, "%s(description=%s indexParams=%s)", NAME, description, indexParams); + } +} diff --git a/lucene/sandbox/src/java22/org/apache/lucene/sandbox/codecs/faiss/FaissKnnVectorsReader.java b/lucene/sandbox/src/java22/org/apache/lucene/sandbox/codecs/faiss/FaissKnnVectorsReader.java new file mode 100644 index 000000000000..4584a91155e4 --- /dev/null +++ b/lucene/sandbox/src/java22/org/apache/lucene/sandbox/codecs/faiss/FaissKnnVectorsReader.java @@ -0,0 +1,197 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.lucene.sandbox.codecs.faiss; + +import static org.apache.lucene.sandbox.codecs.faiss.FaissKnnVectorsFormat.DATA_CODEC_NAME; +import static org.apache.lucene.sandbox.codecs.faiss.FaissKnnVectorsFormat.DATA_EXTENSION; +import static org.apache.lucene.sandbox.codecs.faiss.FaissKnnVectorsFormat.META_CODEC_NAME; +import static org.apache.lucene.sandbox.codecs.faiss.FaissKnnVectorsFormat.META_EXTENSION; +import static org.apache.lucene.sandbox.codecs.faiss.FaissKnnVectorsFormat.NAME; +import static org.apache.lucene.sandbox.codecs.faiss.FaissKnnVectorsFormat.VERSION_CURRENT; +import static org.apache.lucene.sandbox.codecs.faiss.FaissKnnVectorsFormat.VERSION_START; +import static org.apache.lucene.sandbox.codecs.faiss.LibFaissC.indexRead; +import static org.apache.lucene.sandbox.codecs.faiss.LibFaissC.indexSearch; + +import java.io.IOException; +import java.lang.foreign.Arena; +import java.lang.foreign.MemorySegment; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.HashMap; +import java.util.Map; +import org.apache.lucene.codecs.CodecUtil; +import org.apache.lucene.codecs.KnnVectorsReader; +import org.apache.lucene.index.ByteVectorValues; +import org.apache.lucene.index.FieldInfo; +import org.apache.lucene.index.FloatVectorValues; +import org.apache.lucene.index.IndexFileNames; +import org.apache.lucene.index.SegmentReadState; +import org.apache.lucene.search.KnnCollector; +import org.apache.lucene.store.IOContext; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.store.OutputStreamDataOutput; +import org.apache.lucene.store.ReadAdvice; +import org.apache.lucene.util.Bits; +import org.apache.lucene.util.IOUtils; + +public final class FaissKnnVectorsReader extends KnnVectorsReader { + private final KnnVectorsReader rawVectorsReader; + private final IndexInput meta, data; + private final Map indexMap; + private final Arena arena; + + public FaissKnnVectorsReader(SegmentReadState state, KnnVectorsReader 
rawVectorsReader) + throws IOException { + this.rawVectorsReader = rawVectorsReader; + this.indexMap = new HashMap<>(); + this.arena = Arena.ofConfined(); + + boolean failure = true; + try { + meta = + openInput( + state, + META_EXTENSION, + META_CODEC_NAME, + VERSION_START, + VERSION_CURRENT, + state.context); + data = + openInput( + state, + DATA_EXTENSION, + DATA_CODEC_NAME, + VERSION_START, + VERSION_CURRENT, + state.context.withReadAdvice(ReadAdvice.RANDOM)); + + Map.Entry entry; + while ((entry = parseNextField(state)) != null) { + this.indexMap.put(entry.getKey(), entry.getValue()); + } + + failure = false; + } finally { + if (failure) { + IOUtils.closeWhileHandlingException(this); + } + } + } + + @SuppressWarnings("SameParameterValue") + private IndexInput openInput( + SegmentReadState state, + String extension, + String codecName, + int versionStart, + int versionEnd, + IOContext context) + throws IOException { + + String fileName = + IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, extension); + IndexInput input = state.directory.openInput(fileName, context); + CodecUtil.checkIndexHeader( + input, codecName, versionStart, versionEnd, state.segmentInfo.getId(), state.segmentSuffix); + return input; + } + + private Map.Entry parseNextField(SegmentReadState state) + throws IOException { + int fieldNumber = meta.readInt(); + if (fieldNumber == -1) { + return null; + } + + FieldInfo fieldInfo = state.fieldInfos.fieldInfo(fieldNumber); + if (fieldInfo == null) { + throw new IllegalStateException("invalid field"); + } + + int size = meta.readInt(); + int[] ordToDoc = new int[size]; + for (int i = 0; i < size; i++) { + ordToDoc[i] = meta.readInt(); + } + + long dataOffset = meta.readLong(); + long dataLength = meta.readLong(); + + // Copy index to temp file + // TODO: Non FS-based approach? 
+ Path tempFile = Files.createTempFile(NAME, fieldInfo.name); + try (OutputStreamDataOutput output = + new OutputStreamDataOutput(Files.newOutputStream(tempFile))) { + data.seek(dataOffset); + output.copyBytes(data, dataLength); + } + + // Read index from temp file into memory + // See flags defined in c_api/index_io_c.h + MemorySegment indexPointer = + indexRead(tempFile.toString(), 3) + // Assign index to explicit scope for timely cleanup + .reinterpret(arena, LibFaissC::freeIndex); + + // Cleanup + Files.delete(tempFile); + + return Map.entry(fieldInfo.name, new LibFaissC.Index(indexPointer, ordToDoc)); + } + + @Override + public void checkIntegrity() throws IOException { + rawVectorsReader.checkIntegrity(); + CodecUtil.checksumEntireFile(meta); + CodecUtil.checksumEntireFile(data); + } + + @Override + public FloatVectorValues getFloatVectorValues(String field) throws IOException { + return rawVectorsReader.getFloatVectorValues(field); + } + + @Override + public ByteVectorValues getByteVectorValues(String field) { + throw new UnsupportedOperationException("Byte vectors not supported"); + } + + @Override + public void search(String field, float[] vector, KnnCollector knnCollector, Bits acceptDocs) { + LibFaissC.Index entry = indexMap.get(field); + if (entry != null) { + indexSearch(entry.indexPointer(), entry.ordToDoc(), vector, knnCollector, acceptDocs); + } + } + + @Override + public void search(String field, byte[] vector, KnnCollector knnCollector, Bits acceptDocs) { + throw new UnsupportedOperationException("Byte vectors not supported"); + } + + @Override + public void close() throws IOException { + rawVectorsReader.close(); + arena.close(); + if (meta != null) { + meta.close(); + } + if (data != null) { + data.close(); + } + } +} diff --git a/lucene/sandbox/src/java22/org/apache/lucene/sandbox/codecs/faiss/FaissKnnVectorsWriter.java b/lucene/sandbox/src/java22/org/apache/lucene/sandbox/codecs/faiss/FaissKnnVectorsWriter.java new file mode 100644 index 
000000000000..2848e83d5f3d --- /dev/null +++ b/lucene/sandbox/src/java22/org/apache/lucene/sandbox/codecs/faiss/FaissKnnVectorsWriter.java @@ -0,0 +1,204 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.sandbox.codecs.faiss; + +import static org.apache.lucene.sandbox.codecs.faiss.FaissKnnVectorsFormat.DATA_CODEC_NAME; +import static org.apache.lucene.sandbox.codecs.faiss.FaissKnnVectorsFormat.DATA_EXTENSION; +import static org.apache.lucene.sandbox.codecs.faiss.FaissKnnVectorsFormat.META_CODEC_NAME; +import static org.apache.lucene.sandbox.codecs.faiss.FaissKnnVectorsFormat.META_EXTENSION; +import static org.apache.lucene.sandbox.codecs.faiss.FaissKnnVectorsFormat.NAME; +import static org.apache.lucene.sandbox.codecs.faiss.FaissKnnVectorsFormat.VERSION_CURRENT; +import static org.apache.lucene.sandbox.codecs.faiss.LibFaissC.createIndex; +import static org.apache.lucene.sandbox.codecs.faiss.LibFaissC.indexWrite; + +import java.io.IOException; +import java.lang.foreign.Arena; +import java.lang.foreign.MemorySegment; +import java.nio.file.Files; +import java.nio.file.Path; +import org.apache.lucene.codecs.BufferingKnnVectorsWriter; +import org.apache.lucene.codecs.CodecUtil; +import 
org.apache.lucene.codecs.KnnFieldVectorsWriter; +import org.apache.lucene.codecs.KnnVectorsWriter; +import org.apache.lucene.index.ByteVectorValues; +import org.apache.lucene.index.FieldInfo; +import org.apache.lucene.index.FloatVectorValues; +import org.apache.lucene.index.IndexFileNames; +import org.apache.lucene.index.MergeState; +import org.apache.lucene.index.SegmentWriteState; +import org.apache.lucene.index.Sorter; +import org.apache.lucene.index.VectorSimilarityFunction; +import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.store.InputStreamDataInput; +import org.apache.lucene.util.IOUtils; + +public final class FaissKnnVectorsWriter extends BufferingKnnVectorsWriter { + private final String description, indexParams; + private final KnnVectorsWriter rawVectorsWriter; + private final IndexOutput meta, data; + private boolean finished; + + public FaissKnnVectorsWriter( + String description, + String indexParams, + SegmentWriteState state, + KnnVectorsWriter rawVectorsWriter) + throws IOException { + this.description = description; + this.indexParams = indexParams; + this.rawVectorsWriter = rawVectorsWriter; + this.finished = false; + + boolean failure = true; + try { + this.meta = openOutput(state, META_EXTENSION, META_CODEC_NAME); + this.data = openOutput(state, DATA_EXTENSION, DATA_CODEC_NAME); + failure = false; + } finally { + if (failure) { + IOUtils.closeWhileHandlingException(this); + } + } + } + + private IndexOutput openOutput(SegmentWriteState state, String extension, String codecName) + throws IOException { + String fileName = + IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, extension); + IndexOutput output = state.directory.createOutput(fileName, state.context); + CodecUtil.writeIndexHeader( + output, codecName, VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix); + return output; + } + + @Override + @SuppressWarnings("unchecked") + public KnnFieldVectorsWriter addField(FieldInfo 
fieldInfo) throws IOException { + return switch (fieldInfo.getVectorEncoding()) { + case BYTE -> throw new UnsupportedOperationException("Byte vectors not supported"); + case FLOAT32 -> + new KnnFieldVectorsWriter() { + private final KnnFieldVectorsWriter rawWriter = + (KnnFieldVectorsWriter) rawVectorsWriter.addField(fieldInfo); + private final KnnFieldVectorsWriter writer = + (KnnFieldVectorsWriter) FaissKnnVectorsWriter.super.addField(fieldInfo); + + @Override + public long ramBytesUsed() { + return rawWriter.ramBytesUsed() + writer.ramBytesUsed(); + } + + @Override + public void addValue(int i, float[] floats) throws IOException { + rawWriter.addValue(i, floats); + writer.addValue(i, floats); + } + + @Override + public float[] copyValue(float[] floats) { + return floats.clone(); + } + }; + }; + } + + @Override + public void mergeOneField(FieldInfo fieldInfo, MergeState mergeState) throws IOException { + rawVectorsWriter.mergeOneField(fieldInfo, mergeState); + super.mergeOneField(fieldInfo, mergeState); + } + + @Override + protected void writeField(FieldInfo fieldInfo, FloatVectorValues floatVectorValues, int maxDoc) + throws IOException { + int number = fieldInfo.number; + meta.writeInt(number); + + VectorSimilarityFunction function = fieldInfo.getVectorSimilarityFunction(); + int size = floatVectorValues.size(); + + // TODO: Non FS-based approach? 
+ Path tempFile = Files.createTempFile(NAME, fieldInfo.name); + int[] ordToDoc; + + // Write index to temp file and deallocate from memory + try (Arena temp = Arena.ofConfined()) { + LibFaissC.Index result = createIndex(description, indexParams, function, floatVectorValues); + MemorySegment localIndex = + result + .indexPointer() + // Assign index to explicit scope for timely cleanup + .reinterpret(temp, LibFaissC::freeIndex); + indexWrite(localIndex, tempFile.toString()); + ordToDoc = result.ordToDoc(); + } + + // Write ordinal map + meta.writeInt(size); + for (int doc : ordToDoc) { + meta.writeInt(doc); + } + + // Copy temp file to index + long dataOffset = data.getFilePointer(); + try (InputStreamDataInput input = new InputStreamDataInput(Files.newInputStream(tempFile))) { + data.copyBytes(input, Files.size(tempFile)); + } + long dataLength = data.getFilePointer() - dataOffset; + + // Cleanup temp file + Files.delete(tempFile); + + meta.writeLong(dataOffset); + meta.writeLong(dataLength); + } + + @Override + protected void writeField(FieldInfo fieldInfo, ByteVectorValues byteVectorValues, int maxDoc) { + throw new UnsupportedOperationException("Byte vectors not supported"); + } + + @Override + public void flush(int maxDoc, Sorter.DocMap sortMap) throws IOException { + rawVectorsWriter.flush(maxDoc, sortMap); + super.flush(maxDoc, sortMap); + } + + @Override + public void finish() throws IOException { + if (finished) { + throw new IllegalStateException("already finished"); + } + finished = true; + + rawVectorsWriter.finish(); + meta.writeInt(-1); + CodecUtil.writeFooter(meta); + CodecUtil.writeFooter(data); + } + + @Override + public void close() throws IOException { + rawVectorsWriter.close(); + if (meta != null) { + meta.close(); + } + if (data != null) { + data.close(); + } + } +} diff --git a/lucene/sandbox/src/java22/org/apache/lucene/sandbox/codecs/faiss/LibFaissC.java b/lucene/sandbox/src/java22/org/apache/lucene/sandbox/codecs/faiss/LibFaissC.java new 
file mode 100644 index 000000000000..bf89d74806a7 --- /dev/null +++ b/lucene/sandbox/src/java22/org/apache/lucene/sandbox/codecs/faiss/LibFaissC.java @@ -0,0 +1,268 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.sandbox.codecs.faiss; + +import static java.lang.foreign.ValueLayout.ADDRESS; +import static java.lang.foreign.ValueLayout.JAVA_FLOAT; +import static java.lang.foreign.ValueLayout.JAVA_INT; +import static java.lang.foreign.ValueLayout.JAVA_LONG; + +import java.lang.foreign.Arena; +import java.lang.foreign.FunctionDescriptor; +import java.lang.foreign.Linker; +import java.lang.foreign.MemoryLayout; +import java.lang.foreign.MemorySegment; +import java.lang.foreign.SymbolLookup; +import java.lang.invoke.MethodHandle; +import java.nio.ByteOrder; +import java.nio.FloatBuffer; +import java.util.Locale; +import org.apache.lucene.index.FloatVectorValues; +import org.apache.lucene.index.KnnVectorValues; +import org.apache.lucene.index.VectorSimilarityFunction; +import org.apache.lucene.search.KnnCollector; +import org.apache.lucene.util.Bits; + +public final class LibFaissC { + public static final String LIBRARY_VERSION = "1.9.0"; + + static { + try { + System.loadLibrary("faiss_c"); + 
} catch (UnsatisfiedLinkError e) { + throw new RuntimeException( + "Shared library not found, build the Faiss C_API from https://github.com/facebookresearch/faiss/blob/main/c_api/INSTALL.md " + + "and link it (along with all dependencies) to the library path " + + "(-Djava.library.path JVM argument or $LD_LIBRARY_PATH environment variable)", + e); + } + checkLibraryVersion(); + } + + private LibFaissC() {} + + private static MethodHandle getMethodHandle( + String functionName, MemoryLayout resLayout, MemoryLayout... argLayouts) { + return Linker.nativeLinker() + .downcallHandle( + SymbolLookup.loaderLookup().find(functionName).orElseThrow(), + FunctionDescriptor.of(resLayout, argLayouts)); + } + + private static void checkLibraryVersion() { + MethodHandle getVersion = getMethodHandle("faiss_get_version", ADDRESS); + String actualVersion = callAndGetString(getVersion); + if (LIBRARY_VERSION.equals(actualVersion) == false) { + throw new UnsupportedOperationException( + String.format( + Locale.ROOT, + "Expected Faiss library version %s, found %s", + LIBRARY_VERSION, + actualVersion)); + } + } + + private static final MethodHandle FREE_INDEX = + getMethodHandle("faiss_Index_free", JAVA_INT, ADDRESS); + + public static void freeIndex(MemorySegment indexPointer) { + callAndHandleError(FREE_INDEX, indexPointer); + } + + private static final MethodHandle FREE_PARAMETER_SPACE = + getMethodHandle("faiss_ParameterSpace_free", JAVA_INT, ADDRESS); + + private static void freeParameterSpace(MemorySegment parameterSpacePointer) { + callAndHandleError(FREE_PARAMETER_SPACE, parameterSpacePointer); + } + + private static final MethodHandle INDEX_FACTORY = + getMethodHandle("faiss_index_factory", JAVA_INT, ADDRESS, JAVA_INT, ADDRESS, JAVA_INT); + + private static final MethodHandle PARAMETER_SPACE_NEW = + getMethodHandle("faiss_ParameterSpace_new", JAVA_INT, ADDRESS); + + private static final MethodHandle SET_INDEX_PARAMETERS = + getMethodHandle( + 
"faiss_ParameterSpace_set_index_parameters", JAVA_INT, ADDRESS, ADDRESS, ADDRESS); + + private static final MethodHandle INDEX_IS_TRAINED = + getMethodHandle("faiss_Index_is_trained", JAVA_INT, ADDRESS); + + private static final MethodHandle INDEX_TRAIN = + getMethodHandle("faiss_Index_train", JAVA_INT, ADDRESS, JAVA_LONG, ADDRESS); + + private static final MethodHandle INDEX_ADD = + getMethodHandle("faiss_Index_add", JAVA_INT, ADDRESS, JAVA_LONG, ADDRESS); + + public record Index(MemorySegment indexPointer, int[] ordToDoc) {} + + public static Index createIndex( + String description, + String indexParams, + VectorSimilarityFunction function, + FloatVectorValues floatVectorValues) { + + try (Arena temp = Arena.ofConfined()) { + int size = floatVectorValues.size(); + int dimension = floatVectorValues.dimension(); + + // Mapped from faiss/MetricType.h + int metric = + switch (function) { + case DOT_PRODUCT -> 0; + case EUCLIDEAN -> 1; + default -> throw new UnsupportedOperationException("metric type not supported"); + }; + + // Create an index + MemorySegment pointer = temp.allocate(ADDRESS); + callAndHandleError(INDEX_FACTORY, pointer, dimension, temp.allocateFrom(description), metric); + MemorySegment indexPointer = pointer.get(ADDRESS, 0); + + // Set index params + callAndHandleError(PARAMETER_SPACE_NEW, pointer); + MemorySegment parameterSpacePointer = + pointer.get(ADDRESS, 0).reinterpret(temp, LibFaissC::freeParameterSpace); + callAndHandleError( + SET_INDEX_PARAMETERS, + parameterSpacePointer, + indexPointer, + temp.allocateFrom(indexParams)); + + // Allocate docs in native memory + MemorySegment docs = temp.allocate(JAVA_FLOAT, (long) size * dimension); + FloatBuffer docsBuffer = docs.asByteBuffer().order(ByteOrder.nativeOrder()).asFloatBuffer(); + + int[] ordToDoc = new int[size]; + KnnVectorValues.DocIndexIterator iterator = floatVectorValues.iterator(); + for (int i = 0; i < size; i++) { + ordToDoc[i] = iterator.nextDoc(); + 
docsBuffer.put(floatVectorValues.vectorValue(iterator.index())); + } + + // Train index + if ((int) INDEX_IS_TRAINED.invokeExact(indexPointer) == 0) { + callAndHandleError(INDEX_TRAIN, indexPointer, size, docs); + } + + // Add docs to index + callAndHandleError(INDEX_ADD, indexPointer, size, docs); + + return new Index(indexPointer, ordToDoc); + } catch (Throwable t) { + throw new RuntimeException(t); + } + } + + private static final MethodHandle INDEX_WRITE = + getMethodHandle("faiss_write_index_fname", JAVA_INT, ADDRESS, ADDRESS); + + public static void indexWrite(MemorySegment indexPointer, String fileName) { + try (Arena temp = Arena.ofConfined()) { + callAndHandleError(INDEX_WRITE, indexPointer, temp.allocateFrom(fileName)); + } + } + + private static final MethodHandle INDEX_READ = + getMethodHandle("faiss_read_index_fname", JAVA_INT, ADDRESS, JAVA_INT, ADDRESS); + + public static MemorySegment indexRead(String fileName, int ioFlags) { + try (Arena temp = Arena.ofConfined()) { + MemorySegment pointer = temp.allocate(ADDRESS); + callAndHandleError(INDEX_READ, temp.allocateFrom(fileName), ioFlags, pointer); + return pointer.get(ADDRESS, 0); + } + } + + private static final MethodHandle INDEX_SEARCH = + getMethodHandle( + "faiss_Index_search", JAVA_INT, ADDRESS, JAVA_LONG, ADDRESS, JAVA_INT, ADDRESS, ADDRESS); + + public static void indexSearch( + MemorySegment indexPointer, + int[] ordToDoc, + float[] query, + KnnCollector knnCollector, + Bits acceptDocs) { + try (Arena temp = Arena.ofConfined()) { + // Allocate queries in native memory + MemorySegment queries = temp.allocate(JAVA_FLOAT, query.length); + queries.asByteBuffer().order(ByteOrder.nativeOrder()).asFloatBuffer().put(query); + + // Faiss knn search + int k = knnCollector.k(); + MemorySegment distancesPointer = temp.allocate(JAVA_FLOAT, k); + MemorySegment idsPointer = temp.allocate(JAVA_LONG, k); + + MemorySegment localIndex = indexPointer.reinterpret(temp, null); + callAndHandleError(INDEX_SEARCH, 
localIndex, 1, queries, k, distancesPointer, idsPointer); + + // Retrieve scores + float[] distances = new float[k]; + distancesPointer.asByteBuffer().order(ByteOrder.nativeOrder()).asFloatBuffer().get(distances); + + // Retrieve ids + long[] ids = new long[k]; + idsPointer.asByteBuffer().order(ByteOrder.nativeOrder()).asLongBuffer().get(ids); + + // Record hits + for (int i = 0; i < k; i++) { + int ord = (int) ids[i]; + if (ord < 0) { + break; + } + + // TODO: This is like a post-filter, include at runtime? + int doc = ordToDoc[ord]; + if (acceptDocs == null || acceptDocs.get(doc)) { + knnCollector.collect(doc, distances[i]); + } + } + } + } + + private static final MethodHandle GET_LAST_ERROR = + getMethodHandle("faiss_get_last_error", ADDRESS); + + private static String callAndGetString(MethodHandle handle, Object... args) { + try { + MemorySegment segment = (MemorySegment) handle.invokeWithArguments(args); + return segment.reinterpret(Long.MAX_VALUE).getString(0); + } catch (Throwable e) { + throw new RuntimeException(e); + } + } + + private static void callAndHandleError(MethodHandle handle, Object... args) { + try { + int returnCode = (int) handle.invokeWithArguments(args); + if (returnCode < 0) { + String error = callAndGetString(GET_LAST_ERROR); + throw new FaissException(error); + } + } catch (Throwable t) { + throw new RuntimeException(t); + } + } + + public static class FaissException extends Exception { + public FaissException(String message) { + super(message); + } + } +} diff --git a/lucene/sandbox/src/resources/META-INF/services/org.apache.lucene.codecs.KnnVectorsFormat b/lucene/sandbox/src/resources/META-INF/services/org.apache.lucene.codecs.KnnVectorsFormat new file mode 100644 index 000000000000..418d70fb51fd --- /dev/null +++ b/lucene/sandbox/src/resources/META-INF/services/org.apache.lucene.codecs.KnnVectorsFormat @@ -0,0 +1,16 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. 
See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +org.apache.lucene.sandbox.codecs.faiss.FaissKnnVectorsFormatProvider From e53d1a0ab932f1cb4cf8eb9d0effe2a4031ba03c Mon Sep 17 00:00:00 2001 From: Kaival Parikh Date: Wed, 29 Jan 2025 14:45:12 +0000 Subject: [PATCH 02/17] Minor changes - Fix javadocs - Fallback to null if underlying format is not present --- .../codecs/faiss/FaissKnnVectorsFormatProvider.java | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/faiss/FaissKnnVectorsFormatProvider.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/faiss/FaissKnnVectorsFormatProvider.java index 768d8fb0ed89..52729ac1ba9d 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/faiss/FaissKnnVectorsFormatProvider.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/faiss/FaissKnnVectorsFormatProvider.java @@ -24,14 +24,13 @@ import org.apache.lucene.codecs.KnnVectorsFormat; import org.apache.lucene.codecs.KnnVectorsReader; import org.apache.lucene.codecs.KnnVectorsWriter; -import org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsFormat; import org.apache.lucene.index.SegmentReadState; import org.apache.lucene.index.SegmentWriteState; /** * Wraps Faiss to create and search vector - * indexes. 
This class is mainly for backwards compatibility with older versions of Java (<22), use - * underlying format directly after upgrade. + * indexes. This class is mainly for backwards compatibility with older versions of Java (<22), + * use underlying format directly after upgrade. * * @lucene.experimental */ @@ -63,9 +62,9 @@ public FaissKnnVectorsFormatProvider(Object... args) { @SuppressWarnings("unused") ClassNotFoundException e) { - delegate = new Lucene99HnswVectorsFormat(); + delegate = null; Logger.getLogger(getClass().getName()) - .warning("FaissKnnVectorsFormat class missing, falling back to " + delegate); + .warning("FaissKnnVectorsFormat class missing, this object is unusable!"); } catch (NoSuchMethodException | IllegalAccessException e) { throw new LinkageError("FaissKnnVectorsFormat is missing correctly typed constructor", e); From 714ea4f0b4311de686a313794dda9b1a5bc02db2 Mon Sep 17 00:00:00 2001 From: Kaival Parikh Date: Thu, 30 Jan 2025 09:35:12 +0000 Subject: [PATCH 03/17] Use IDMap to store doc ids in Faiss - Create an index using `add_with_ids` instead of `add` - Remove `ordToDoc` from both indexing and search flows - Some misc changes and refactoring --- .../codecs/faiss/FaissKnnVectorsFormat.java | 3 +- .../codecs/faiss/FaissKnnVectorsReader.java | 24 +++---- .../codecs/faiss/FaissKnnVectorsWriter.java | 30 ++++---- .../sandbox/codecs/faiss/LibFaissC.java | 69 +++++++++---------- 4 files changed, 59 insertions(+), 67 deletions(-) diff --git a/lucene/sandbox/src/java22/org/apache/lucene/sandbox/codecs/faiss/FaissKnnVectorsFormat.java b/lucene/sandbox/src/java22/org/apache/lucene/sandbox/codecs/faiss/FaissKnnVectorsFormat.java index 141f8b7d613c..6a5bf3db27c7 100644 --- a/lucene/sandbox/src/java22/org/apache/lucene/sandbox/codecs/faiss/FaissKnnVectorsFormat.java +++ b/lucene/sandbox/src/java22/org/apache/lucene/sandbox/codecs/faiss/FaissKnnVectorsFormat.java @@ -39,8 +39,9 @@ public final class FaissKnnVectorsFormat extends KnnVectorsFormat { 
private final String indexParams; private final KnnVectorsFormat rawVectorsFormat; + @SuppressWarnings("unused") public FaissKnnVectorsFormat() { - this("HNSW32", "efConstruction=200"); + this("IDMap,HNSW32", "efConstruction=200"); } public FaissKnnVectorsFormat(String description, String indexParams) { diff --git a/lucene/sandbox/src/java22/org/apache/lucene/sandbox/codecs/faiss/FaissKnnVectorsReader.java b/lucene/sandbox/src/java22/org/apache/lucene/sandbox/codecs/faiss/FaissKnnVectorsReader.java index 4584a91155e4..836974d43b1a 100644 --- a/lucene/sandbox/src/java22/org/apache/lucene/sandbox/codecs/faiss/FaissKnnVectorsReader.java +++ b/lucene/sandbox/src/java22/org/apache/lucene/sandbox/codecs/faiss/FaissKnnVectorsReader.java @@ -51,7 +51,7 @@ public final class FaissKnnVectorsReader extends KnnVectorsReader { private final KnnVectorsReader rawVectorsReader; private final IndexInput meta, data; - private final Map indexMap; + private final Map indexMap; private final Arena arena; public FaissKnnVectorsReader(SegmentReadState state, KnnVectorsReader rawVectorsReader) @@ -79,7 +79,7 @@ public FaissKnnVectorsReader(SegmentReadState state, KnnVectorsReader rawVectors VERSION_CURRENT, state.context.withReadAdvice(ReadAdvice.RANDOM)); - Map.Entry entry; + Map.Entry entry; while ((entry = parseNextField(state)) != null) { this.indexMap.put(entry.getKey(), entry.getValue()); } @@ -110,7 +110,7 @@ private IndexInput openInput( return input; } - private Map.Entry parseNextField(SegmentReadState state) + private Map.Entry parseNextField(SegmentReadState state) throws IOException { int fieldNumber = meta.readInt(); if (fieldNumber == -1) { @@ -119,13 +119,7 @@ private Map.Entry parseNextField(SegmentReadState state FieldInfo fieldInfo = state.fieldInfos.fieldInfo(fieldNumber); if (fieldInfo == null) { - throw new IllegalStateException("invalid field"); - } - - int size = meta.readInt(); - int[] ordToDoc = new int[size]; - for (int i = 0; i < size; i++) { - ordToDoc[i] = 
meta.readInt(); + throw new IllegalStateException("Invalid field"); } long dataOffset = meta.readLong(); @@ -150,7 +144,7 @@ private Map.Entry parseNextField(SegmentReadState state // Cleanup Files.delete(tempFile); - return Map.entry(fieldInfo.name, new LibFaissC.Index(indexPointer, ordToDoc)); + return Map.entry(fieldInfo.name, indexPointer); } @Override @@ -167,19 +161,23 @@ public FloatVectorValues getFloatVectorValues(String field) throws IOException { @Override public ByteVectorValues getByteVectorValues(String field) { + // TODO: Support using SQ8 quantization, see + // https://github.com/opensearch-project/k-NN/pull/2425 throw new UnsupportedOperationException("Byte vectors not supported"); } @Override public void search(String field, float[] vector, KnnCollector knnCollector, Bits acceptDocs) { - LibFaissC.Index entry = indexMap.get(field); + MemorySegment entry = indexMap.get(field); if (entry != null) { - indexSearch(entry.indexPointer(), entry.ordToDoc(), vector, knnCollector, acceptDocs); + indexSearch(entry, vector, knnCollector, acceptDocs); } } @Override public void search(String field, byte[] vector, KnnCollector knnCollector, Bits acceptDocs) { + // TODO: Support using SQ8 quantization, see + // https://github.com/opensearch-project/k-NN/pull/2425 throw new UnsupportedOperationException("Byte vectors not supported"); } diff --git a/lucene/sandbox/src/java22/org/apache/lucene/sandbox/codecs/faiss/FaissKnnVectorsWriter.java b/lucene/sandbox/src/java22/org/apache/lucene/sandbox/codecs/faiss/FaissKnnVectorsWriter.java index 2848e83d5f3d..94dbe715d1be 100644 --- a/lucene/sandbox/src/java22/org/apache/lucene/sandbox/codecs/faiss/FaissKnnVectorsWriter.java +++ b/lucene/sandbox/src/java22/org/apache/lucene/sandbox/codecs/faiss/FaissKnnVectorsWriter.java @@ -89,7 +89,11 @@ private IndexOutput openOutput(SegmentWriteState state, String extension, String @SuppressWarnings("unchecked") public KnnFieldVectorsWriter addField(FieldInfo fieldInfo) throws 
IOException { return switch (fieldInfo.getVectorEncoding()) { - case BYTE -> throw new UnsupportedOperationException("Byte vectors not supported"); + case BYTE -> + // TODO: Support using SQ8 quantization, see + // https://github.com/opensearch-project/k-NN/pull/2425 + throw new UnsupportedOperationException("Byte vectors not supported"); + case FLOAT32 -> new KnnFieldVectorsWriter() { private final KnnFieldVectorsWriter rawWriter = @@ -128,29 +132,17 @@ protected void writeField(FieldInfo fieldInfo, FloatVectorValues floatVectorValu int number = fieldInfo.number; meta.writeInt(number); - VectorSimilarityFunction function = fieldInfo.getVectorSimilarityFunction(); - int size = floatVectorValues.size(); - // TODO: Non FS-based approach? Path tempFile = Files.createTempFile(NAME, fieldInfo.name); - int[] ordToDoc; // Write index to temp file and deallocate from memory try (Arena temp = Arena.ofConfined()) { - LibFaissC.Index result = createIndex(description, indexParams, function, floatVectorValues); - MemorySegment localIndex = - result - .indexPointer() + VectorSimilarityFunction function = fieldInfo.getVectorSimilarityFunction(); + MemorySegment indexPointer = + createIndex(description, indexParams, function, floatVectorValues) // Assign index to explicit scope for timely cleanup .reinterpret(temp, LibFaissC::freeIndex); - indexWrite(localIndex, tempFile.toString()); - ordToDoc = result.ordToDoc(); - } - - // Write ordinal map - meta.writeInt(size); - for (int doc : ordToDoc) { - meta.writeInt(doc); + indexWrite(indexPointer, tempFile.toString()); } // Copy temp file to index @@ -169,6 +161,8 @@ protected void writeField(FieldInfo fieldInfo, FloatVectorValues floatVectorValu @Override protected void writeField(FieldInfo fieldInfo, ByteVectorValues byteVectorValues, int maxDoc) { + // TODO: Support using SQ8 quantization, see + // https://github.com/opensearch-project/k-NN/pull/2425 throw new UnsupportedOperationException("Byte vectors not supported"); } @@ -181,7 
+175,7 @@ public void flush(int maxDoc, Sorter.DocMap sortMap) throws IOException { @Override public void finish() throws IOException { if (finished) { - throw new IllegalStateException("already finished"); + throw new IllegalStateException("Already finished"); } finished = true; diff --git a/lucene/sandbox/src/java22/org/apache/lucene/sandbox/codecs/faiss/LibFaissC.java b/lucene/sandbox/src/java22/org/apache/lucene/sandbox/codecs/faiss/LibFaissC.java index bf89d74806a7..9a3efa1eb6ff 100644 --- a/lucene/sandbox/src/java22/org/apache/lucene/sandbox/codecs/faiss/LibFaissC.java +++ b/lucene/sandbox/src/java22/org/apache/lucene/sandbox/codecs/faiss/LibFaissC.java @@ -21,6 +21,7 @@ import static java.lang.foreign.ValueLayout.JAVA_INT; import static java.lang.foreign.ValueLayout.JAVA_LONG; +import java.io.IOException; import java.lang.foreign.Arena; import java.lang.foreign.FunctionDescriptor; import java.lang.foreign.Linker; @@ -30,6 +31,7 @@ import java.lang.invoke.MethodHandle; import java.nio.ByteOrder; import java.nio.FloatBuffer; +import java.nio.LongBuffer; import java.util.Locale; import org.apache.lucene.index.FloatVectorValues; import org.apache.lucene.index.KnnVectorValues; @@ -38,11 +40,12 @@ import org.apache.lucene.util.Bits; public final class LibFaissC { + public static final String LIBRARY_NAME = "faiss_c"; public static final String LIBRARY_VERSION = "1.9.0"; static { try { - System.loadLibrary("faiss_c"); + System.loadLibrary(LIBRARY_NAME); } catch (UnsatisfiedLinkError e) { throw new RuntimeException( "Shared library not found, build the Faiss C_API from https://github.com/facebookresearch/faiss/blob/main/c_api/INSTALL.md " @@ -106,16 +109,15 @@ private static void freeParameterSpace(MemorySegment parameterSpacePointer) { private static final MethodHandle INDEX_TRAIN = getMethodHandle("faiss_Index_train", JAVA_INT, ADDRESS, JAVA_LONG, ADDRESS); - private static final MethodHandle INDEX_ADD = - getMethodHandle("faiss_Index_add", JAVA_INT, ADDRESS, 
JAVA_LONG, ADDRESS); + private static final MethodHandle INDEX_ADD_WITH_IDS = + getMethodHandle("faiss_Index_add_with_ids", JAVA_INT, ADDRESS, JAVA_LONG, ADDRESS, ADDRESS); - public record Index(MemorySegment indexPointer, int[] ordToDoc) {} - - public static Index createIndex( + public static MemorySegment createIndex( String description, String indexParams, VectorSimilarityFunction function, - FloatVectorValues floatVectorValues) { + FloatVectorValues floatVectorValues) + throws IOException { try (Arena temp = Arena.ofConfined()) { int size = floatVectorValues.size(); @@ -126,7 +128,7 @@ public static Index createIndex( switch (function) { case DOT_PRODUCT -> 0; case EUCLIDEAN -> 1; - default -> throw new UnsupportedOperationException("metric type not supported"); + default -> throw new UnsupportedOperationException("Metric type not supported"); }; // Create an index @@ -148,24 +150,25 @@ public static Index createIndex( MemorySegment docs = temp.allocate(JAVA_FLOAT, (long) size * dimension); FloatBuffer docsBuffer = docs.asByteBuffer().order(ByteOrder.nativeOrder()).asFloatBuffer(); - int[] ordToDoc = new int[size]; + // Allocate ids in native memory + MemorySegment ids = temp.allocate(JAVA_LONG, size); + LongBuffer idsBuffer = ids.asByteBuffer().order(ByteOrder.nativeOrder()).asLongBuffer(); + KnnVectorValues.DocIndexIterator iterator = floatVectorValues.iterator(); for (int i = 0; i < size; i++) { - ordToDoc[i] = iterator.nextDoc(); - docsBuffer.put(floatVectorValues.vectorValue(iterator.index())); + idsBuffer.put(iterator.nextDoc()); + docsBuffer.put(floatVectorValues.vectorValue(i)); } // Train index - if ((int) INDEX_IS_TRAINED.invokeExact(indexPointer) == 0) { + if (callAndGetInt(INDEX_IS_TRAINED, indexPointer) == 0) { callAndHandleError(INDEX_TRAIN, indexPointer, size, docs); } // Add docs to index - callAndHandleError(INDEX_ADD, indexPointer, size, docs); + callAndHandleError(INDEX_ADD_WITH_IDS, indexPointer, size, docs, ids); - return new 
Index(indexPointer, ordToDoc); - } catch (Throwable t) { - throw new RuntimeException(t); + return indexPointer; } } @@ -194,11 +197,7 @@ public static MemorySegment indexRead(String fileName, int ioFlags) { "faiss_Index_search", JAVA_INT, ADDRESS, JAVA_LONG, ADDRESS, JAVA_INT, ADDRESS, ADDRESS); public static void indexSearch( - MemorySegment indexPointer, - int[] ordToDoc, - float[] query, - KnnCollector knnCollector, - Bits acceptDocs) { + MemorySegment indexPointer, float[] query, KnnCollector knnCollector, Bits acceptDocs) { try (Arena temp = Arena.ofConfined()) { // Allocate queries in native memory MemorySegment queries = temp.allocate(JAVA_FLOAT, query.length); @@ -222,13 +221,9 @@ public static void indexSearch( // Record hits for (int i = 0; i < k; i++) { - int ord = (int) ids[i]; - if (ord < 0) { - break; - } + int doc = (int) ids[i]; // TODO: This is like a post-filter, include at runtime? - int doc = ordToDoc[ord]; if (acceptDocs == null || acceptDocs.get(doc)) { knnCollector.collect(doc, distances[i]); } @@ -239,6 +234,14 @@ public static void indexSearch( private static final MethodHandle GET_LAST_ERROR = getMethodHandle("faiss_get_last_error", ADDRESS); + private static int callAndGetInt(MethodHandle handle, Object... args) { + try { + return (int) handle.invokeWithArguments(args); + } catch (Throwable e) { + throw new RuntimeException(e); + } + } + private static String callAndGetString(MethodHandle handle, Object... args) { try { MemorySegment segment = (MemorySegment) handle.invokeWithArguments(args); @@ -249,18 +252,14 @@ private static String callAndGetString(MethodHandle handle, Object... args) { } private static void callAndHandleError(MethodHandle handle, Object... 
args) { - try { - int returnCode = (int) handle.invokeWithArguments(args); - if (returnCode < 0) { - String error = callAndGetString(GET_LAST_ERROR); - throw new FaissException(error); - } - } catch (Throwable t) { - throw new RuntimeException(t); + int returnCode = callAndGetInt(handle, args); + if (returnCode < 0) { + String error = callAndGetString(GET_LAST_ERROR); + throw new FaissException(error); } } - public static class FaissException extends Exception { + public static class FaissException extends RuntimeException { public FaissException(String message) { super(message); } From 1d0f666e6c548e4de0cbdc7291e8ebd31b46e849 Mon Sep 17 00:00:00 2001 From: Kaival Parikh Date: Fri, 31 Jan 2025 12:19:38 +0000 Subject: [PATCH 04/17] Reduce index time RAM usage - Move away from BufferingKnnVectorsWriter and use vectors exposed by the FlatVectorsWriter - Read disk-backed vectors at merge time --- .../codecs/faiss/FaissKnnVectorsFormat.java | 3 +- .../codecs/faiss/FaissKnnVectorsReader.java | 5 +- .../codecs/faiss/FaissKnnVectorsWriter.java | 156 ++++++++++++------ .../sandbox/codecs/faiss/LibFaissC.java | 6 +- 4 files changed, 114 insertions(+), 56 deletions(-) diff --git a/lucene/sandbox/src/java22/org/apache/lucene/sandbox/codecs/faiss/FaissKnnVectorsFormat.java b/lucene/sandbox/src/java22/org/apache/lucene/sandbox/codecs/faiss/FaissKnnVectorsFormat.java index 6a5bf3db27c7..1e7fabcd05bc 100644 --- a/lucene/sandbox/src/java22/org/apache/lucene/sandbox/codecs/faiss/FaissKnnVectorsFormat.java +++ b/lucene/sandbox/src/java22/org/apache/lucene/sandbox/codecs/faiss/FaissKnnVectorsFormat.java @@ -22,6 +22,7 @@ import org.apache.lucene.codecs.KnnVectorsReader; import org.apache.lucene.codecs.KnnVectorsWriter; import org.apache.lucene.codecs.hnsw.FlatVectorScorerUtil; +import org.apache.lucene.codecs.hnsw.FlatVectorsFormat; import org.apache.lucene.codecs.lucene99.Lucene99FlatVectorsFormat; import org.apache.lucene.index.SegmentReadState; import 
org.apache.lucene.index.SegmentWriteState; @@ -37,7 +38,7 @@ public final class FaissKnnVectorsFormat extends KnnVectorsFormat { private final String description; private final String indexParams; - private final KnnVectorsFormat rawVectorsFormat; + private final FlatVectorsFormat rawVectorsFormat; @SuppressWarnings("unused") public FaissKnnVectorsFormat() { diff --git a/lucene/sandbox/src/java22/org/apache/lucene/sandbox/codecs/faiss/FaissKnnVectorsReader.java b/lucene/sandbox/src/java22/org/apache/lucene/sandbox/codecs/faiss/FaissKnnVectorsReader.java index 836974d43b1a..a1cebfa86ad4 100644 --- a/lucene/sandbox/src/java22/org/apache/lucene/sandbox/codecs/faiss/FaissKnnVectorsReader.java +++ b/lucene/sandbox/src/java22/org/apache/lucene/sandbox/codecs/faiss/FaissKnnVectorsReader.java @@ -35,6 +35,7 @@ import java.util.Map; import org.apache.lucene.codecs.CodecUtil; import org.apache.lucene.codecs.KnnVectorsReader; +import org.apache.lucene.codecs.hnsw.FlatVectorsReader; import org.apache.lucene.index.ByteVectorValues; import org.apache.lucene.index.FieldInfo; import org.apache.lucene.index.FloatVectorValues; @@ -49,12 +50,12 @@ import org.apache.lucene.util.IOUtils; public final class FaissKnnVectorsReader extends KnnVectorsReader { - private final KnnVectorsReader rawVectorsReader; + private final FlatVectorsReader rawVectorsReader; private final IndexInput meta, data; private final Map indexMap; private final Arena arena; - public FaissKnnVectorsReader(SegmentReadState state, KnnVectorsReader rawVectorsReader) + public FaissKnnVectorsReader(SegmentReadState state, FlatVectorsReader rawVectorsReader) throws IOException { this.rawVectorsReader = rawVectorsReader; this.indexMap = new HashMap<>(); diff --git a/lucene/sandbox/src/java22/org/apache/lucene/sandbox/codecs/faiss/FaissKnnVectorsWriter.java b/lucene/sandbox/src/java22/org/apache/lucene/sandbox/codecs/faiss/FaissKnnVectorsWriter.java index 94dbe715d1be..ef15066897f5 100644 --- 
a/lucene/sandbox/src/java22/org/apache/lucene/sandbox/codecs/faiss/FaissKnnVectorsWriter.java +++ b/lucene/sandbox/src/java22/org/apache/lucene/sandbox/codecs/faiss/FaissKnnVectorsWriter.java @@ -26,15 +26,19 @@ import static org.apache.lucene.sandbox.codecs.faiss.LibFaissC.indexWrite; import java.io.IOException; +import java.io.UncheckedIOException; import java.lang.foreign.Arena; import java.lang.foreign.MemorySegment; import java.nio.file.Files; import java.nio.file.Path; -import org.apache.lucene.codecs.BufferingKnnVectorsWriter; +import java.util.HashMap; +import java.util.List; +import java.util.Map; import org.apache.lucene.codecs.CodecUtil; import org.apache.lucene.codecs.KnnFieldVectorsWriter; import org.apache.lucene.codecs.KnnVectorsWriter; -import org.apache.lucene.index.ByteVectorValues; +import org.apache.lucene.codecs.hnsw.FlatFieldVectorsWriter; +import org.apache.lucene.codecs.hnsw.FlatVectorsWriter; import org.apache.lucene.index.FieldInfo; import org.apache.lucene.index.FloatVectorValues; import org.apache.lucene.index.IndexFileNames; @@ -42,25 +46,30 @@ import org.apache.lucene.index.SegmentWriteState; import org.apache.lucene.index.Sorter; import org.apache.lucene.index.VectorSimilarityFunction; +import org.apache.lucene.search.DocIdSet; import org.apache.lucene.store.IndexOutput; import org.apache.lucene.store.InputStreamDataInput; import org.apache.lucene.util.IOUtils; +import org.apache.lucene.util.hnsw.IntToIntFunction; -public final class FaissKnnVectorsWriter extends BufferingKnnVectorsWriter { +public final class FaissKnnVectorsWriter extends KnnVectorsWriter { private final String description, indexParams; - private final KnnVectorsWriter rawVectorsWriter; + private final FlatVectorsWriter rawVectorsWriter; private final IndexOutput meta, data; + private final Map> rawFields; private boolean finished; public FaissKnnVectorsWriter( String description, String indexParams, SegmentWriteState state, - KnnVectorsWriter rawVectorsWriter) + 
FlatVectorsWriter rawVectorsWriter) throws IOException { + this.description = description; this.indexParams = indexParams; this.rawVectorsWriter = rawVectorsWriter; + this.rawFields = new HashMap<>(); this.finished = false; boolean failure = true; @@ -86,48 +95,59 @@ private IndexOutput openOutput(SegmentWriteState state, String extension, String } @Override - @SuppressWarnings("unchecked") - public KnnFieldVectorsWriter addField(FieldInfo fieldInfo) throws IOException { - return switch (fieldInfo.getVectorEncoding()) { + public void mergeOneField(FieldInfo fieldInfo, MergeState mergeState) throws IOException { + rawVectorsWriter.mergeOneField(fieldInfo, mergeState); + switch (fieldInfo.getVectorEncoding()) { case BYTE -> // TODO: Support using SQ8 quantization, see // https://github.com/opensearch-project/k-NN/pull/2425 throw new UnsupportedOperationException("Byte vectors not supported"); - - case FLOAT32 -> - new KnnFieldVectorsWriter() { - private final KnnFieldVectorsWriter rawWriter = - (KnnFieldVectorsWriter) rawVectorsWriter.addField(fieldInfo); - private final KnnFieldVectorsWriter writer = - (KnnFieldVectorsWriter) FaissKnnVectorsWriter.super.addField(fieldInfo); - - @Override - public long ramBytesUsed() { - return rawWriter.ramBytesUsed() + writer.ramBytesUsed(); - } - - @Override - public void addValue(int i, float[] floats) throws IOException { - rawWriter.addValue(i, floats); - writer.addValue(i, floats); - } - - @Override - public float[] copyValue(float[] floats) { - return floats.clone(); - } - }; - }; + case FLOAT32 -> { + FloatVectorValues merged = + KnnVectorsWriter.MergedVectorValues.mergeFloatVectorValues(fieldInfo, mergeState); + writeFloatField(fieldInfo, merged, doc -> doc); + } + } } @Override - public void mergeOneField(FieldInfo fieldInfo, MergeState mergeState) throws IOException { - rawVectorsWriter.mergeOneField(fieldInfo, mergeState); - super.mergeOneField(fieldInfo, mergeState); + public KnnFieldVectorsWriter addField(FieldInfo 
fieldInfo) throws IOException { + FlatFieldVectorsWriter rawFieldVectorsWriter = rawVectorsWriter.addField(fieldInfo); + rawFields.put(fieldInfo, rawFieldVectorsWriter); + return rawFieldVectorsWriter; } @Override - protected void writeField(FieldInfo fieldInfo, FloatVectorValues floatVectorValues, int maxDoc) + public void flush(int maxDoc, Sorter.DocMap sortMap) throws IOException { + rawVectorsWriter.flush(maxDoc, sortMap); + for (Map.Entry> entry : rawFields.entrySet()) { + FieldInfo fieldInfo = entry.getKey(); + switch (fieldInfo.getVectorEncoding()) { + case BYTE -> + // TODO: Support using SQ8 quantization, see + // https://github.com/opensearch-project/k-NN/pull/2425 + throw new UnsupportedOperationException("Byte vectors not supported"); + + case FLOAT32 -> { + @SuppressWarnings("unchecked") + FlatFieldVectorsWriter rawWriter = + (FlatFieldVectorsWriter) entry.getValue(); + + List vectors = rawWriter.getVectors(); + int dimension = fieldInfo.getVectorDimension(); + DocIdSet docIdSet = rawWriter.getDocsWithFieldSet(); + + writeFloatField( + fieldInfo, + new BufferedFloatVectorValues(vectors, dimension, docIdSet), + (sortMap != null) ? 
sortMap::oldToNew : doc -> doc); + } + } + } + } + + private void writeFloatField( + FieldInfo fieldInfo, FloatVectorValues floatVectorValues, IntToIntFunction oldToNewDocId) throws IOException { int number = fieldInfo.number; meta.writeInt(number); @@ -139,7 +159,7 @@ protected void writeField(FieldInfo fieldInfo, FloatVectorValues floatVectorValu try (Arena temp = Arena.ofConfined()) { VectorSimilarityFunction function = fieldInfo.getVectorSimilarityFunction(); MemorySegment indexPointer = - createIndex(description, indexParams, function, floatVectorValues) + createIndex(description, indexParams, function, floatVectorValues, oldToNewDocId) // Assign index to explicit scope for timely cleanup .reinterpret(temp, LibFaissC::freeIndex); indexWrite(indexPointer, tempFile.toString()); @@ -159,19 +179,6 @@ protected void writeField(FieldInfo fieldInfo, FloatVectorValues floatVectorValu meta.writeLong(dataLength); } - @Override - protected void writeField(FieldInfo fieldInfo, ByteVectorValues byteVectorValues, int maxDoc) { - // TODO: Support using SQ8 quantization, see - // https://github.com/opensearch-project/k-NN/pull/2425 - throw new UnsupportedOperationException("Byte vectors not supported"); - } - - @Override - public void flush(int maxDoc, Sorter.DocMap sortMap) throws IOException { - rawVectorsWriter.flush(maxDoc, sortMap); - super.flush(maxDoc, sortMap); - } - @Override public void finish() throws IOException { if (finished) { @@ -195,4 +202,51 @@ public void close() throws IOException { data.close(); } } + + @Override + public long ramBytesUsed() { + // TODO: How to estimate Faiss usage? 
+ return rawVectorsWriter.ramBytesUsed(); + } + + private static class BufferedFloatVectorValues extends FloatVectorValues { + private final List floats; + private final int dimension; + private final DocIdSet docIdSet; + + public BufferedFloatVectorValues(List floats, int dimension, DocIdSet docIdSet) { + this.floats = floats; + this.dimension = dimension; + this.docIdSet = docIdSet; + } + + @Override + public float[] vectorValue(int ord) { + return floats.get(ord); + } + + @Override + public int dimension() { + return dimension; + } + + @Override + public int size() { + return floats.size(); + } + + @Override + public FloatVectorValues copy() { + return new BufferedFloatVectorValues(floats, dimension, docIdSet); + } + + @Override + public DocIndexIterator iterator() { + try { + return fromDISI(docIdSet.iterator()); + } catch (IOException e) { + throw new UncheckedIOException(e); + } + } + } } diff --git a/lucene/sandbox/src/java22/org/apache/lucene/sandbox/codecs/faiss/LibFaissC.java b/lucene/sandbox/src/java22/org/apache/lucene/sandbox/codecs/faiss/LibFaissC.java index 9a3efa1eb6ff..187e5ee07366 100644 --- a/lucene/sandbox/src/java22/org/apache/lucene/sandbox/codecs/faiss/LibFaissC.java +++ b/lucene/sandbox/src/java22/org/apache/lucene/sandbox/codecs/faiss/LibFaissC.java @@ -38,6 +38,7 @@ import org.apache.lucene.index.VectorSimilarityFunction; import org.apache.lucene.search.KnnCollector; import org.apache.lucene.util.Bits; +import org.apache.lucene.util.hnsw.IntToIntFunction; public final class LibFaissC { public static final String LIBRARY_NAME = "faiss_c"; @@ -116,7 +117,8 @@ public static MemorySegment createIndex( String description, String indexParams, VectorSimilarityFunction function, - FloatVectorValues floatVectorValues) + FloatVectorValues floatVectorValues, + IntToIntFunction oldToNewDocId) throws IOException { try (Arena temp = Arena.ofConfined()) { @@ -156,7 +158,7 @@ public static MemorySegment createIndex( KnnVectorValues.DocIndexIterator 
iterator = floatVectorValues.iterator(); for (int i = 0; i < size; i++) { - idsBuffer.put(iterator.nextDoc()); + idsBuffer.put(oldToNewDocId.apply(iterator.nextDoc())); docsBuffer.put(floatVectorValues.vectorValue(i)); } From 07c7d7579208e9f8bcc915825fd3389c39e0a826 Mon Sep 17 00:00:00 2001 From: Kaival Parikh Date: Tue, 4 Feb 2025 18:34:03 +0000 Subject: [PATCH 05/17] Filter documents during graph search - Add TODOs for additional changes - Also bump faiss_c to v1.10.0 --- .../sandbox/codecs/faiss/LibFaissC.java | 98 +++++++++++++++++-- 1 file changed, 88 insertions(+), 10 deletions(-) diff --git a/lucene/sandbox/src/java22/org/apache/lucene/sandbox/codecs/faiss/LibFaissC.java b/lucene/sandbox/src/java22/org/apache/lucene/sandbox/codecs/faiss/LibFaissC.java index 187e5ee07366..929eec2b4dbd 100644 --- a/lucene/sandbox/src/java22/org/apache/lucene/sandbox/codecs/faiss/LibFaissC.java +++ b/lucene/sandbox/src/java22/org/apache/lucene/sandbox/codecs/faiss/LibFaissC.java @@ -38,11 +38,12 @@ import org.apache.lucene.index.VectorSimilarityFunction; import org.apache.lucene.search.KnnCollector; import org.apache.lucene.util.Bits; +import org.apache.lucene.util.FixedBitSet; import org.apache.lucene.util.hnsw.IntToIntFunction; public final class LibFaissC { public static final String LIBRARY_NAME = "faiss_c"; - public static final String LIBRARY_VERSION = "1.9.0"; + public static final String LIBRARY_VERSION = "1.10.0"; static { try { @@ -94,6 +95,20 @@ private static void freeParameterSpace(MemorySegment parameterSpacePointer) { callAndHandleError(FREE_PARAMETER_SPACE, parameterSpacePointer); } + private static final MethodHandle FREE_ID_SELECTOR_BITMAP = + getMethodHandle("faiss_IDSelectorBitmap_free", JAVA_INT, ADDRESS); + + private static void freeIDSelectorBitmap(MemorySegment idSelectorBitmapPointer) { + callAndHandleError(FREE_ID_SELECTOR_BITMAP, idSelectorBitmapPointer); + } + + private static final MethodHandle FREE_SEARCH_PARAMETERS = + 
getMethodHandle("faiss_SearchParameters_free", JAVA_INT, ADDRESS); + + private static void freeSearchParameters(MemorySegment searchParametersPointer) { + callAndHandleError(FREE_SEARCH_PARAMETERS, searchParametersPointer); + } + private static final MethodHandle INDEX_FACTORY = getMethodHandle("faiss_index_factory", JAVA_INT, ADDRESS, JAVA_INT, ADDRESS, JAVA_INT); @@ -104,6 +119,13 @@ private static void freeParameterSpace(MemorySegment parameterSpacePointer) { getMethodHandle( "faiss_ParameterSpace_set_index_parameters", JAVA_INT, ADDRESS, ADDRESS, ADDRESS); + // TODO: Requires https://github.com/facebookresearch/faiss/pull/4158 + private static final MethodHandle ID_SELECTOR_BITMAP_NEW = + getMethodHandle("faiss_IDSelectorBitmap_new", JAVA_INT, ADDRESS, JAVA_LONG, ADDRESS); + + private static final MethodHandle SEARCH_PARAMETERS_NEW = + getMethodHandle("faiss_SearchParameters_new", JAVA_INT, ADDRESS, ADDRESS); + private static final MethodHandle INDEX_IS_TRAINED = getMethodHandle("faiss_Index_is_trained", JAVA_INT, ADDRESS); @@ -141,7 +163,11 @@ public static MemorySegment createIndex( // Set index params callAndHandleError(PARAMETER_SPACE_NEW, pointer); MemorySegment parameterSpacePointer = - pointer.get(ADDRESS, 0).reinterpret(temp, LibFaissC::freeParameterSpace); + pointer + .get(ADDRESS, 0) + // Ensure timely cleanup + .reinterpret(temp, LibFaissC::freeParameterSpace); + callAndHandleError( SET_INDEX_PARAMETERS, parameterSpacePointer, @@ -196,11 +222,31 @@ public static MemorySegment indexRead(String fileName, int ioFlags) { private static final MethodHandle INDEX_SEARCH = getMethodHandle( - "faiss_Index_search", JAVA_INT, ADDRESS, JAVA_LONG, ADDRESS, JAVA_INT, ADDRESS, ADDRESS); + "faiss_Index_search", JAVA_INT, ADDRESS, JAVA_LONG, ADDRESS, JAVA_LONG, ADDRESS, ADDRESS); + + private static final MethodHandle INDEX_SEARCH_WITH_PARAMS = + getMethodHandle( + "faiss_Index_search_with_params", + JAVA_INT, + ADDRESS, + JAVA_LONG, + ADDRESS, + JAVA_LONG, + ADDRESS, 
+ ADDRESS, + ADDRESS); public static void indexSearch( MemorySegment indexPointer, float[] query, KnnCollector knnCollector, Bits acceptDocs) { try (Arena temp = Arena.ofConfined()) { + FixedBitSet fixedBitSet = + switch (acceptDocs) { + case null -> null; + case FixedBitSet bitSet -> bitSet; + // TODO: Add optimized case for SparseFixedBitSet + case Bits bits -> FixedBitSet.copyOf(bits); + }; + // Allocate queries in native memory MemorySegment queries = temp.allocate(JAVA_FLOAT, query.length); queries.asByteBuffer().order(ByteOrder.nativeOrder()).asFloatBuffer().put(query); @@ -211,7 +257,44 @@ public static void indexSearch( MemorySegment idsPointer = temp.allocate(JAVA_LONG, k); MemorySegment localIndex = indexPointer.reinterpret(temp, null); - callAndHandleError(INDEX_SEARCH, localIndex, 1, queries, k, distancesPointer, idsPointer); + if (fixedBitSet == null) { + // Search without runtime filters + callAndHandleError(INDEX_SEARCH, localIndex, 1, queries, k, distancesPointer, idsPointer); + } else { + MemorySegment pointer = temp.allocate(ADDRESS); + + long[] bits = fixedBitSet.getBits(); + MemorySegment nativeBits = temp.allocate(JAVA_LONG, bits.length); + + // Use LITTLE_ENDIAN to convert long[] -> uint8_t* + nativeBits.asByteBuffer().order(ByteOrder.LITTLE_ENDIAN).asLongBuffer().put(bits); + + callAndHandleError(ID_SELECTOR_BITMAP_NEW, pointer, fixedBitSet.length(), nativeBits); + MemorySegment idSelectorBitmapPointer = + pointer + .get(ADDRESS, 0) + // Ensure timely cleanup + .reinterpret(temp, LibFaissC::freeIDSelectorBitmap); + + callAndHandleError(SEARCH_PARAMETERS_NEW, pointer, idSelectorBitmapPointer); + MemorySegment searchParametersPointer = + pointer + .get(ADDRESS, 0) + // Ensure timely cleanup + .reinterpret(temp, LibFaissC::freeSearchParameters); + + // Search with runtime filters + // TODO: Requires https://github.com/facebookresearch/faiss/pull/4167 + callAndHandleError( + INDEX_SEARCH_WITH_PARAMS, + localIndex, + 1, + queries, + k, + 
searchParametersPointer, + distancesPointer, + idsPointer); + } // Retrieve scores float[] distances = new float[k]; @@ -223,12 +306,7 @@ public static void indexSearch( // Record hits for (int i = 0; i < k; i++) { - int doc = (int) ids[i]; - - // TODO: This is like a post-filter, include at runtime? - if (acceptDocs == null || acceptDocs.get(doc)) { - knnCollector.collect(doc, distances[i]); - } + knnCollector.collect((int) ids[i], distances[i]); } } } From 789dda6e4ac008de3a452a2a7deb2fb2a06a7150 Mon Sep 17 00:00:00 2001 From: Kaival Parikh Date: Sun, 9 Feb 2025 11:48:20 +0000 Subject: [PATCH 06/17] Use custom readers and writers for index IO - Avoid temp files for reading and writing indexes --- .../codecs/faiss/FaissKnnVectorsReader.java | 24 +-- .../codecs/faiss/FaissKnnVectorsWriter.java | 30 ++-- .../sandbox/codecs/faiss/LibFaissC.java | 164 +++++++++++++++--- 3 files changed, 152 insertions(+), 66 deletions(-) diff --git a/lucene/sandbox/src/java22/org/apache/lucene/sandbox/codecs/faiss/FaissKnnVectorsReader.java b/lucene/sandbox/src/java22/org/apache/lucene/sandbox/codecs/faiss/FaissKnnVectorsReader.java index a1cebfa86ad4..d78514d99a5e 100644 --- a/lucene/sandbox/src/java22/org/apache/lucene/sandbox/codecs/faiss/FaissKnnVectorsReader.java +++ b/lucene/sandbox/src/java22/org/apache/lucene/sandbox/codecs/faiss/FaissKnnVectorsReader.java @@ -20,7 +20,6 @@ import static org.apache.lucene.sandbox.codecs.faiss.FaissKnnVectorsFormat.DATA_EXTENSION; import static org.apache.lucene.sandbox.codecs.faiss.FaissKnnVectorsFormat.META_CODEC_NAME; import static org.apache.lucene.sandbox.codecs.faiss.FaissKnnVectorsFormat.META_EXTENSION; -import static org.apache.lucene.sandbox.codecs.faiss.FaissKnnVectorsFormat.NAME; import static org.apache.lucene.sandbox.codecs.faiss.FaissKnnVectorsFormat.VERSION_CURRENT; import static org.apache.lucene.sandbox.codecs.faiss.FaissKnnVectorsFormat.VERSION_START; import static org.apache.lucene.sandbox.codecs.faiss.LibFaissC.indexRead; @@ 
-29,8 +28,6 @@ import java.io.IOException; import java.lang.foreign.Arena; import java.lang.foreign.MemorySegment; -import java.nio.file.Files; -import java.nio.file.Path; import java.util.HashMap; import java.util.Map; import org.apache.lucene.codecs.CodecUtil; @@ -44,7 +41,6 @@ import org.apache.lucene.search.KnnCollector; import org.apache.lucene.store.IOContext; import org.apache.lucene.store.IndexInput; -import org.apache.lucene.store.OutputStreamDataOutput; import org.apache.lucene.store.ReadAdvice; import org.apache.lucene.util.Bits; import org.apache.lucene.util.IOUtils; @@ -126,25 +122,15 @@ private Map.Entry parseNextField(SegmentReadState state) long dataOffset = meta.readLong(); long dataLength = meta.readLong(); - // Copy index to temp file - // TODO: Non FS-based approach? - Path tempFile = Files.createTempFile(NAME, fieldInfo.name); - try (OutputStreamDataOutput output = - new OutputStreamDataOutput(Files.newOutputStream(tempFile))) { - data.seek(dataOffset); - output.copyBytes(data, dataLength); - } - - // Read index from temp file into memory // See flags defined in c_api/index_io_c.h + int ioFlags = 3; + + // Read index into memory MemorySegment indexPointer = - indexRead(tempFile.toString(), 3) - // Assign index to explicit scope for timely cleanup + indexRead(data.slice(fieldInfo.name, dataOffset, dataLength), ioFlags) + // Ensure timely cleanup .reinterpret(arena, LibFaissC::freeIndex); - // Cleanup - Files.delete(tempFile); - return Map.entry(fieldInfo.name, indexPointer); } diff --git a/lucene/sandbox/src/java22/org/apache/lucene/sandbox/codecs/faiss/FaissKnnVectorsWriter.java b/lucene/sandbox/src/java22/org/apache/lucene/sandbox/codecs/faiss/FaissKnnVectorsWriter.java index ef15066897f5..3044092b871d 100644 --- a/lucene/sandbox/src/java22/org/apache/lucene/sandbox/codecs/faiss/FaissKnnVectorsWriter.java +++ b/lucene/sandbox/src/java22/org/apache/lucene/sandbox/codecs/faiss/FaissKnnVectorsWriter.java @@ -20,7 +20,6 @@ import static 
org.apache.lucene.sandbox.codecs.faiss.FaissKnnVectorsFormat.DATA_EXTENSION; import static org.apache.lucene.sandbox.codecs.faiss.FaissKnnVectorsFormat.META_CODEC_NAME; import static org.apache.lucene.sandbox.codecs.faiss.FaissKnnVectorsFormat.META_EXTENSION; -import static org.apache.lucene.sandbox.codecs.faiss.FaissKnnVectorsFormat.NAME; import static org.apache.lucene.sandbox.codecs.faiss.FaissKnnVectorsFormat.VERSION_CURRENT; import static org.apache.lucene.sandbox.codecs.faiss.LibFaissC.createIndex; import static org.apache.lucene.sandbox.codecs.faiss.LibFaissC.indexWrite; @@ -29,8 +28,6 @@ import java.io.UncheckedIOException; import java.lang.foreign.Arena; import java.lang.foreign.MemorySegment; -import java.nio.file.Files; -import java.nio.file.Path; import java.util.HashMap; import java.util.List; import java.util.Map; @@ -48,7 +45,6 @@ import org.apache.lucene.index.VectorSimilarityFunction; import org.apache.lucene.search.DocIdSet; import org.apache.lucene.store.IndexOutput; -import org.apache.lucene.store.InputStreamDataInput; import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.hnsw.IntToIntFunction; @@ -152,31 +148,25 @@ private void writeFloatField( int number = fieldInfo.number; meta.writeInt(number); - // TODO: Non FS-based approach? 
- Path tempFile = Files.createTempFile(NAME, fieldInfo.name); - // Write index to temp file and deallocate from memory try (Arena temp = Arena.ofConfined()) { VectorSimilarityFunction function = fieldInfo.getVectorSimilarityFunction(); MemorySegment indexPointer = createIndex(description, indexParams, function, floatVectorValues, oldToNewDocId) - // Assign index to explicit scope for timely cleanup + // Ensure timely cleanup .reinterpret(temp, LibFaissC::freeIndex); - indexWrite(indexPointer, tempFile.toString()); - } - // Copy temp file to index - long dataOffset = data.getFilePointer(); - try (InputStreamDataInput input = new InputStreamDataInput(Files.newInputStream(tempFile))) { - data.copyBytes(input, Files.size(tempFile)); - } - long dataLength = data.getFilePointer() - dataOffset; + // See flags defined in c_api/index_io_c.h + int ioFlags = 3; - // Cleanup temp file - Files.delete(tempFile); + // Write index + long dataOffset = data.getFilePointer(); + indexWrite(indexPointer, data, ioFlags); + long dataLength = data.getFilePointer() - dataOffset; - meta.writeLong(dataOffset); - meta.writeLong(dataLength); + meta.writeLong(dataOffset); + meta.writeLong(dataLength); + } } @Override diff --git a/lucene/sandbox/src/java22/org/apache/lucene/sandbox/codecs/faiss/LibFaissC.java b/lucene/sandbox/src/java22/org/apache/lucene/sandbox/codecs/faiss/LibFaissC.java index 929eec2b4dbd..0a22266256bc 100644 --- a/lucene/sandbox/src/java22/org/apache/lucene/sandbox/codecs/faiss/LibFaissC.java +++ b/lucene/sandbox/src/java22/org/apache/lucene/sandbox/codecs/faiss/LibFaissC.java @@ -29,6 +29,8 @@ import java.lang.foreign.MemorySegment; import java.lang.foreign.SymbolLookup; import java.lang.invoke.MethodHandle; +import java.lang.invoke.MethodHandles; +import java.lang.invoke.MethodType; import java.nio.ByteOrder; import java.nio.FloatBuffer; import java.nio.LongBuffer; @@ -37,11 +39,20 @@ import org.apache.lucene.index.KnnVectorValues; import 
org.apache.lucene.index.VectorSimilarityFunction; import org.apache.lucene.search.KnnCollector; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.store.IndexOutput; import org.apache.lucene.util.Bits; import org.apache.lucene.util.FixedBitSet; import org.apache.lucene.util.hnsw.IntToIntFunction; public final class LibFaissC { + /* + * TODO: Requires some changes to Faiss + * - https://github.com/facebookresearch/faiss/pull/4158 (merged in main, to be released in v1.11.0) + * - https://github.com/facebookresearch/faiss/pull/4167 (merged in main, to be released in v1.11.0) + * - https://github.com/facebookresearch/faiss/pull/4180 (in progress) + */ + public static final String LIBRARY_NAME = "faiss_c"; public static final String LIBRARY_VERSION = "1.10.0"; @@ -60,7 +71,14 @@ public final class LibFaissC { private LibFaissC() {} - private static MethodHandle getMethodHandle( + @SuppressWarnings("SameParameterValue") + private static MemorySegment getUpcallStub( + Arena arena, MethodHandle target, MemoryLayout resLayout, MemoryLayout... argLayouts) { + return Linker.nativeLinker() + .upcallStub(target, FunctionDescriptor.of(resLayout, argLayouts), arena); + } + + private static MethodHandle getDowncallHandle( String functionName, MemoryLayout resLayout, MemoryLayout... 
argLayouts) { return Linker.nativeLinker() .downcallHandle( @@ -69,7 +87,7 @@ private static MethodHandle getMethodHandle( } private static void checkLibraryVersion() { - MethodHandle getVersion = getMethodHandle("faiss_get_version", ADDRESS); + MethodHandle getVersion = getDowncallHandle("faiss_get_version", ADDRESS); String actualVersion = callAndGetString(getVersion); if (LIBRARY_VERSION.equals(actualVersion) == false) { throw new UnsupportedOperationException( @@ -82,58 +100,71 @@ private static void checkLibraryVersion() { } private static final MethodHandle FREE_INDEX = - getMethodHandle("faiss_Index_free", JAVA_INT, ADDRESS); + getDowncallHandle("faiss_Index_free", JAVA_INT, ADDRESS); public static void freeIndex(MemorySegment indexPointer) { callAndHandleError(FREE_INDEX, indexPointer); } + private static final MethodHandle FREE_CUSTOM_IO_WRITER = + getDowncallHandle("faiss_CustomIOWriter_free", JAVA_INT, ADDRESS); + + public static void freeCustomIOWriter(MemorySegment customIOWriterPointer) { + callAndHandleError(FREE_CUSTOM_IO_WRITER, customIOWriterPointer); + } + + private static final MethodHandle FREE_CUSTOM_IO_READER = + getDowncallHandle("faiss_CustomIOReader_free", JAVA_INT, ADDRESS); + + public static void freeCustomIOReader(MemorySegment customIOReaderPointer) { + callAndHandleError(FREE_CUSTOM_IO_READER, customIOReaderPointer); + } + private static final MethodHandle FREE_PARAMETER_SPACE = - getMethodHandle("faiss_ParameterSpace_free", JAVA_INT, ADDRESS); + getDowncallHandle("faiss_ParameterSpace_free", JAVA_INT, ADDRESS); private static void freeParameterSpace(MemorySegment parameterSpacePointer) { callAndHandleError(FREE_PARAMETER_SPACE, parameterSpacePointer); } private static final MethodHandle FREE_ID_SELECTOR_BITMAP = - getMethodHandle("faiss_IDSelectorBitmap_free", JAVA_INT, ADDRESS); + getDowncallHandle("faiss_IDSelectorBitmap_free", JAVA_INT, ADDRESS); private static void freeIDSelectorBitmap(MemorySegment idSelectorBitmapPointer) { 
callAndHandleError(FREE_ID_SELECTOR_BITMAP, idSelectorBitmapPointer); } private static final MethodHandle FREE_SEARCH_PARAMETERS = - getMethodHandle("faiss_SearchParameters_free", JAVA_INT, ADDRESS); + getDowncallHandle("faiss_SearchParameters_free", JAVA_INT, ADDRESS); private static void freeSearchParameters(MemorySegment searchParametersPointer) { callAndHandleError(FREE_SEARCH_PARAMETERS, searchParametersPointer); } private static final MethodHandle INDEX_FACTORY = - getMethodHandle("faiss_index_factory", JAVA_INT, ADDRESS, JAVA_INT, ADDRESS, JAVA_INT); + getDowncallHandle("faiss_index_factory", JAVA_INT, ADDRESS, JAVA_INT, ADDRESS, JAVA_INT); private static final MethodHandle PARAMETER_SPACE_NEW = - getMethodHandle("faiss_ParameterSpace_new", JAVA_INT, ADDRESS); + getDowncallHandle("faiss_ParameterSpace_new", JAVA_INT, ADDRESS); private static final MethodHandle SET_INDEX_PARAMETERS = - getMethodHandle( + getDowncallHandle( "faiss_ParameterSpace_set_index_parameters", JAVA_INT, ADDRESS, ADDRESS, ADDRESS); - // TODO: Requires https://github.com/facebookresearch/faiss/pull/4158 private static final MethodHandle ID_SELECTOR_BITMAP_NEW = - getMethodHandle("faiss_IDSelectorBitmap_new", JAVA_INT, ADDRESS, JAVA_LONG, ADDRESS); + getDowncallHandle("faiss_IDSelectorBitmap_new", JAVA_INT, ADDRESS, JAVA_LONG, ADDRESS); private static final MethodHandle SEARCH_PARAMETERS_NEW = - getMethodHandle("faiss_SearchParameters_new", JAVA_INT, ADDRESS, ADDRESS); + getDowncallHandle("faiss_SearchParameters_new", JAVA_INT, ADDRESS, ADDRESS); private static final MethodHandle INDEX_IS_TRAINED = - getMethodHandle("faiss_Index_is_trained", JAVA_INT, ADDRESS); + getDowncallHandle("faiss_Index_is_trained", JAVA_INT, ADDRESS); private static final MethodHandle INDEX_TRAIN = - getMethodHandle("faiss_Index_train", JAVA_INT, ADDRESS, JAVA_LONG, ADDRESS); + getDowncallHandle("faiss_Index_train", JAVA_INT, ADDRESS, JAVA_LONG, ADDRESS); private static final MethodHandle INDEX_ADD_WITH_IDS = - 
getMethodHandle("faiss_Index_add_with_ids", JAVA_INT, ADDRESS, JAVA_LONG, ADDRESS, ADDRESS); + getDowncallHandle("faiss_Index_add_with_ids", JAVA_INT, ADDRESS, JAVA_LONG, ADDRESS, ADDRESS); public static MemorySegment createIndex( String description, @@ -200,32 +231,112 @@ public static MemorySegment createIndex( } } - private static final MethodHandle INDEX_WRITE = - getMethodHandle("faiss_write_index_fname", JAVA_INT, ADDRESS, ADDRESS); + private static int writeBytes( + IndexOutput output, MemorySegment inputPointer, int itemSize, int numItems) + throws IOException { + // TODO: Can we avoid copying to heap? + byte[] bytes = + new byte[(int) (Integer.toUnsignedLong(itemSize) * Integer.toUnsignedLong(numItems))]; + inputPointer.reinterpret(bytes.length).asByteBuffer().order(ByteOrder.nativeOrder()).get(bytes); + output.writeBytes(bytes, 0, bytes.length); + return numItems; + } + + private static int readBytes( + IndexInput input, MemorySegment outputPointer, int itemSize, int numItems) + throws IOException { + // TODO: Can we avoid copying to heap? 
+ byte[] bytes = + new byte[(int) (Integer.toUnsignedLong(itemSize) * Integer.toUnsignedLong(numItems))]; + input.readBytes(bytes, 0, bytes.length); + outputPointer + .reinterpret(bytes.length) + .asByteBuffer() + .order(ByteOrder.nativeOrder()) + .put(bytes); + return numItems; + } + + private static final MethodHandle WRITE_BYTES_HANDLE; + private static final MethodHandle READ_BYTES_HANDLE; + + static { + try { + WRITE_BYTES_HANDLE = + MethodHandles.lookup() + .findStatic( + LibFaissC.class, + "writeBytes", + MethodType.methodType( + int.class, IndexOutput.class, MemorySegment.class, int.class, int.class)); + + READ_BYTES_HANDLE = + MethodHandles.lookup() + .findStatic( + LibFaissC.class, + "readBytes", + MethodType.methodType( + int.class, IndexInput.class, MemorySegment.class, int.class, int.class)); + } catch (NoSuchMethodException | IllegalAccessException e) { + throw new RuntimeException(e); + } + } + + private static final MethodHandle CUSTOM_IO_WRITER_NEW = + getDowncallHandle("faiss_CustomIOWriter_new", JAVA_INT, ADDRESS, ADDRESS); - public static void indexWrite(MemorySegment indexPointer, String fileName) { + private static final MethodHandle WRITE_INDEX_CUSTOM = + getDowncallHandle("faiss_write_index_custom", JAVA_INT, ADDRESS, ADDRESS, JAVA_INT); + + public static void indexWrite(MemorySegment indexPointer, IndexOutput output, int ioFlags) { try (Arena temp = Arena.ofConfined()) { - callAndHandleError(INDEX_WRITE, indexPointer, temp.allocateFrom(fileName)); + MethodHandle writerHandle = WRITE_BYTES_HANDLE.bindTo(output); + MemorySegment writerStub = + getUpcallStub(temp, writerHandle, JAVA_INT, ADDRESS, JAVA_INT, JAVA_INT); + + MemorySegment pointer = temp.allocate(ADDRESS); + callAndHandleError(CUSTOM_IO_WRITER_NEW, pointer, writerStub); + MemorySegment customIOWriterPointer = + pointer + .get(ADDRESS, 0) + // Ensure timely cleanup + .reinterpret(temp, LibFaissC::freeCustomIOWriter); + + callAndHandleError(WRITE_INDEX_CUSTOM, indexPointer, 
customIOWriterPointer, ioFlags); } } - private static final MethodHandle INDEX_READ = - getMethodHandle("faiss_read_index_fname", JAVA_INT, ADDRESS, JAVA_INT, ADDRESS); + private static final MethodHandle CUSTOM_IO_READER_NEW = + getDowncallHandle("faiss_CustomIOReader_new", JAVA_INT, ADDRESS, ADDRESS); - public static MemorySegment indexRead(String fileName, int ioFlags) { + private static final MethodHandle READ_INDEX_CUSTOM = + getDowncallHandle("faiss_read_index_custom", JAVA_INT, ADDRESS, JAVA_INT, ADDRESS); + + public static MemorySegment indexRead(IndexInput input, int ioFlags) { try (Arena temp = Arena.ofConfined()) { + MethodHandle readerHandle = READ_BYTES_HANDLE.bindTo(input); + MemorySegment readerStub = + getUpcallStub(temp, readerHandle, JAVA_INT, ADDRESS, JAVA_INT, JAVA_INT); + MemorySegment pointer = temp.allocate(ADDRESS); - callAndHandleError(INDEX_READ, temp.allocateFrom(fileName), ioFlags, pointer); + callAndHandleError(CUSTOM_IO_READER_NEW, pointer, readerStub); + MemorySegment customIOReaderPointer = + pointer + .get(ADDRESS, 0) + // Ensure timely cleanup + .reinterpret(temp, LibFaissC::freeCustomIOReader); + + callAndHandleError(READ_INDEX_CUSTOM, customIOReaderPointer, ioFlags, pointer); return pointer.get(ADDRESS, 0); } } private static final MethodHandle INDEX_SEARCH = - getMethodHandle( + getDowncallHandle( "faiss_Index_search", JAVA_INT, ADDRESS, JAVA_LONG, ADDRESS, JAVA_LONG, ADDRESS, ADDRESS); private static final MethodHandle INDEX_SEARCH_WITH_PARAMS = - getMethodHandle( + getDowncallHandle( "faiss_Index_search_with_params", JAVA_INT, ADDRESS, @@ -284,7 +395,6 @@ public static void indexSearch( .reinterpret(temp, LibFaissC::freeSearchParameters); // Search with runtime filters - // TODO: Requires https://github.com/facebookresearch/faiss/pull/4167 callAndHandleError( INDEX_SEARCH_WITH_PARAMS, localIndex, @@ -312,7 +422,7 @@ public static void indexSearch( } private static final MethodHandle GET_LAST_ERROR = - 
getMethodHandle("faiss_get_last_error", ADDRESS); + getDowncallHandle("faiss_get_last_error", ADDRESS); private static int callAndGetInt(MethodHandle handle, Object... args) { try { From 7b22615e5392b16693f71dc4cb13024f0af92a25 Mon Sep 17 00:00:00 2001 From: Kaival Parikh Date: Mon, 3 Mar 2025 09:19:04 +0000 Subject: [PATCH 07/17] Misc changes - Scale Faiss distances to Lucene scores - Refactor classes for Java 23 minimum version - Add javadocs - Add tests and fix failing edge-cases --- gradle/generation/extract-jdk-apis.gradle | 16 +-- .../extract-jdk-apis/ExtractJdkApis.java | 3 +- gradle/generation/regenerate.gradle | 1 - gradle/java/core-mrjar.gradle | 5 +- gradle/testing/defaults-tests.gradle | 1 + .../randomization/policies/tests.policy | 3 + lucene/sandbox/src/generated/jdk/jdk22.apijar | Bin 17070 -> 0 bytes lucene/sandbox/src/java/module-info.java | 3 +- .../codecs/faiss/FaissKnnVectorsFormat.java | 16 ++- .../faiss/FaissKnnVectorsFormatProvider.java | 91 ---------------- .../codecs/faiss/FaissKnnVectorsReader.java | 36 ++++--- .../codecs/faiss/FaissKnnVectorsWriter.java | 20 ++-- .../sandbox/codecs/faiss/LibFaissC.java | 61 ++++++++--- .../sandbox/codecs/faiss/package-info.java | 6 +- .../org.apache.lucene.codecs.KnnVectorsFormat | 2 +- .../faiss/TestFaissKnnVectorsFormat.java | 99 ++++++++++++++++++ 16 files changed, 201 insertions(+), 162 deletions(-) delete mode 100644 lucene/sandbox/src/generated/jdk/jdk22.apijar rename lucene/sandbox/src/{java22 => java}/org/apache/lucene/sandbox/codecs/faiss/FaissKnnVectorsFormat.java (82%) delete mode 100644 lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/faiss/FaissKnnVectorsFormatProvider.java rename lucene/sandbox/src/{java22 => java}/org/apache/lucene/sandbox/codecs/faiss/FaissKnnVectorsReader.java (86%) rename lucene/sandbox/src/{java22 => java}/org/apache/lucene/sandbox/codecs/faiss/FaissKnnVectorsWriter.java (96%) rename lucene/sandbox/src/{java22 => 
java}/org/apache/lucene/sandbox/codecs/faiss/LibFaissC.java (89%) create mode 100644 lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/faiss/TestFaissKnnVectorsFormat.java diff --git a/gradle/generation/extract-jdk-apis.gradle b/gradle/generation/extract-jdk-apis.gradle index 2e181811a9b2..3adde87da838 100644 --- a/gradle/generation/extract-jdk-apis.gradle +++ b/gradle/generation/extract-jdk-apis.gradle @@ -18,23 +18,9 @@ def resources = scriptResources(buildscript) configure(project(":lucene:core")) { - ext { - mrjarJavaVersions = [ 21 ] - } -} - -configure(project(":lucene:sandbox")) { - ext { - mrjarJavaVersions = [ 22 ] - } -} - -configure([ - project(":lucene:core"), - project(":lucene:sandbox") -]) { ext { apijars = layout.projectDirectory.dir("src/generated/jdk") + mrjarJavaVersions = [ 21 ] } configurations { diff --git a/gradle/generation/extract-jdk-apis/ExtractJdkApis.java b/gradle/generation/extract-jdk-apis/ExtractJdkApis.java index d2d60d2545aa..c84c8f16996d 100644 --- a/gradle/generation/extract-jdk-apis/ExtractJdkApis.java +++ b/gradle/generation/extract-jdk-apis/ExtractJdkApis.java @@ -54,8 +54,7 @@ public final class ExtractJdkApis { private static final String PATTERN_VECTOR_VM_INTERNALS = "java.base/jdk/internal/vm/vector/VectorSupport{,$Vector,$VectorMask,$VectorPayload,$VectorShuffle}"; static final Map> CLASSFILE_PATTERNS = Map.of( - 21, List.of(PATTERN_PANAMA_FOREIGN, PATTERN_VECTOR_VM_INTERNALS, PATTERN_VECTOR_INCUBATOR), - 22, List.of(PATTERN_PANAMA_FOREIGN) + 21, List.of(PATTERN_PANAMA_FOREIGN, PATTERN_VECTOR_VM_INTERNALS, PATTERN_VECTOR_INCUBATOR) ); public static void main(String... 
args) throws IOException { diff --git a/gradle/generation/regenerate.gradle b/gradle/generation/regenerate.gradle index 8edaabc77d80..d23cfd7d54f0 100644 --- a/gradle/generation/regenerate.gradle +++ b/gradle/generation/regenerate.gradle @@ -91,7 +91,6 @@ configure([ project(":lucene:queryparser"), project(":lucene:expressions"), project(":lucene:test-framework"), - project(":lucene:sandbox"), ]) { task regenerate() { description "Rerun any code or static data generation tasks." diff --git a/gradle/java/core-mrjar.gradle b/gradle/java/core-mrjar.gradle index cc3ac76fba88..b8b116800e01 100644 --- a/gradle/java/core-mrjar.gradle +++ b/gradle/java/core-mrjar.gradle @@ -17,10 +17,7 @@ // Produce an MR-JAR with Java 19+ foreign and vector implementations -configure([ - project(":lucene:core"), - project(":lucene:sandbox") -]) { +configure(project(":lucene:core")) { plugins.withType(JavaPlugin) { mrjarJavaVersions.each { jdkVersion -> sourceSets.create("main${jdkVersion}") { diff --git a/gradle/testing/defaults-tests.gradle b/gradle/testing/defaults-tests.gradle index 14e64647d667..1af4b88c4714 100644 --- a/gradle/testing/defaults-tests.gradle +++ b/gradle/testing/defaults-tests.gradle @@ -142,6 +142,7 @@ allprojects { ':lucene:core', ':lucene:codecs', ":lucene:distribution.tests", + ':lucene:sandbox', ":lucene:test-framework" ] ? 
'ALL-UNNAMED' : 'org.apache.lucene.core') diff --git a/gradle/testing/randomization/policies/tests.policy b/gradle/testing/randomization/policies/tests.policy index f8e09ba03661..827d22e92353 100644 --- a/gradle/testing/randomization/policies/tests.policy +++ b/gradle/testing/randomization/policies/tests.policy @@ -80,6 +80,9 @@ grant { permission java.io.FilePermission "${hunspell.corpora}${/}-", "read"; permission java.io.FilePermission "${hunspell.dictionaries}", "read"; permission java.io.FilePermission "${hunspell.dictionaries}${/}-", "read"; + + // allow loading specific library for the Faiss codec + permission java.lang.RuntimePermission "loadLibrary.faiss_c"; }; // Permissions for jacoco code coverage diff --git a/lucene/sandbox/src/generated/jdk/jdk22.apijar b/lucene/sandbox/src/generated/jdk/jdk22.apijar deleted file mode 100644 index ae4b063641ba5a89f9b74a85848424dd9eba73b3..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 17070 zcmbVzWmFx=(k&9)U4y&3ySux)yF-Encb5=?yNBQsIJi531b4TfK_ATh-ZvSB$=vsv zwa!|HA5~}d?%Gwot6D)C^c5-)BqSsd5D=ltL!dK*JKb9ys-CsRvv zJ90#a-#b2}Gk7mrYEA;4w7QhPihavd=@CO{7;e=S2*GWI@-}yf5FWUatyNt;M zLhOJ6FQ;r<_g}X{jzozv%6Z2S&VoI;_L-!X)d04|TG5xI8_D#DVfGH6ca5o=`j{?Y zW^!AbAW^iu;kGnh&OzCR>(251d%~5_*Q}oQ%7L#2K>#g)AK-_(;VAzpCKm3`);%c* zH5gV-zWs%XOU8idp?7v$bhd5KpW_(u&CrD<2N|gbv(|je-c^`b=!+L&we(A?5kWDr zr&Oraq%^+W@OtN#VLWy&-5`%i?v-q;8j0W)bUMb7%mOKxA|2-CHvmaZ7Z+Hb!{Ceu zR%l_dCXP<5#Bv%%o2*$M)O%2RCs!UT_|Op^xF71H-E@3dhqr{%r0;{w9vmh@PaQ2C z_nrE6OwMM}k7D#jH_vZD7uB0op?az|hxj&IEUIc<9%PZ2Qm{!bWApHk%WTtU zaZ~b%LJLXrn0fh|7*n6BTy_IoIMU2hFlpL}7&LP96^!FBH9)gw{ai74XB`|-(&+2} z-b(e7-Z6c);RrhSh80Wx;x&uew;((hLaUiT3T2%Y5pW*6t7J7gXUm!yOM|XA(cXuP z0>1D~74dt(yymWS55Nl25RmX)9mN;GKtS|hKtKxrc%ojve4?C8?F^rtBv-|LIS@fa z?{22f0wTJi1`=>&4y4fxGM&T*T#1%TSYG119k$8bu(nMA!f|dSD`*%YY{;&Om}MHLog^}Sn+?f|5*-O|t|bSBh9ajYnx1de$SHksz2=zl z^*dsjX5AnXc;g7?Tt2U$m$1+$eh)&H!`kE%2`KA)(y7cm*@n@SPp*()i5$2pSA{=Q 
zXT>8vQ^BiwG<4*mN(UO(F!KN`gUz=cTK$}Tv?E!2xO(9)##~AomRnx&_2WNbu{k8v z!|{lP9LVpmpn4GtF;_cd7fX9P5mRSlCrbwxd#7i>lq>SegD_(HghS>~K)%Zii|9lO z(Z-cwAS3As^*88h;7DqI7nIw2=0y@2eNPJE>&HK!5iRl#(J)Imv&B9$rv6~&vX|Ts zS`Kog$wk06COmo}Is7EL%3gTIg03452DjUrRuwsrfZLEwOLudd$Ptun;TF7mI)m4J`si?-Ww!z1(tdD{WXG)PNXFc+z-5V3@4yZh!)`Wd~tk4bQ$z_ zh!VeusJN58tHaA|9X~ABA%Ge*N`S5a0J*1QQDirr4%%lRb2T>Fm7ll*0*!W+@a=Oc zS((_>6Oq z<5M!qL1aUk)yk_gX4E({j#|6q0P~FkeQr}l>eOrM!)pj@kEwMo)f%fy!6F4K+Kd(# z4rq_~&@O|bENJk6MfGy2`T*LJi{g?jNmrOTpyDvMiq|f{lY|TX(_MBIwEIY%4g!UO zHm`A(o{%6@y`X1!M56Auv4!^{5;B%{)~3(nsaAQ|?lEO%*a{0x)a2b_gi{f|+UZ9y z2t-9gl%Nx>?FyRwo)IYf_4BwFyK1zKC){FA2%i9ci4kt$7?F58u>M!()8V&ELn9gb z=^4Q7KpBu89LdaB7LvIJ7BnSDU~xH6-_ap~TlL3tO(Kh}0Zo3bs+^tpd#{EkeD00PSNq~hjvODGGqFpwsak70+mXRqku^|4Y$dn+Ysf1=OFdG zw|wA*z+jF5yzSd%-en^wt-8avA|I%GrIBt(${A#Vt;hU+SPS8BRRND(LF>-G@>$$J zaCjRrsx9qX`LzwM-uhMs${k~iw_)_;^+hEBR3k(jhJ;_YZf=;3$E^8Nm3xXWyYFL8 zgn+c-zhK{bJUbSzeiy|rS6%-(J0$WBPvrcJlB1L)+x{e*3p6Th#j zHTaLwUQ)3r)z)Y$HxWlEA8Dak!S^uylq_MNoA`_i3LCVPhbs(wW81byWt%@vbVU$a z;6+S&5qIiZvgiT|pP~0QPsU_4NU=kGG27B{9jk6Kj8klT`1VWB1>6S zym9*NwK~>(jgOe-1{9S+IN>E9qX`O#=cRgg)%j1T#4m8%6TER3TxT&s7od_G!G`3A zPD9M74+T!B6jUq9Q`7ZY=}xYN7o($20#ZlXz;8AN={tg1rRw1)JY?HE&)-_D(nZpC ziv1*~Z0yO9<;WZI#6RcUh+xsoRV~jienk{+A_88>RNZSR;?Hb8F;8KgzZCjGPpFT{ zgduds7}1l)IOy>3Q#g?!9%UlFvNc*~!pv^Syml6($?}&DdvP5}Z9|A|#pijF|aOPldKQW6gCiiIE{Vj*MKN z2c5L~f~hS1=>7awb+vJ52}9UYFT>IZeduTfgEUbnnsKV7dFv4g6#6;D1yy#Xr;HwA54Wg`qNE6Hq>H@39;^pL>WdB z?w(>@N4Qc#U#kPgO7p8>krLD`8T0dbuNlt3)2AWsr{X0J3_xnQQnB!Mth7Gjbd!FpV!##2b(vb4hYKL8I-eu`-Zb{vV4)|Wr)Maa(M zuxmO#4%De$%=xCkPW;cRNY&>N(wrSEO%4l^u4t+o!mNG@Dq zyeks%)X*Q))^0i~c9p`N0JO<(WBm1s0`fIdS-YvUW6MaBuDu#?N{kYvj#lAB#@H zoS3-B`s{o|q5pA@t1}IVrbTM?o4pCYhA9B%g!BEU>D;;!G0bS+qa5QTz_zA$=Sn<} zT@xW2C>dph;xu4vI-7a@fng_nq+Q<>E-HrrP7oRz77+CCz%NN;@7?I2= zI#YEGv;=LspFvHCcLqhocN-!;Larv5HCUsuHM~f$ovBWr_}tm^w;Y;X)E*v0lco$Y ztfje>_;|s(=1}R@sx`Fh`b8Zjg?or%xOR16<`VW;Uo0`9XU%9%sFN!E) zMy};#O%WtW4iOO;^5VO2BJlAR#C|O7eZe5~aoK9g3Z27_g?!{!OQI-COKlOpSHQOl 
z_vs5C^5F7X^MA-^3@v_U9vvPTIlMbP()R@wz`ma{7)91LlZ^OwZZhoHs9B$|gdS}bE#UHUhyMFIJe5d$c`Ft6ky!|M@p-_E5jqPPDjT> zV{DOrh1f(}26kv|Q=Sw30B=KOXrkUZ#njtU0vdZx*g_u4AW30Cjqy)=iXK|fC=aY0 zh-{==HG}A6SDJ8SH_6}-!s^|QG$Z>I`_lw$46LC;dQ|AFv)Zu5X{cW?O4W1^J+ULZ zQMiKb9lj1QAaovul7m3lF4e^6C);yrC~|C#^+6OyT&mga!p0Z2CYu^DGA#i}xzO~L zm5efs@4S`z2{{?6X^ux zDb=N;qb@ViWM{T2HD2JT!P1pH=m?5W0q-i2GDAc2)g)OH-5t*EO@TdKMEU_1SvW`1 zI&Lo-Gc{i&(b_;!zFhOXS+T0Lj`bYcC-HdA-izENSK#r!LZ<)Dk^pn3tnhT2&=^=9?aBS){pr zWE<KZ|y4R^|9O@=-nNPcDNhT|%Iq-P6Z>X(4$T@BJa{zmjA2gK_ zkJKU|AGSf{BS&Dt!G|Pl$bx?hj<L}`mJ!>FL)*arV_5Wuj>^=;|W2#xBh1FyFS$G~bmGPQoywO3(=AY9{i zEf=p01+q|vtJcRdJ+!hUGK)&ZihXyg$nK&SBk2O+M|C)laGbir`UaY(R4?TZC#n`q zvm)Q-$k4@XK;yCQmy2$L_OWj0o)G!14YT-fb!*I?2J50I6tl^e5dpV$9#iMix`HaH zpv9hLbaVtI-eU3wnOLV?s6n=PGX0?p;%KvgdXS#>&quSM#~30&({2Ui)l%Ol1_JxP@8GH+3MoPLQ8>E(@31-RgybN z1x*)j1>SrnKB^Lw-?oQMl8aYAM)C0$5L9jep7a4!{P@ofK%@(%J?F$7vYY<7}bm|}@f#4qqK!CO} z%nbPNwL_OvaByD^*T6-J+sslH6I zQW{tYsh^N4G_|aL#vGI(NBl)@&3p|0wvF+}&kq4xJ4mCg-p+np z#^e89{;*%+4qNI(NbJNHUq3Ir>5)2SUd6l(ZsYo{T!n@AK9tZHg(tiL?4TE^o{-PK z0o0Lvm9DE0F;j3pssZAJvug|KlJGwIo(R(v`=fUNUU&K3O|8V6tQVe;^)V0_W_!p7vUCV zNh-tuz#OPXA5kR~7`ksyuQpG9p5Fr-1VgRlB;&ldvMf3KR#!%qu;^$!SD#)Q9K}Bh z(`*Dx6zMw_B03y|5ejMkK!doAM7Q^YRJF(1I`6)dv#_i`EeRblaPfjjkb-^E3MI$F z>s&~#5ibyPKU0vwenNQldc~kLgkzTr6M|o)mkg5x6gQ+LLjGM(_TmHhQ$==o%1UzZ zsP5}Mrt*K*XkSj_3Wg>omUiYZUP+?jnxF2U!4IIMmQX2)hXjhj3|TY`uFyCX0{MLY zbE&Zg+gxqZ zxLB}k`SY#M@h+Txbarb?*gxXu_XQRJQ^TBJT7#H}Z`hx;MT2f5J}B6B35ui^%;Lc- z9Rukx_+hf}btL212xf#a)iyoVns=xBOFU1}phlqWAyf!pLT|71fE@gGjYI_Zx_WZ| z;HZ>=PSxELGt8qig%$={Qm&B?9Th9`oRf(HPOW-4Aj9HVcsfE_`(V0LLxZ8mV6%;y zwxK;MmX5rNj8lW5ST~+A;3rSYW<|v^H0YoV=FkS`N`aQcM?feZa?TV;_qeH+_&ACX zD;y~|`ux{v5@T=}Pde+8dX(j!5he<}Va&)=6}}ww61sp5r%a`Z1Q&FifT<5Tr5(b# z0*MGs=hiD(*3OiwLnztbSkb(4=s@+!q~-=7$#y+$N|ge#~lPCmJ0Ml*LTjHXGd6fK-W z3hJW{x>Pzor{$J-|A{{N0p5y%D|17h9YMM%l|f3Lfsj*EqmGh0l1j>1fV35)P`;&b zLtZq2N(pxd>FZsxMp7LmdDNK2YuFOasU5FeZs-*)I+)@SrS~;9Uu5=z`YDciZ@=K` 
z$6>y`ch7mSzL<2UCCQvc5S@GY+RJ;FPj1{sU3b^l`?(0$7nt~V>d?w5@d$J%X;&9prlB^&RlC8r1>Aite(lNp4n_N#O@ogO1LkKSLg;D0Cx_2Fo;F#+bl;&z7=P*aD(iF+X0ryPSI3~ z$PUu3!BBOiO2S2fbfK0o4;V-A6Zlo#t~a6Lnvp=aU{6spKK{_&`xqh4Fu!9UvVVvY zM^{rjW7C%@Na?9vgjzA=L3GXTEo!xmlPqkD0v3VL zeTwVMwxe<;8RVPE?>KI}zQ^S{B(9syyKqs6+!L z&Ws`trKH%Ry|PQTwTUmN`?GF8aqCB)2%Z;r4W-C*w}4~!_`Zcr$I5;f`Qnky2oOs3 zD(_82d7<5Or`RNtU^@WW5Lgm1`HWtg9BNPk6q~uPC$%%YN4<;)e+I0qTcgV>5{LNS ztZPu6#qRN2og2yiHtP?0?>}yOU`w(nB9AmS@LL*tS+r7iadI_w`3HE0$;fcezM|+wg8Rnv0g@{q>BicXI!ir|f!|WBy|ai=>ohfI=4M{nH(OfN z*8`rNCC3G~>7fA-je3Qm9KMNTjhEb%3fXWZV?_j{TOalSyQ4KraaC&B*%fzaSA21FTQx(^|S3T)&LdC>IQ29 zpQXwLW>A!_k#@?mk64lePlw&%BBd04Q7|kz*{#TW1Y57}@H%%L4q=T%pW>Uq)LE_X ztk)$ElsDy1lf)=pz(iu9Z3na#LCrMzr1q=VdwxQoIHYz)FbuXqbEMb|(o^H*S0BTV zRBtP!i1v$X`wrgzp%hBt3({jhZaU!qXVX1xjqGh??Cq^x9iG)U`O3O669TBMoz*HA zow{wh{K;tF;P_a3#3P~NF~dZM;;;B>>i64Sf`?=zudMm5KyMX$w6kF>SVcb6I3M-D ze=|DIe0Om-s19V&$r&)onCYnR#QMY7Rv|KxKb2gV7POpvX9~1u*`s?0p%;@2!3}fj z9oj9OYBq|N<2O}!*&$&YMvj{|2R+gVRInrJG}&fe?|cvzlA;%#(Iji4bo!#;K8dOz zTwto8zXh-C84t7|2{F-y**44>e(U!&e%_8LDN4GAiNYV48|?#(m1vwz1U|g^eHLN_lGUnJL!cOC;z%BYZ|sWSM$-HB?V&QA zXJVF9IlD@E9$Y~EFvww3p=SU;iiMfdDWvS7a%*FEMudTJHMp_l<=Bw}JqX@R5mX`Y#71W+uam}nXh^Oz8=9{xGp2$4 z=kvj_(sc?3!d~&iPzNK=Nm=$X`m7=CauYV=VzrmM8$Kn(*-r=Vf#Hu#0A6mwn zhX5a?pay`3<8?rFt9rbOp)6I_$v*Fw-=|Ec`^0Nxu3;ONiMGsroc_giS2f6+BdZCI zCgXzxZ2f?0gYdYn)+t?c4cj)Y^8Vc)=SgPhKKlIeJbn0WpW)?hp{m_uf8ifuxb?Bh z!{iaR#O{!n@{a=6c!$Z<{*{sj64fU!9Kt~x1sAta3Rgid${%7m8&M9DMU-}U_4A1P zpb~9m~Rx8x2jGmCFnh%%7O$A`)-6hP4thORRS;3C1xo+=i?@|?o+z+jO@SM|N zu=~gOr3q_Lvf=3qCKjAb%(s1qRwFR>^=1! 
zX7i&{$I*FK--pvJeIS>bSBL2u=4$=5Ex8lf__HH%>fb-lSJ-N7`PAnMLax&b4fTY1 z3mm!nzZYDEY2JX?;@u2wzk5K-WJehEsbBey&Eo$vPowiPn#BYRJ)USrcgYBN4a8Y8 zTQ$a?UD@Jm%$;AheIbR*!7Mb3OkSuC?@`rHN`Hv3xGzWFlM7dp+ZO-^t>0kx-J032 z-Ct#-v&c(F&wK(Mb-Hn^fDW~;r^NeITE3(MHwn^RqXTyYGv7i&P{ko)#_7j!5ZOKp zhK>mbaEKC7hDJLoo-m^Gx4}nHFW?zhIQv5NIYjk6G9(QgVWAu4)Eg28JZDG?BxDbs zK%yCrsT&nmEJ1&ybKeHDjW_7wylTT2RaR>6R?<}Ijf*ZtSm)e}%+QcE@Prs!ScLj3 z!JHgp1PUN1D^mvNlDu>czDjtQt8vuRddn~Ad)F$jbi0b)E|*Fx(UH6_P#m`gK}hgi zxj#QGx80qy{LxQKY$+4B`(g6}-abksS&MXnDO{~Qt|nGuwdZV*dFZ=X+K+X;vq5ck z8D3S--D61{^_uF&W-EuQZV^3?b`m)e@|HI3$*5hiawYWC-BH#kU6&cdlMD%o?2*Dl)?l>&toI4`0jB zFTLwhhsNN`2pmjCilxV`E2ix#y9Qd1`Ml3$Dq>~OHZ>zN6d3w1|Y5AlXhyP^Kk2c9 zy!FEe{;qx{`EBM(^PB!Lb*T%cE8I<&y#b{!D8vApA*koaD}`?}97-2((fy6lDEIgN zP%R+{4k1`a6Fw5!S&RD?=+0fIUG-rYq4$i5=CjE)&sD67aP=3VwkzH%)R zEh0Q(Ei$)YCn;T=tCgYZp_$0$p>2eGro0IvSWHeT{U~Uu3;At+G!JOo{il-ANLVd5 z^iOQyIPJVy^C*;y{!c@YNBSMi}2U?MLKVdl_Bdr~;7Go)`y-Bs8} zxv3Dhsbf1~cm{hIhytSLeC!hERbsVma1>($8-=if1LmhG&_A{3%a z7<%LxR)a>MxNuzeP*_`!nT2xRj{+su-@Tce={R?37B}@hScY(6S$E){D4IwaBTk5q0OayD33M3rp^X2N*g8UFr-e-y6G$F1j*iAnZ z|HUm~Sp&i;O`6~|fYRmLMjZVer_yzvXjZR+;;)#h;i5*NMA4Q;Gn9B;Y!U%{IhjxJ zHiCE+zj-8ksNW{nmxrsK!RzVrT$1{jF!&#Af%9}WHg(5CX$DGQT!`()qA*{fAYZD2 z)6^+XJ|P=~uq=!vFAW-uSZsIgKJxaVQKm3c{|Fx_jl-xY`b=Gy*_Ck$ysk4Ud+gBF zd1&ss3vbZM-GMVs2iUl8wEwU(lwS%=G5ANN512LVx#EK+bKrXP*f5sqb-Vm5TuE&i zhO&PLVrB)_R23F#9;VVM3v3v>FrTYSM0I-Uf#+x59ZaLyqT0@(B?E~!o)k>7`6by= zV58fu?6O8QbR@#EtxyN@UiTG~TkxHt4E_u@!*ds%E9OOIG|~!pM};C4YpM`JL*IOx z$d~$m)Y&(hawlM^YJLZRxL`XqfN2+yzH+aSS#WKXb}#AF^|Wn1Bb9ca{{&0v*~aeU zkq+V*3~7{gBh9bL)f3yzC6Fe(I0F*?CAkWC{X@;-8EXWszVqxwZQw@(U-r-=uz5by z=vBWA(Gs+5seTV+sJuABIvGcLH;LWzxT`;uvD99@NS8-YPk!6r`forLwlI7?;qxb` z`8r7v?xMui6s2@yQ4NCZMu;*J^9T$EpFXC!jU8Ljy#>M_@X*3$?6Gj2u(6j;8BE({LI^m%oY;Nc^HAMG_ zpLsXZoaT#ab00D7yL!Y_K7TJR3T(-PPdUdD$rz>>)9PB)#CPD*eIYxGH$4bNVv_MT zQIO*rHc5XFS>@p-xzF`#PAHV~r~KtQ_w|`M(l|mHq=&voOLCG(Q)#p8_xx;94EnvGx<5 zykod*AiCXVS)?{WNA;aoa(xF%n;aM0nt57`C5cjbu_EA_MuVuA#y5FB`2Frrq3R%$ 
zQ0)4zsM7owOe|vWYGm{GV0|2)Q$P$F1<&##TbA#e%FK;U?8;mr5!Bd zpxl!F|AuQJm_H)XasfH--R97c?ale!2~<|7cko*DML}^e*>|hK>c&O%-2^iiPt83| zT~8I;1+9f902)JZS7`ZH?iNC{D=a2oG71f&KKC^G5;~IlATwCXPS74h^Vv$1FWAti z!DE%3u^-q)0ZhU!JmF4@u0M6gWd<6DlEjH;^2#IJR`Ph3I8iZ)?2tY0swU?_@+Irf zn#euJ2?o}Mn9?^6F(e?Hp;VjuZG%ezctZ6m>{FjHzL9W$W)<9u$V69tJ;a4>^yA@u z<}Q3;%w4vKhiix|lLdLQ;*uBApmNxJi_Hefc!1coY zKKm1_qOEVIC~;wl10i#|%=+A0d6nFw$Nm^E=yv|P;q?eE@Naqe<=((wd05QG-thTk z!N&w?`9DS@LzNKmdEWA`QB&GAA*y$QrD3u{l93&XV^wi{Oxwv?scd{wTO;f9!x_a& zKV+PpHE%6)+v@9OW&lrBjZ{r;Ak4?yu?HyT>#zjLe>}UEr)>5~8N#r@#WvTv={iD0Zi()qqvAVVIR)0D7`XCFr~;gzS*xK#6n<_|v` z%&{rjY%KUy(!@P(Lo<^H!>D0>9F&^N%Y>YgsHKs>)VqgBc~|YF7;icr{Y(C$VSDl` z5LINl%Y2(v70T=w5(E&WG52Xif6fz7uatb4Y~C5n@uq#=vv`U>jJni=a&_e2(>$P}$Gr-fc8mGE1wK*9^Nfx=)z=ipK&Z#GNmrh;vCxjkUF zN(Pc!CX-0&kBI@B)jo$_B%aemU7J?#f9Q^R%*U`V(79t#C7cB9Z2O3sLg-~R)KCg; z`SKG=y{6@cGopK7=Gl;n=>4))w-#NX;<)n4c}%o@W7DWV4ROBp6YS_3p0_;QVZU~7 z?q#a)a)qr_e{;EEGc$M&EHz{Rp%lM5(NJs$wm_Pgm?b)AzGoTed#VL{3Ry#COKfea z)7cE3rT16OD0VSaAyx7-xaV5gawitJj3lunbpl_GUdLWLcYXOoRvcs>R&aiVbmzB_ z{x|tm#@^2Sna=fRe$6A$Yw%EKp^B*91z8BiamfV-bUk94jXw9EdiZ~c?LrVkLpOoJYVLj9> zC72R@&#g5Kir~EDgK1GyI$X#SQ9zaq`Aggm__(oIg3mlgvwDZK4C4$P(}D?4tx9{7 zcbtNmTRCkn^dz-4&aIe7Ze4?QVnc}5RGRSSG$11}h`Vv%8R;;>vrCq`F3drFwJ#b> zIAX{U>&#zKb~@Z*MQDCU2Q#vY%ivc^j|n!0CdEjj2U?&h{B02! 
zsxYk~Tpznd8{RGxo6uu|Eh4oMU8>K$lIwa*up}22v`Xc#R0j$^_Ys7UJ|SWUEkigh)4%*z7aWmo5I z_$kHQxwj7a6YZExC^h4|r)-Uk!5x*r*qc}bcO$umP~G_q2aKi=?e4QTR_e|fHgMPh z=-9AcQ?PI_usimGu>Ba)yz^=B=0)4*{VlSjwBx0TyKy5~LZ5sPHFL}2@miVo)b(J| zD!;$t)ESc^zji(IOT-_GeNBfo(n%V)f$d{vFj^&cGTQU%ru6dSeSiJ3CVwXL>P9 z8`H-!`02~DNR3jlm4BRF&B%wy&!-9}LKPI$s{rHajv*xp3f;*^BN9Wos-c~$c3NoO zw^hA`AiF2x|3j5-q)v3ZjP+^yW}flh)6DGh{`T%12Wa3pt^L;gI*9Atjoex!G@xbd z+ZNv39^Qliq|N(8hGn@%JSs_?5+c-uY7`2+qX=mv4aT)-OzuL7xmJalDMgS{vU6Hh za~Y>`?BxQeGSWO-g?t$&GDTtiH-oI$Q@6^=!B8MOgP{JOZ0Q=l6UJ7<;6rKnw7&u# zt+EAF*Qn`_+f7NUUu<4Cb8{l^o zko)AMG9GY>9oA&;fVmF_CH&wri=@wrEI2tRdbB$zrOCFZX)%wdH0Dw;#LK45ghVGV zZhJWWMRC_oWSfhLz=~lmXaj5FF{U|m89m3L+Q?uBFX-M#5I6@?74J&SyH~t}ZQL4X zFxgJ?=oJdO-jfObN)+h_Mu{2V*tZ(P7E{>lfKZOl0ChROmt_m=A99Rbj^u&(29I#! zRO3~_$|wJ!GvZWhY%^B5#n1s9MNeWY_zcXxmS7sp^g{40EE=mmWS*H=Qw_#zEmoGj zKI~agT@eYvHB>|0y&6ayZ1c%w8+a$O!PyxYxc4|B4(63(BRL<1Z{SS~_9jwp{^;MY zast>=9`#%OKc-cIK~RDId=vTW+qS3ah`+A?w-=HM(tox6>szg-*KvO?rN>|Mg6$XH z%00LF>uaH>ociZle9XUpxB0JczMeby^%2Yy2mHCJ9-sgH-N8RUL3(ca*C!HBk7)i} zT#vx~-SUf1YMvYYbrS!t__;lL_$Q+;zqNR7``0Pwr?%*y%j8l2`@8MmO!hxF{p-~2 z)0p(1%LM-4O#fzR`nj85hphg(JM@VE?B?Zx-siS|oj7_b1O8lv=>KZ_#i6a|9)6v> z`RlL0cpMu0`~LrKj_JAWU#AtGCU^c^N?0%1{+BtP=jMOywLgtK{JA`EUpD`n0f^_u zf9)y%^;}}%y=eTOoA=L6|Jq*stLbR`7ft_ji~70gUpr}kC6-0PmrVb2xAM8=Usd_P zT9zYz$?`7}!LuUfx#eF4;U`u9&!t2DFP8sN4&+Y$upZY5b4ov@U|AnjZa|gec^?%(Wah88~@X{sixrbltsK3S;A>03W c_%AOD3esSYqrX5v*pGj~k9&>G`Sk7o0Gg2#RsaA1 diff --git a/lucene/sandbox/src/java/module-info.java b/lucene/sandbox/src/java/module-info.java index e7d675c36af8..ee9be3227de2 100644 --- a/lucene/sandbox/src/java/module-info.java +++ b/lucene/sandbox/src/java/module-info.java @@ -20,7 +20,6 @@ requires org.apache.lucene.core; requires org.apache.lucene.queries; requires org.apache.lucene.facet; - requires java.logging; exports org.apache.lucene.payloads; exports org.apache.lucene.sandbox.codecs.faiss; @@ -42,5 +41,5 @@ provides org.apache.lucene.codecs.PostingsFormat with 
org.apache.lucene.sandbox.codecs.idversion.IDVersionPostingsFormat; provides org.apache.lucene.codecs.KnnVectorsFormat with - org.apache.lucene.sandbox.codecs.faiss.FaissKnnVectorsFormatProvider; + org.apache.lucene.sandbox.codecs.faiss.FaissKnnVectorsFormat; } diff --git a/lucene/sandbox/src/java22/org/apache/lucene/sandbox/codecs/faiss/FaissKnnVectorsFormat.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/faiss/FaissKnnVectorsFormat.java similarity index 82% rename from lucene/sandbox/src/java22/org/apache/lucene/sandbox/codecs/faiss/FaissKnnVectorsFormat.java rename to lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/faiss/FaissKnnVectorsFormat.java index 1e7fabcd05bc..6796e0a9ecd8 100644 --- a/lucene/sandbox/src/java22/org/apache/lucene/sandbox/codecs/faiss/FaissKnnVectorsFormat.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/faiss/FaissKnnVectorsFormat.java @@ -27,6 +27,21 @@ import org.apache.lucene.index.SegmentReadState; import org.apache.lucene.index.SegmentWriteState; +/** + * A format which uses Faiss to create and + * search vector indexes, using {@link LibFaissC} to interact with the native library. + * + *

<p>A separate Faiss index is created per-segment, and uses the following files: + * + * <ul>
+ *   <li>.faissm (metadata file): stores field number, offset and length of actual + * Faiss index in data file.
+ *   <li>.faissd (data file): stores concatenated Faiss indexes for all fields.
+ *   <li>All files required by {@link Lucene99FlatVectorsFormat} for storing raw vectors.
+ * </ul>
+ * + * @lucene.experimental + */ public final class FaissKnnVectorsFormat extends KnnVectorsFormat { public static final String NAME = FaissKnnVectorsFormat.class.getSimpleName(); static final int VERSION_START = 0; @@ -40,7 +55,6 @@ public final class FaissKnnVectorsFormat extends KnnVectorsFormat { private final String indexParams; private final FlatVectorsFormat rawVectorsFormat; - @SuppressWarnings("unused") public FaissKnnVectorsFormat() { this("IDMap,HNSW32", "efConstruction=200"); } diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/faiss/FaissKnnVectorsFormatProvider.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/faiss/FaissKnnVectorsFormatProvider.java deleted file mode 100644 index 52729ac1ba9d..000000000000 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/faiss/FaissKnnVectorsFormatProvider.java +++ /dev/null @@ -1,91 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.lucene.sandbox.codecs.faiss; - -import java.io.IOException; -import java.lang.invoke.MethodHandles; -import java.lang.invoke.MethodType; -import java.util.Arrays; -import java.util.logging.Logger; -import org.apache.lucene.codecs.KnnVectorsFormat; -import org.apache.lucene.codecs.KnnVectorsReader; -import org.apache.lucene.codecs.KnnVectorsWriter; -import org.apache.lucene.index.SegmentReadState; -import org.apache.lucene.index.SegmentWriteState; - -/** - * Wraps Faiss to create and search vector - * indexes. This class is mainly for backwards compatibility with older versions of Java (<22), - * use underlying format directly after upgrade. - * - * @lucene.experimental - */ -public class FaissKnnVectorsFormatProvider extends KnnVectorsFormat { - private final KnnVectorsFormat delegate; - - public FaissKnnVectorsFormatProvider() { - this(new Object[0]); - } - - public FaissKnnVectorsFormatProvider(Object... args) { - super(FaissKnnVectorsFormatProvider.class.getSimpleName()); - - KnnVectorsFormat delegate; - try { - Class cls = - MethodHandles.lookup() - .findClass("org.apache.lucene.sandbox.codecs.faiss.FaissKnnVectorsFormat"); - - MethodType methodType = - MethodType.methodType( - void.class, Arrays.stream(args).map(Object::getClass).toArray(Class[]::new)); - - delegate = - (KnnVectorsFormat) - MethodHandles.lookup().findConstructor(cls, methodType).invokeWithArguments(args); - - } catch ( - @SuppressWarnings("unused") - ClassNotFoundException e) { - - delegate = null; - Logger.getLogger(getClass().getName()) - .warning("FaissKnnVectorsFormat class missing, this object is unusable!"); - - } catch (NoSuchMethodException | IllegalAccessException e) { - throw new LinkageError("FaissKnnVectorsFormat is missing correctly typed constructor", e); - } catch (Throwable t) { - throw new RuntimeException(t); - } - this.delegate = delegate; - } - - @Override - public KnnVectorsWriter fieldsWriter(SegmentWriteState state) throws IOException { - return 
delegate.fieldsWriter(state); - } - - @Override - public KnnVectorsReader fieldsReader(SegmentReadState state) throws IOException { - return delegate.fieldsReader(state); - } - - @Override - public int getMaxDimensions(String fieldName) { - return delegate.getMaxDimensions(fieldName); - } -} diff --git a/lucene/sandbox/src/java22/org/apache/lucene/sandbox/codecs/faiss/FaissKnnVectorsReader.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/faiss/FaissKnnVectorsReader.java similarity index 86% rename from lucene/sandbox/src/java22/org/apache/lucene/sandbox/codecs/faiss/FaissKnnVectorsReader.java rename to lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/faiss/FaissKnnVectorsReader.java index d78514d99a5e..d49c941cd4c1 100644 --- a/lucene/sandbox/src/java22/org/apache/lucene/sandbox/codecs/faiss/FaissKnnVectorsReader.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/faiss/FaissKnnVectorsReader.java @@ -38,6 +38,7 @@ import org.apache.lucene.index.FloatVectorValues; import org.apache.lucene.index.IndexFileNames; import org.apache.lucene.index.SegmentReadState; +import org.apache.lucene.index.VectorSimilarityFunction; import org.apache.lucene.search.KnnCollector; import org.apache.lucene.store.IOContext; import org.apache.lucene.store.IndexInput; @@ -45,17 +46,24 @@ import org.apache.lucene.util.Bits; import org.apache.lucene.util.IOUtils; +/** + * Read per-segment Faiss indexes and associated metadata. 
+ * + * @lucene.experimental + */ public final class FaissKnnVectorsReader extends KnnVectorsReader { private final FlatVectorsReader rawVectorsReader; private final IndexInput meta, data; - private final Map indexMap; + private final Map indexMap; private final Arena arena; + private boolean closed; public FaissKnnVectorsReader(SegmentReadState state, FlatVectorsReader rawVectorsReader) throws IOException { this.rawVectorsReader = rawVectorsReader; this.indexMap = new HashMap<>(); - this.arena = Arena.ofConfined(); + this.arena = Arena.ofShared(); + this.closed = false; boolean failure = true; try { @@ -76,7 +84,7 @@ public FaissKnnVectorsReader(SegmentReadState state, FlatVectorsReader rawVector VERSION_CURRENT, state.context.withReadAdvice(ReadAdvice.RANDOM)); - Map.Entry entry; + Map.Entry entry; while ((entry = parseNextField(state)) != null) { this.indexMap.put(entry.getKey(), entry.getValue()); } @@ -107,8 +115,7 @@ private IndexInput openInput( return input; } - private Map.Entry parseNextField(SegmentReadState state) - throws IOException { + private Map.Entry parseNextField(SegmentReadState state) throws IOException { int fieldNumber = meta.readInt(); if (fieldNumber == -1) { return null; @@ -131,7 +138,8 @@ private Map.Entry parseNextField(SegmentReadState state) // Ensure timely cleanup .reinterpret(arena, LibFaissC::freeIndex); - return Map.entry(fieldInfo.name, indexPointer); + return Map.entry( + fieldInfo.name, new IndexEntry(indexPointer, fieldInfo.getVectorSimilarityFunction())); } @Override @@ -155,9 +163,9 @@ public ByteVectorValues getByteVectorValues(String field) { @Override public void search(String field, float[] vector, KnnCollector knnCollector, Bits acceptDocs) { - MemorySegment entry = indexMap.get(field); + IndexEntry entry = indexMap.get(field); if (entry != null) { - indexSearch(entry, vector, knnCollector, acceptDocs); + indexSearch(entry.indexPointer, entry.function, vector, knnCollector, acceptDocs); } } @@ -170,13 +178,11 @@ 
public void search(String field, byte[] vector, KnnCollector knnCollector, Bits @Override public void close() throws IOException { - rawVectorsReader.close(); - arena.close(); - if (meta != null) { - meta.close(); - } - if (data != null) { - data.close(); + if (closed == false) { + IOUtils.close(rawVectorsReader, arena::close, meta, data); + closed = true; } } + + private record IndexEntry(MemorySegment indexPointer, VectorSimilarityFunction function) {} } diff --git a/lucene/sandbox/src/java22/org/apache/lucene/sandbox/codecs/faiss/FaissKnnVectorsWriter.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/faiss/FaissKnnVectorsWriter.java similarity index 96% rename from lucene/sandbox/src/java22/org/apache/lucene/sandbox/codecs/faiss/FaissKnnVectorsWriter.java rename to lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/faiss/FaissKnnVectorsWriter.java index 3044092b871d..6cdf6bbf69f6 100644 --- a/lucene/sandbox/src/java22/org/apache/lucene/sandbox/codecs/faiss/FaissKnnVectorsWriter.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/faiss/FaissKnnVectorsWriter.java @@ -25,7 +25,6 @@ import static org.apache.lucene.sandbox.codecs.faiss.LibFaissC.indexWrite; import java.io.IOException; -import java.io.UncheckedIOException; import java.lang.foreign.Arena; import java.lang.foreign.MemorySegment; import java.util.HashMap; @@ -48,6 +47,11 @@ import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.hnsw.IntToIntFunction; +/** + * Write per-segment Faiss indexes and associated metadata. 
+ * + * @lucene.experimental + */ public final class FaissKnnVectorsWriter extends KnnVectorsWriter { private final String description, indexParams; private final FlatVectorsWriter rawVectorsWriter; @@ -184,13 +188,7 @@ public void finish() throws IOException { @Override public void close() throws IOException { - rawVectorsWriter.close(); - if (meta != null) { - meta.close(); - } - if (data != null) { - data.close(); - } + IOUtils.close(rawVectorsWriter, meta, data); } @Override @@ -232,11 +230,7 @@ public FloatVectorValues copy() { @Override public DocIndexIterator iterator() { - try { - return fromDISI(docIdSet.iterator()); - } catch (IOException e) { - throw new UncheckedIOException(e); - } + return fromDISI(docIdSet.iterator()); } } } diff --git a/lucene/sandbox/src/java22/org/apache/lucene/sandbox/codecs/faiss/LibFaissC.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/faiss/LibFaissC.java similarity index 89% rename from lucene/sandbox/src/java22/org/apache/lucene/sandbox/codecs/faiss/LibFaissC.java rename to lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/faiss/LibFaissC.java index 0a22266256bc..9324feff607d 100644 --- a/lucene/sandbox/src/java22/org/apache/lucene/sandbox/codecs/faiss/LibFaissC.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/faiss/LibFaissC.java @@ -20,6 +20,7 @@ import static java.lang.foreign.ValueLayout.JAVA_FLOAT; import static java.lang.foreign.ValueLayout.JAVA_INT; import static java.lang.foreign.ValueLayout.JAVA_LONG; +import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS; import java.io.IOException; import java.lang.foreign.Arena; @@ -45,6 +46,15 @@ import org.apache.lucene.util.FixedBitSet; import org.apache.lucene.util.hnsw.IntToIntFunction; +/** + * Utility class to wrap necessary functions of the native C_API of Faiss using Project Panama (library {@value + * LibFaissC#LIBRARY_NAME}, version {@value LibFaissC#LIBRARY_VERSION}, build using this guide and + * add to runtime 
along with all dependencies). + * + * @lucene.experimental + */ public final class LibFaissC { /* * TODO: Requires some changes to Faiss @@ -57,15 +67,7 @@ public final class LibFaissC { public static final String LIBRARY_VERSION = "1.10.0"; static { - try { - System.loadLibrary(LIBRARY_NAME); - } catch (UnsatisfiedLinkError e) { - throw new RuntimeException( - "Shared library not found, build the Faiss C_API from https://github.com/facebookresearch/faiss/blob/main/c_api/INSTALL.md " - + "and link it (along with all dependencies) to the library path " - + "(-Djava.library.path JVM argument or $LD_LIBRARY_PATH environment variable)", - e); - } + System.loadLibrary(LIBRARY_NAME); checkLibraryVersion(); } @@ -183,7 +185,8 @@ public static MemorySegment createIndex( switch (function) { case DOT_PRODUCT -> 0; case EUCLIDEAN -> 1; - default -> throw new UnsupportedOperationException("Metric type not supported"); + case COSINE, MAXIMUM_INNER_PRODUCT -> + throw new UnsupportedOperationException("Metric type not supported"); }; // Create an index @@ -214,9 +217,9 @@ public static MemorySegment createIndex( LongBuffer idsBuffer = ids.asByteBuffer().order(ByteOrder.nativeOrder()).asLongBuffer(); KnnVectorValues.DocIndexIterator iterator = floatVectorValues.iterator(); - for (int i = 0; i < size; i++) { - idsBuffer.put(oldToNewDocId.apply(iterator.nextDoc())); - docsBuffer.put(floatVectorValues.vectorValue(i)); + for (int i = iterator.nextDoc(); i != NO_MORE_DOCS; i = iterator.nextDoc()) { + idsBuffer.put(oldToNewDocId.apply(i)); + docsBuffer.put(floatVectorValues.vectorValue(iterator.index())); } // Train index @@ -231,6 +234,7 @@ public static MemorySegment createIndex( } } + @SuppressWarnings("unused") // called using a MethodHandle private static int writeBytes( IndexOutput output, MemorySegment inputPointer, int itemSize, int numItems) throws IOException { @@ -242,6 +246,7 @@ private static int writeBytes( return numItems; } + @SuppressWarnings("unused") // called using a 
MethodHandle private static int readBytes( IndexInput input, MemorySegment outputPointer, int itemSize, int numItems) throws IOException { @@ -348,7 +353,12 @@ public static MemorySegment indexRead(IndexInput input, int ioFlags) { ADDRESS); public static void indexSearch( - MemorySegment indexPointer, float[] query, KnnCollector knnCollector, Bits acceptDocs) { + MemorySegment indexPointer, + VectorSimilarityFunction function, + float[] query, + KnnCollector knnCollector, + Bits acceptDocs) { + try (Arena temp = Arena.ofConfined()) { FixedBitSet fixedBitSet = switch (acceptDocs) { @@ -416,7 +426,23 @@ public static void indexSearch( // Record hits for (int i = 0; i < k; i++) { - knnCollector.collect((int) ids[i], distances[i]); + + // Scale Faiss distances to Lucene scores, see VectorSimilarityFunction.java + float score = + switch (function) { + case DOT_PRODUCT -> + // distance in Faiss === dotProduct in Lucene + Math.max((1 + distances[i]) / 2, 0); + + case EUCLIDEAN -> + // distance in Faiss === squareDistance in Lucene + 1 / (1 + distances[i]); + + case COSINE, MAXIMUM_INNER_PRODUCT -> + throw new UnsupportedOperationException("Metric type not supported"); + }; + + knnCollector.collect((int) ids[i], score); } } } @@ -449,6 +475,11 @@ private static void callAndHandleError(MethodHandle handle, Object... args) { } } + /** + * Exception used to rethrow handled Faiss errors in native code. 
+ * + * @lucene.experimental + */ public static class FaissException extends RuntimeException { public FaissException(String message) { super(message); diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/faiss/package-info.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/faiss/package-info.java index e8938dfb3782..bb8cbd997434 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/faiss/package-info.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/faiss/package-info.java @@ -15,7 +15,9 @@ * limitations under the License. */ /** - * Wraps Faiss to create and search vector - * indexes via {@link org.apache.lucene.sandbox.codecs.faiss.FaissKnnVectorsFormatProvider}. + * Provides a Faiss-based vector codec via {@link + * org.apache.lucene.sandbox.codecs.faiss.FaissKnnVectorsFormat}. + * + * @lucene.experimental */ package org.apache.lucene.sandbox.codecs.faiss; diff --git a/lucene/sandbox/src/resources/META-INF/services/org.apache.lucene.codecs.KnnVectorsFormat b/lucene/sandbox/src/resources/META-INF/services/org.apache.lucene.codecs.KnnVectorsFormat index 418d70fb51fd..29a44d2ecfa8 100644 --- a/lucene/sandbox/src/resources/META-INF/services/org.apache.lucene.codecs.KnnVectorsFormat +++ b/lucene/sandbox/src/resources/META-INF/services/org.apache.lucene.codecs.KnnVectorsFormat @@ -13,4 +13,4 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-org.apache.lucene.sandbox.codecs.faiss.FaissKnnVectorsFormatProvider +org.apache.lucene.sandbox.codecs.faiss.FaissKnnVectorsFormat diff --git a/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/faiss/TestFaissKnnVectorsFormat.java b/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/faiss/TestFaissKnnVectorsFormat.java new file mode 100644 index 000000000000..4118f459349b --- /dev/null +++ b/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/faiss/TestFaissKnnVectorsFormat.java @@ -0,0 +1,99 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.sandbox.codecs.faiss; + +import static org.apache.lucene.index.VectorEncoding.FLOAT32; +import static org.apache.lucene.index.VectorSimilarityFunction.DOT_PRODUCT; +import static org.apache.lucene.index.VectorSimilarityFunction.EUCLIDEAN; + +import java.io.IOException; +import org.apache.lucene.codecs.Codec; +import org.apache.lucene.index.VectorEncoding; +import org.apache.lucene.index.VectorSimilarityFunction; +import org.apache.lucene.tests.index.BaseKnnVectorsFormatTestCase; +import org.apache.lucene.tests.util.TestUtil; +import org.junit.BeforeClass; +import org.junit.Ignore; + +/** Tests for {@link FaissKnnVectorsFormat}. 
*/ +public class TestFaissKnnVectorsFormat extends BaseKnnVectorsFormatTestCase { + private static final VectorEncoding[] SUPPORTED_ENCODINGS = {FLOAT32}; + private static final VectorSimilarityFunction[] SUPPORTED_FUNCTIONS = {DOT_PRODUCT, EUCLIDEAN}; + + @BeforeClass + public static void checkLibraryPresent() throws ClassNotFoundException { + boolean faissLibraryPresent; + try { + Class.forName("org.apache.lucene.sandbox.codecs.faiss.LibFaissC"); + faissLibraryPresent = true; + } catch (UnsatisfiedLinkError _) { + faissLibraryPresent = false; + } + assumeTrue("Native libraries present", faissLibraryPresent); + } + + @Override + protected VectorEncoding randomVectorEncoding() { + return SUPPORTED_ENCODINGS[random().nextInt(SUPPORTED_ENCODINGS.length)]; + } + + @Override + protected VectorSimilarityFunction randomSimilarity() { + return SUPPORTED_FUNCTIONS[random().nextInt(SUPPORTED_FUNCTIONS.length)]; + } + + @Override + protected Codec getCodec() { + return TestUtil.alwaysKnnVectorsFormat(new FaissKnnVectorsFormat()); + } + + @Override + public void testRecall() throws IOException { + // only supports some functions + for (VectorSimilarityFunction similarity : SUPPORTED_FUNCTIONS) { + assertRecall(similarity, 0.5, 1.0); + } + } + + @Override + @Ignore // does not honour visitedLimit + public void testSearchWithVisitedLimit() {} + + @Override + @Ignore // does not support byte vectors + public void testByteVectorScorerIteration() {} + + @Override + @Ignore // does not support byte vectors + public void testMismatchedFields() {} + + @Override + @Ignore // does not support byte vectors + public void testSortedIndexBytes() {} + + @Override + @Ignore // does not support byte vectors + public void testRandomBytes() {} + + @Override + @Ignore // does not support byte vectors + public void testEmptyByteVectorData() {} + + @Override + @Ignore // does not support byte vectors + public void testMergingWithDifferentByteKnnFields() {} +} From 
b91260076179a01198a00f40a0970c59ff56297a Mon Sep 17 00:00:00 2001 From: Kaival Parikh Date: Thu, 6 Mar 2025 08:57:36 +0000 Subject: [PATCH 08/17] Add / fix some TODOs --- .../sandbox/codecs/faiss/FaissKnnVectorsReader.java | 8 ++++---- .../sandbox/codecs/faiss/FaissKnnVectorsWriter.java | 8 ++++---- .../org/apache/lucene/sandbox/codecs/faiss/LibFaissC.java | 6 +++++- 3 files changed, 13 insertions(+), 9 deletions(-) diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/faiss/FaissKnnVectorsReader.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/faiss/FaissKnnVectorsReader.java index d49c941cd4c1..1772ec8ed380 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/faiss/FaissKnnVectorsReader.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/faiss/FaissKnnVectorsReader.java @@ -156,8 +156,8 @@ public FloatVectorValues getFloatVectorValues(String field) throws IOException { @Override public ByteVectorValues getByteVectorValues(String field) { - // TODO: Support using SQ8 quantization, see - // https://github.com/opensearch-project/k-NN/pull/2425 + // TODO: Support using SQ8 quantization, see: + // - https://github.com/opensearch-project/k-NN/pull/2425 throw new UnsupportedOperationException("Byte vectors not supported"); } @@ -171,8 +171,8 @@ public void search(String field, float[] vector, KnnCollector knnCollector, Bits @Override public void search(String field, byte[] vector, KnnCollector knnCollector, Bits acceptDocs) { - // TODO: Support using SQ8 quantization, see - // https://github.com/opensearch-project/k-NN/pull/2425 + // TODO: Support using SQ8 quantization, see: + // - https://github.com/opensearch-project/k-NN/pull/2425 throw new UnsupportedOperationException("Byte vectors not supported"); } diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/faiss/FaissKnnVectorsWriter.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/faiss/FaissKnnVectorsWriter.java index 
6cdf6bbf69f6..c5c504e5eede 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/faiss/FaissKnnVectorsWriter.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/faiss/FaissKnnVectorsWriter.java @@ -99,8 +99,8 @@ public void mergeOneField(FieldInfo fieldInfo, MergeState mergeState) throws IOE rawVectorsWriter.mergeOneField(fieldInfo, mergeState); switch (fieldInfo.getVectorEncoding()) { case BYTE -> - // TODO: Support using SQ8 quantization, see - // https://github.com/opensearch-project/k-NN/pull/2425 + // TODO: Support using SQ8 quantization, see: + // - https://github.com/opensearch-project/k-NN/pull/2425 throw new UnsupportedOperationException("Byte vectors not supported"); case FLOAT32 -> { FloatVectorValues merged = @@ -124,8 +124,8 @@ public void flush(int maxDoc, Sorter.DocMap sortMap) throws IOException { FieldInfo fieldInfo = entry.getKey(); switch (fieldInfo.getVectorEncoding()) { case BYTE -> - // TODO: Support using SQ8 quantization, see - // https://github.com/opensearch-project/k-NN/pull/2425 + // TODO: Support using SQ8 quantization, see: + // - https://github.com/opensearch-project/k-NN/pull/2425 throw new UnsupportedOperationException("Byte vectors not supported"); case FLOAT32 -> { diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/faiss/LibFaissC.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/faiss/LibFaissC.java index 9324feff607d..ae5b05a083de 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/faiss/LibFaissC.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/faiss/LibFaissC.java @@ -57,7 +57,7 @@ */ public final class LibFaissC { /* - * TODO: Requires some changes to Faiss + * TODO: Requires some changes to Faiss, see: * - https://github.com/facebookresearch/faiss/pull/4158 (merged in main, to be released in v1.11.0) * - https://github.com/facebookresearch/faiss/pull/4167 (merged in main, to be released in v1.11.0) * - 
https://github.com/facebookresearch/faiss/pull/4180 (in progress) @@ -208,6 +208,10 @@ public static MemorySegment createIndex( indexPointer, temp.allocateFrom(indexParams)); + // TODO: Improve memory usage (with a tradeoff in performance) by batched indexing, see: + // - https://github.com/opensearch-project/k-NN/issues/1506 + // - https://github.com/opensearch-project/k-NN/issues/1938 + // Allocate docs in native memory MemorySegment docs = temp.allocate(JAVA_FLOAT, (long) size * dimension); FloatBuffer docsBuffer = docs.asByteBuffer().order(ByteOrder.nativeOrder()).asFloatBuffer(); From fa27a844b028e34be74f80fcfd20861c23150918 Mon Sep 17 00:00:00 2001 From: Kaival Parikh Date: Tue, 11 Mar 2025 08:07:57 +0000 Subject: [PATCH 09/17] Add GitHub workflow for testing the Faiss codec --- .../workflows/run-special-checks-sandbox.yml | 81 +++++++++++++++++++ 1 file changed, 81 insertions(+) create mode 100644 .github/workflows/run-special-checks-sandbox.yml diff --git a/.github/workflows/run-special-checks-sandbox.yml b/.github/workflows/run-special-checks-sandbox.yml new file mode 100644 index 000000000000..43cf1c5e4f92 --- /dev/null +++ b/.github/workflows/run-special-checks-sandbox.yml @@ -0,0 +1,81 @@ +name: "Run special checks: module lucene/sandbox" + +on: + workflow_dispatch: + + pull_request: + branches: + - '*' + + push: + branches: + - 'main' + - 'branch_10x' + +jobs: + faiss-tests: + name: tests for the Faiss codec (JDK ${{ matrix.java }} on ${{ matrix.os }}) + timeout-minutes: 15 + + strategy: + matrix: + os: [ ubuntu-latest ] + java: [ '23' ] + + runs-on: ${{ matrix.os }} + + steps: + - name: Setup conda for dependencies + uses: conda-incubator/setup-miniconda@v3 + with: + auto-activate-base: false + auto-update-conda: true + activate-environment: faiss + channels: pytorch + conda-remove-defaults: true + miniforge-version: latest + + # TODO: Not needed after https://github.com/facebookresearch/faiss/pull/4186 is released in v1.11.0, the C_API will be 
published to Conda + - name: Checkout Faiss + uses: actions/checkout@v4 + with: + # TODO: Change to facebookresearch/faiss@main after https://github.com/facebookresearch/faiss/pull/4180 is merged + repository: kaivalnp/faiss + ref: custom_io_c + path: faiss + + # TODO: Not needed after https://github.com/facebookresearch/faiss/pull/4186 is released in v1.11.0, the C_API will be published to Conda + - name: Build C_API of Faiss + working-directory: faiss + run: | + conda install faiss-cpu=1.10.0 cmake=3.26 make=4.2 + echo "$CONDA/bin" >> $GITHUB_PATH + cmake -B build \ + -DBUILD_TESTING=OFF \ + -DFAISS_ENABLE_PYTHON=OFF \ + -DBUILD_SHARED_LIBS=ON \ + -DFAISS_ENABLE_GPU=OFF \ + -DFAISS_ENABLE_CUVS=OFF \ + -DFAISS_ENABLE_ROCM=OFF \ + -DFAISS_OPT_LEVEL=generic \ + -DFAISS_ENABLE_C_API=ON \ + -DCMAKE_BUILD_TYPE=Release \ + . + make -k -C build -j$(nproc) + env: + LD_LIBRARY_PATH: ${{ env.CONDA_PREFIX }}/lib + + - name: Checkout Lucene + uses: actions/checkout@v4 + + - name: Prepare Lucene workspace + uses: ./.github/actions/prepare-for-build + + - name: Run tests for Faiss codec + run: ./gradlew -p lucene/sandbox test --tests "org.apache.lucene.sandbox.codecs.faiss.*" + env: + LD_LIBRARY_PATH: ${{ github.workspace }}/faiss/build/c_api:${{ env.CONDA_PREFIX }}/lib + + defaults: + run: + shell: bash -el {0} From bbb3bf380da0227d78a60e7e6fdb9cd298ca79cf Mon Sep 17 00:00:00 2001 From: Kaival Parikh Date: Tue, 11 Mar 2025 19:31:33 +0000 Subject: [PATCH 10/17] Misc changes - Handle incomplete results - Add note about internal threading - Only keep the format public, rest as package-private --- .../lucene/sandbox/codecs/faiss/FaissKnnVectorsFormat.java | 2 ++ .../lucene/sandbox/codecs/faiss/FaissKnnVectorsReader.java | 2 +- .../lucene/sandbox/codecs/faiss/FaissKnnVectorsWriter.java | 2 +- .../org/apache/lucene/sandbox/codecs/faiss/LibFaissC.java | 6 +++++- 4 files changed, 9 insertions(+), 3 deletions(-) diff --git 
a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/faiss/FaissKnnVectorsFormat.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/faiss/FaissKnnVectorsFormat.java index 6796e0a9ecd8..dbe8a085e54e 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/faiss/FaissKnnVectorsFormat.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/faiss/FaissKnnVectorsFormat.java @@ -40,6 +40,8 @@ *
  • All files required by {@link Lucene99FlatVectorsFormat} for storing raw vectors. * * + *

    Note: Set the {@code $OMP_NUM_THREADS} environment variable to control internal threading. + * * @lucene.experimental */ public final class FaissKnnVectorsFormat extends KnnVectorsFormat { diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/faiss/FaissKnnVectorsReader.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/faiss/FaissKnnVectorsReader.java index 1772ec8ed380..ae00e3a6bcc1 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/faiss/FaissKnnVectorsReader.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/faiss/FaissKnnVectorsReader.java @@ -51,7 +51,7 @@ * * @lucene.experimental */ -public final class FaissKnnVectorsReader extends KnnVectorsReader { +final class FaissKnnVectorsReader extends KnnVectorsReader { private final FlatVectorsReader rawVectorsReader; private final IndexInput meta, data; private final Map indexMap; diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/faiss/FaissKnnVectorsWriter.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/faiss/FaissKnnVectorsWriter.java index c5c504e5eede..fd3ce274b0be 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/faiss/FaissKnnVectorsWriter.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/faiss/FaissKnnVectorsWriter.java @@ -52,7 +52,7 @@ * * @lucene.experimental */ -public final class FaissKnnVectorsWriter extends KnnVectorsWriter { +final class FaissKnnVectorsWriter extends KnnVectorsWriter { private final String description, indexParams; private final FlatVectorsWriter rawVectorsWriter; private final IndexOutput meta, data; diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/faiss/LibFaissC.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/faiss/LibFaissC.java index ae5b05a083de..9dfeccbf4e8d 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/faiss/LibFaissC.java +++ 
b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/faiss/LibFaissC.java @@ -55,7 +55,7 @@ * * @lucene.experimental */ -public final class LibFaissC { +final class LibFaissC { /* * TODO: Requires some changes to Faiss, see: * - https://github.com/facebookresearch/faiss/pull/4158 (merged in main, to be released in v1.11.0) @@ -430,6 +430,10 @@ public static void indexSearch( // Record hits for (int i = 0; i < k; i++) { + // Not enough results + if (ids[i] == -1) { + break; + } // Scale Faiss distances to Lucene scores, see VectorSimilarityFunction.java float score = From 717012b50a3ee54eaa5962873a6bc4c0ee099b66 Mon Sep 17 00:00:00 2001 From: Kaival Parikh Date: Wed, 2 Apr 2025 07:59:54 +0000 Subject: [PATCH 11/17] Follow-up to Faiss PRs - Use built library from Conda - Enforce tests to run in CI - Misc fixes --- .../workflows/run-special-checks-sandbox.yml | 42 ++++--------------- gradle/testing/randomization.gradle | 1 + .../codecs/faiss/FaissKnnVectorsWriter.java | 8 +++- .../sandbox/codecs/faiss/LibFaissC.java | 14 +++---- .../faiss/TestFaissKnnVectorsFormat.java | 16 ++++++- 5 files changed, 35 insertions(+), 46 deletions(-) diff --git a/.github/workflows/run-special-checks-sandbox.yml b/.github/workflows/run-special-checks-sandbox.yml index 43cf1c5e4f92..d62e76cbb10a 100644 --- a/.github/workflows/run-special-checks-sandbox.yml +++ b/.github/workflows/run-special-checks-sandbox.yml @@ -25,7 +25,7 @@ jobs: runs-on: ${{ matrix.os }} steps: - - name: Setup conda for dependencies + - name: Setup Conda uses: conda-incubator/setup-miniconda@v3 with: auto-activate-base: false @@ -35,35 +35,9 @@ jobs: conda-remove-defaults: true miniforge-version: latest - # TODO: Not needed after https://github.com/facebookresearch/faiss/pull/4186 is released in v1.11.0, the C_API will be published to Conda - - name: Checkout Faiss - uses: actions/checkout@v4 - with: - # TODO: Change to facebookresearch/faiss@main after https://github.com/facebookresearch/faiss/pull/4180 is 
merged - repository: kaivalnp/faiss - ref: custom_io_c - path: faiss - - # TODO: Not needed after https://github.com/facebookresearch/faiss/pull/4186 is released in v1.11.0, the C_API will be published to Conda - - name: Build C_API of Faiss - working-directory: faiss - run: | - conda install faiss-cpu=1.10.0 cmake=3.26 make=4.2 - echo "$CONDA/bin" >> $GITHUB_PATH - cmake -B build \ - -DBUILD_TESTING=OFF \ - -DFAISS_ENABLE_PYTHON=OFF \ - -DBUILD_SHARED_LIBS=ON \ - -DFAISS_ENABLE_GPU=OFF \ - -DFAISS_ENABLE_CUVS=OFF \ - -DFAISS_ENABLE_ROCM=OFF \ - -DFAISS_OPT_LEVEL=generic \ - -DFAISS_ENABLE_C_API=ON \ - -DCMAKE_BUILD_TYPE=Release \ - . - make -k -C build -j$(nproc) - env: - LD_LIBRARY_PATH: ${{ env.CONDA_PREFIX }}/lib + - name: Install Faiss + # TODO: Uses the nightly version (@main) until v1.11.0 is released + run: conda install pytorch/label/nightly::faiss-cpu - name: Checkout Lucene uses: actions/checkout@v4 @@ -72,9 +46,11 @@ jobs: uses: ./.github/actions/prepare-for-build - name: Run tests for Faiss codec - run: ./gradlew -p lucene/sandbox test --tests "org.apache.lucene.sandbox.codecs.faiss.*" - env: - LD_LIBRARY_PATH: ${{ github.workspace }}/faiss/build/c_api:${{ env.CONDA_PREFIX }}/lib + run: > + LD_LIBRARY_PATH=$CONDA_PREFIX/lib + ./gradlew -p lucene/sandbox + -Dtests.faiss.run=true + test --tests "org.apache.lucene.sandbox.codecs.faiss.*" defaults: run: diff --git a/gradle/testing/randomization.gradle b/gradle/testing/randomization.gradle index 185cd0872a9c..eaa33712cd6b 100644 --- a/gradle/testing/randomization.gradle +++ b/gradle/testing/randomization.gradle @@ -116,6 +116,7 @@ allprojects { description: "Forces use of integer vectors even when slow."], [propName: 'tests.defaultvectorization', value: false, description: "Uses defaults for running tests with correct JVM settings to test Panama vectorization (tests.jvmargs, tests.vectorsize, tests.forceintegervectors)."], + [propName: 'tests.faiss.run', value: false, description: "Explicitly run tests for 
the Faiss codec."], ] } } diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/faiss/FaissKnnVectorsWriter.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/faiss/FaissKnnVectorsWriter.java index fd3ce274b0be..96f499c6de74 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/faiss/FaissKnnVectorsWriter.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/faiss/FaissKnnVectorsWriter.java @@ -57,7 +57,7 @@ final class FaissKnnVectorsWriter extends KnnVectorsWriter { private final FlatVectorsWriter rawVectorsWriter; private final IndexOutput meta, data; private final Map> rawFields; - private boolean finished; + private boolean closed, finished; public FaissKnnVectorsWriter( String description, @@ -70,6 +70,7 @@ public FaissKnnVectorsWriter( this.indexParams = indexParams; this.rawVectorsWriter = rawVectorsWriter; this.rawFields = new HashMap<>(); + this.closed = false; this.finished = false; boolean failure = true; @@ -188,7 +189,10 @@ public void finish() throws IOException { @Override public void close() throws IOException { - IOUtils.close(rawVectorsWriter, meta, data); + if (closed == false) { + IOUtils.close(rawVectorsWriter, meta, data); + closed = true; + } } @Override diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/faiss/LibFaissC.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/faiss/LibFaissC.java index 9dfeccbf4e8d..feead956ee9a 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/faiss/LibFaissC.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/faiss/LibFaissC.java @@ -48,22 +48,18 @@ /** * Utility class to wrap necessary functions of the native C_API of Faiss using Project Panama (library {@value - * LibFaissC#LIBRARY_NAME}, version {@value LibFaissC#LIBRARY_VERSION}, build using Project Panama (install from Conda or build using this guide and * add to runtime along with all dependencies). 
* * @lucene.experimental */ final class LibFaissC { - /* - * TODO: Requires some changes to Faiss, see: - * - https://github.com/facebookresearch/faiss/pull/4158 (merged in main, to be released in v1.11.0) - * - https://github.com/facebookresearch/faiss/pull/4167 (merged in main, to be released in v1.11.0) - * - https://github.com/facebookresearch/faiss/pull/4180 (in progress) - */ - + // TODO: Use vectorized version where available public static final String LIBRARY_NAME = "faiss_c"; + + // TODO: Uses the nightly version (@main) until v1.11.0 is released public static final String LIBRARY_VERSION = "1.10.0"; static { diff --git a/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/faiss/TestFaissKnnVectorsFormat.java b/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/faiss/TestFaissKnnVectorsFormat.java index 4118f459349b..4239e3d0b3b1 100644 --- a/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/faiss/TestFaissKnnVectorsFormat.java +++ b/lucene/sandbox/src/test/org/apache/lucene/sandbox/codecs/faiss/TestFaissKnnVectorsFormat.java @@ -29,13 +29,25 @@ import org.junit.BeforeClass; import org.junit.Ignore; -/** Tests for {@link FaissKnnVectorsFormat}. */ +/** + * Tests for {@link FaissKnnVectorsFormat}. 
Will run only if required shared libraries (including + * dependencies) are present at runtime, or the {@value #FAISS_RUN_TESTS} JVM arg is set to {@code + * true} + */ public class TestFaissKnnVectorsFormat extends BaseKnnVectorsFormatTestCase { + private static final String FAISS_RUN_TESTS = "tests.faiss.run"; + private static final VectorEncoding[] SUPPORTED_ENCODINGS = {FLOAT32}; private static final VectorSimilarityFunction[] SUPPORTED_FUNCTIONS = {DOT_PRODUCT, EUCLIDEAN}; @BeforeClass - public static void checkLibraryPresent() throws ClassNotFoundException { + public static void maybeSuppress() throws ClassNotFoundException { + // Explicitly run tests + if (Boolean.getBoolean(FAISS_RUN_TESTS)) { + return; + } + + // Otherwise check if dependencies are present boolean faissLibraryPresent; try { Class.forName("org.apache.lucene.sandbox.codecs.faiss.LibFaissC"); From ed2e6d44d7814130634d6c9b1f38f6e803de2815 Mon Sep 17 00:00:00 2001 From: Kaival Parikh Date: Sat, 26 Apr 2025 11:58:11 +0000 Subject: [PATCH 12/17] Set Faiss to v1.11.0 - Faiss v1.11.0 was recently released (https://github.com/facebookresearch/faiss/releases/tag/v1.11.0) - Also switch Conda distribution to micromamba, which has a permissive license --- .../workflows/run-special-checks-sandbox.yml | 28 +++++++------------ .../sandbox/codecs/faiss/LibFaissC.java | 7 +++-- 2 files changed, 14 insertions(+), 21 deletions(-) diff --git a/.github/workflows/run-special-checks-sandbox.yml b/.github/workflows/run-special-checks-sandbox.yml index d62e76cbb10a..955679c11c91 100644 --- a/.github/workflows/run-special-checks-sandbox.yml +++ b/.github/workflows/run-special-checks-sandbox.yml @@ -25,19 +25,13 @@ jobs: runs-on: ${{ matrix.os }} steps: - - name: Setup Conda - uses: conda-incubator/setup-miniconda@v3 - with: - auto-activate-base: false - auto-update-conda: true - activate-environment: faiss - channels: pytorch - conda-remove-defaults: true - miniforge-version: latest - - name: Install Faiss - # TODO: 
Uses the nightly version (@main) until v1.11.0 is released - run: conda install pytorch/label/nightly::faiss-cpu + uses: mamba-org/setup-micromamba@v2 + id: setup + with: + environment-name: faiss-env + condarc: 'channels: [pytorch, conda-forge]' + create-args: faiss-cpu=1.11.0 - name: Checkout Lucene uses: actions/checkout@v4 @@ -46,12 +40,10 @@ jobs: uses: ./.github/actions/prepare-for-build - name: Run tests for Faiss codec - run: > - LD_LIBRARY_PATH=$CONDA_PREFIX/lib - ./gradlew -p lucene/sandbox - -Dtests.faiss.run=true - test --tests "org.apache.lucene.sandbox.codecs.faiss.*" + env: + LD_LIBRARY_PATH: ${{ steps.setup.outputs.environment-path }}/lib + run: ./gradlew -p lucene/sandbox -Dtests.faiss.run=true test --tests "org.apache.lucene.sandbox.codecs.faiss.*" defaults: run: - shell: bash -el {0} + shell: bash -leo pipefail {0} diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/faiss/LibFaissC.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/faiss/LibFaissC.java index feead956ee9a..b744d74bf994 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/faiss/LibFaissC.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/faiss/LibFaissC.java @@ -53,14 +53,15 @@ * href="https://github.com/facebookresearch/faiss/blob/main/c_api/INSTALL.md">this guide and * add to runtime along with all dependencies). * + *

    Important Note: When installing from Conda, ensure that the license of the distribution and + * channels being used is applicable to you! + * * @lucene.experimental */ final class LibFaissC { // TODO: Use vectorized version where available public static final String LIBRARY_NAME = "faiss_c"; - - // TODO: Uses the nightly version (@main) until v1.11.0 is released - public static final String LIBRARY_VERSION = "1.10.0"; + public static final String LIBRARY_VERSION = "1.11.0"; static { System.loadLibrary(LIBRARY_NAME); From 56f39d33fac5cf7ba8bfab2909d831cc453e93be Mon Sep 17 00:00:00 2001 From: Kaival Parikh Date: Sat, 26 Apr 2025 13:16:53 +0000 Subject: [PATCH 13/17] Minor fix - Delegate off-heap byte size function to raw format --- .../lucene/sandbox/codecs/faiss/FaissKnnVectorsReader.java | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/faiss/FaissKnnVectorsReader.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/faiss/FaissKnnVectorsReader.java index ae00e3a6bcc1..7e5b7bddfb77 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/faiss/FaissKnnVectorsReader.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/faiss/FaissKnnVectorsReader.java @@ -176,6 +176,12 @@ public void search(String field, byte[] vector, KnnCollector knnCollector, Bits throw new UnsupportedOperationException("Byte vectors not supported"); } + @Override + public Map getOffHeapByteSize(FieldInfo fieldInfo) { + // TODO: How to estimate Faiss usage? 
+ return rawVectorsReader.getOffHeapByteSize(fieldInfo); + } + @Override public void close() throws IOException { if (closed == false) { From 52a1acde2dab614a1b4022c9a0e340bce5faf3da Mon Sep 17 00:00:00 2001 From: Kaival Parikh Date: Tue, 6 May 2025 18:24:44 +0000 Subject: [PATCH 14/17] Minor fixes - Bump Java version to 24 after recent commit - Add comment about using only conda-forge - Refactor and make Faiss version explicit --- .github/workflows/run-special-checks-sandbox.yml | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/.github/workflows/run-special-checks-sandbox.yml b/.github/workflows/run-special-checks-sandbox.yml index 955679c11c91..d54485b309da 100644 --- a/.github/workflows/run-special-checks-sandbox.yml +++ b/.github/workflows/run-special-checks-sandbox.yml @@ -14,13 +14,14 @@ on: jobs: faiss-tests: - name: tests for the Faiss codec (JDK ${{ matrix.java }} on ${{ matrix.os }}) + name: tests for the Faiss codec (v${{ matrix.faiss-version }} with JDK ${{ matrix.java }} on ${{ matrix.os }}) timeout-minutes: 15 strategy: matrix: os: [ ubuntu-latest ] - java: [ '23' ] + java: [ '24' ] + faiss-version: [ '1.11.0' ] runs-on: ${{ matrix.os }} @@ -30,8 +31,9 @@ jobs: id: setup with: environment-name: faiss-env + # TODO: Use only conda-forge if possible, see https://github.com/conda-forge/faiss-split-feedstock/pull/88 condarc: 'channels: [pytorch, conda-forge]' - create-args: faiss-cpu=1.11.0 + create-args: faiss-cpu=${{ matrix.faiss-version }} - name: Checkout Lucene uses: actions/checkout@v4 From 018d15db7717380719c0f9c0feb425b403ad8b4d Mon Sep 17 00:00:00 2001 From: Kaival Parikh Date: Tue, 20 May 2025 16:45:17 +0000 Subject: [PATCH 15/17] Rebase changes - withReadAdvice -> withHints --- .../lucene/sandbox/codecs/faiss/FaissKnnVectorsReader.java | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/faiss/FaissKnnVectorsReader.java 
b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/faiss/FaissKnnVectorsReader.java index 7e5b7bddfb77..07bee0e1a9df 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/faiss/FaissKnnVectorsReader.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/faiss/FaissKnnVectorsReader.java @@ -40,9 +40,10 @@ import org.apache.lucene.index.SegmentReadState; import org.apache.lucene.index.VectorSimilarityFunction; import org.apache.lucene.search.KnnCollector; +import org.apache.lucene.store.DataAccessHint; +import org.apache.lucene.store.FileTypeHint; import org.apache.lucene.store.IOContext; import org.apache.lucene.store.IndexInput; -import org.apache.lucene.store.ReadAdvice; import org.apache.lucene.util.Bits; import org.apache.lucene.util.IOUtils; @@ -82,7 +83,7 @@ public FaissKnnVectorsReader(SegmentReadState state, FlatVectorsReader rawVector DATA_CODEC_NAME, VERSION_START, VERSION_CURRENT, - state.context.withReadAdvice(ReadAdvice.RANDOM)); + state.context.withHints(FileTypeHint.DATA, DataAccessHint.RANDOM)); Map.Entry entry; while ((entry = parseNextField(state)) != null) { From f16171672d62ac99d14a04f981ceff971274769a Mon Sep 17 00:00:00 2001 From: Kaival Parikh Date: Mon, 9 Jun 2025 05:09:01 +0000 Subject: [PATCH 16/17] Some final changes - Remove rethrowing of Faiss exception which is thread-unsafe and causes SIGSEGVs! 
- Stronger IO handling in vector reader / writer - Implement buffered index IO - Add docs to demonstrate how to install Faiss and use the format - CHANGES.txt entry --- .../workflows/run-special-checks-sandbox.yml | 10 +- lucene/CHANGES.txt | 2 + .../codecs/faiss/FaissKnnVectorsFormat.java | 6 +- .../codecs/faiss/FaissKnnVectorsReader.java | 126 +++++---- .../codecs/faiss/FaissKnnVectorsWriter.java | 55 ++-- .../sandbox/codecs/faiss/LibFaissC.java | 243 +++++++++++------- .../sandbox/codecs/faiss/package-info.java | 27 +- 7 files changed, 292 insertions(+), 177 deletions(-) diff --git a/.github/workflows/run-special-checks-sandbox.yml b/.github/workflows/run-special-checks-sandbox.yml index d54485b309da..8c21a7ab0e40 100644 --- a/.github/workflows/run-special-checks-sandbox.yml +++ b/.github/workflows/run-special-checks-sandbox.yml @@ -27,16 +27,18 @@ jobs: steps: - name: Install Faiss - uses: mamba-org/setup-micromamba@v2 + uses: mamba-org/setup-micromamba@b09ef9b599704322748535812ca03efb2625677b #v2.0.5 id: setup with: environment-name: faiss-env # TODO: Use only conda-forge if possible, see https://github.com/conda-forge/faiss-split-feedstock/pull/88 - condarc: 'channels: [pytorch, conda-forge]' - create-args: faiss-cpu=${{ matrix.faiss-version }} + create-args: >- + -c pytorch + -c conda-forge + faiss-cpu=${{ matrix.faiss-version }} - name: Checkout Lucene - uses: actions/checkout@v4 + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - name: Prepare Lucene workspace uses: ./.github/actions/prepare-for-build diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 1113c7b5e8e5..379a03d827d5 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -30,6 +30,8 @@ New Features --------------------- * GITHUB#14097: Binary partitioning merge policy over float-valued vector field. (Mike Sokolov) +* GITHUB#14178: Add a Faiss-based vector format in the sandbox module. 
(Kaival Parikh) + Improvements --------------------- diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/faiss/FaissKnnVectorsFormat.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/faiss/FaissKnnVectorsFormat.java index dbe8a085e54e..92f0d5b0a8d0 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/faiss/FaissKnnVectorsFormat.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/faiss/FaissKnnVectorsFormat.java @@ -31,6 +31,8 @@ * A format which uses Faiss to create and * search vector indexes, using {@link LibFaissC} to interact with the native library. * + *

    TODO: There is no guarantee of backwards compatibility! + * *

    A separate Faiss index is created per-segment, and uses the following files: * *

      @@ -40,7 +42,9 @@ *
    • All files required by {@link Lucene99FlatVectorsFormat} for storing raw vectors. *
    * - *

    Note: Set the {@code $OMP_NUM_THREADS} environment variable to control internal threading. + *

    Note: Set the {@code $OMP_NUM_THREADS} environment variable to control internal + * threading. * * @lucene.experimental */ diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/faiss/FaissKnnVectorsReader.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/faiss/FaissKnnVectorsReader.java index 07bee0e1a9df..a85f2b53d4bb 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/faiss/FaissKnnVectorsReader.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/faiss/FaissKnnVectorsReader.java @@ -22,27 +22,33 @@ import static org.apache.lucene.sandbox.codecs.faiss.FaissKnnVectorsFormat.META_EXTENSION; import static org.apache.lucene.sandbox.codecs.faiss.FaissKnnVectorsFormat.VERSION_CURRENT; import static org.apache.lucene.sandbox.codecs.faiss.FaissKnnVectorsFormat.VERSION_START; +import static org.apache.lucene.sandbox.codecs.faiss.LibFaissC.FAISS_IO_FLAG_MMAP; +import static org.apache.lucene.sandbox.codecs.faiss.LibFaissC.FAISS_IO_FLAG_READ_ONLY; import static org.apache.lucene.sandbox.codecs.faiss.LibFaissC.indexRead; import static org.apache.lucene.sandbox.codecs.faiss.LibFaissC.indexSearch; import java.io.IOException; import java.lang.foreign.Arena; import java.lang.foreign.MemorySegment; +import java.util.ArrayList; import java.util.HashMap; +import java.util.List; +import java.util.Locale; import java.util.Map; import org.apache.lucene.codecs.CodecUtil; import org.apache.lucene.codecs.KnnVectorsReader; import org.apache.lucene.codecs.hnsw.FlatVectorsReader; import org.apache.lucene.index.ByteVectorValues; +import org.apache.lucene.index.CorruptIndexException; import org.apache.lucene.index.FieldInfo; import org.apache.lucene.index.FloatVectorValues; import org.apache.lucene.index.IndexFileNames; import org.apache.lucene.index.SegmentReadState; import org.apache.lucene.index.VectorSimilarityFunction; import org.apache.lucene.search.KnnCollector; +import org.apache.lucene.store.ChecksumIndexInput; import 
org.apache.lucene.store.DataAccessHint; import org.apache.lucene.store.FileTypeHint; -import org.apache.lucene.store.IOContext; import org.apache.lucene.store.IndexInput; import org.apache.lucene.util.Bits; import org.apache.lucene.util.IOUtils; @@ -54,7 +60,7 @@ */ final class FaissKnnVectorsReader extends KnnVectorsReader { private final FlatVectorsReader rawVectorsReader; - private final IndexInput meta, data; + private final IndexInput data; private final Map indexMap; private final Arena arena; private boolean closed; @@ -66,57 +72,71 @@ public FaissKnnVectorsReader(SegmentReadState state, FlatVectorsReader rawVector this.arena = Arena.ofShared(); this.closed = false; - boolean failure = true; - try { - meta = - openInput( - state, - META_EXTENSION, - META_CODEC_NAME, - VERSION_START, - VERSION_CURRENT, - state.context); - data = - openInput( - state, - DATA_EXTENSION, + List fieldMetaList = new ArrayList<>(); + String metaFileName = + IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, META_EXTENSION); + try (ChecksumIndexInput meta = state.directory.openChecksumInput(metaFileName)) { + Throwable priorE = null; + int versionMeta = -1; + try { + versionMeta = + CodecUtil.checkIndexHeader( + meta, + META_CODEC_NAME, + VERSION_START, + VERSION_CURRENT, + state.segmentInfo.getId(), + state.segmentSuffix); + + FieldMeta fieldMeta; + while ((fieldMeta = parseNextField(meta, state)) != null) { + fieldMetaList.add(fieldMeta); + } + } catch (Throwable t) { + priorE = t; + } finally { + CodecUtil.checkFooter(meta, priorE); + } + + String dataFileName = + IndexFileNames.segmentFileName( + state.segmentInfo.name, state.segmentSuffix, DATA_EXTENSION); + this.data = + state.directory.openInput( + dataFileName, state.context.withHints(FileTypeHint.DATA, DataAccessHint.RANDOM)); + + int versionData = + CodecUtil.checkIndexHeader( + this.data, DATA_CODEC_NAME, VERSION_START, VERSION_CURRENT, - state.context.withHints(FileTypeHint.DATA, 
DataAccessHint.RANDOM)); - - Map.Entry entry; - while ((entry = parseNextField(state)) != null) { - this.indexMap.put(entry.getKey(), entry.getValue()); + state.segmentInfo.getId(), + state.segmentSuffix); + if (versionMeta != versionData) { + throw new CorruptIndexException( + String.format( + Locale.ROOT, + "Format versions mismatch (meta=%d, data=%d)", + versionMeta, + versionData), + data); } + CodecUtil.retrieveChecksum(data); - failure = false; - } finally { - if (failure) { - IOUtils.closeWhileHandlingException(this); + for (FieldMeta fieldMeta : fieldMetaList) { + if (indexMap.put(fieldMeta.fieldInfo.name, loadField(data, arena, fieldMeta)) != null) { + throw new CorruptIndexException("Duplicate field: " + fieldMeta.fieldInfo.name, meta); + } } + } catch (Throwable t) { + IOUtils.closeWhileSuppressingExceptions(t, this); + throw t; } } - @SuppressWarnings("SameParameterValue") - private IndexInput openInput( - SegmentReadState state, - String extension, - String codecName, - int versionStart, - int versionEnd, - IOContext context) + private static FieldMeta parseNextField(IndexInput meta, SegmentReadState state) throws IOException { - - String fileName = - IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, extension); - IndexInput input = state.directory.openInput(fileName, context); - CodecUtil.checkIndexHeader( - input, codecName, versionStart, versionEnd, state.segmentInfo.getId(), state.segmentSuffix); - return input; - } - - private Map.Entry parseNextField(SegmentReadState state) throws IOException { int fieldNumber = meta.readInt(); if (fieldNumber == -1) { return null; @@ -124,29 +144,31 @@ private Map.Entry parseNextField(SegmentReadState state) thr FieldInfo fieldInfo = state.fieldInfos.fieldInfo(fieldNumber); if (fieldInfo == null) { - throw new IllegalStateException("Invalid field"); + throw new CorruptIndexException("Invalid field number: " + fieldNumber, meta); } long dataOffset = meta.readLong(); long dataLength = 
meta.readLong(); - // See flags defined in c_api/index_io_c.h - int ioFlags = 3; + return new FieldMeta(fieldInfo, dataOffset, dataLength); + } + + private static IndexEntry loadField(IndexInput data, Arena arena, FieldMeta fieldMeta) + throws IOException { + int ioFlags = FAISS_IO_FLAG_MMAP | FAISS_IO_FLAG_READ_ONLY; // Read index into memory MemorySegment indexPointer = - indexRead(data.slice(fieldInfo.name, dataOffset, dataLength), ioFlags) + indexRead(data.slice(fieldMeta.fieldInfo.name, fieldMeta.offset, fieldMeta.length), ioFlags) // Ensure timely cleanup .reinterpret(arena, LibFaissC::freeIndex); - return Map.entry( - fieldInfo.name, new IndexEntry(indexPointer, fieldInfo.getVectorSimilarityFunction())); + return new IndexEntry(indexPointer, fieldMeta.fieldInfo.getVectorSimilarityFunction()); } @Override public void checkIntegrity() throws IOException { rawVectorsReader.checkIntegrity(); - CodecUtil.checksumEntireFile(meta); CodecUtil.checksumEntireFile(data); } @@ -186,10 +208,12 @@ public Map getOffHeapByteSize(FieldInfo fieldInfo) { @Override public void close() throws IOException { if (closed == false) { - IOUtils.close(rawVectorsReader, arena::close, meta, data); closed = true; + IOUtils.close(rawVectorsReader, arena::close, data, indexMap::clear); } } + private record FieldMeta(FieldInfo fieldInfo, long offset, long length) {} + private record IndexEntry(MemorySegment indexPointer, VectorSimilarityFunction function) {} } diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/faiss/FaissKnnVectorsWriter.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/faiss/FaissKnnVectorsWriter.java index 96f499c6de74..0336e85e607e 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/faiss/FaissKnnVectorsWriter.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/faiss/FaissKnnVectorsWriter.java @@ -21,6 +21,8 @@ import static org.apache.lucene.sandbox.codecs.faiss.FaissKnnVectorsFormat.META_CODEC_NAME; 
import static org.apache.lucene.sandbox.codecs.faiss.FaissKnnVectorsFormat.META_EXTENSION; import static org.apache.lucene.sandbox.codecs.faiss.FaissKnnVectorsFormat.VERSION_CURRENT; +import static org.apache.lucene.sandbox.codecs.faiss.LibFaissC.FAISS_IO_FLAG_MMAP; +import static org.apache.lucene.sandbox.codecs.faiss.LibFaissC.FAISS_IO_FLAG_READ_ONLY; import static org.apache.lucene.sandbox.codecs.faiss.LibFaissC.createIndex; import static org.apache.lucene.sandbox.codecs.faiss.LibFaissC.indexWrite; @@ -57,7 +59,7 @@ final class FaissKnnVectorsWriter extends KnnVectorsWriter { private final FlatVectorsWriter rawVectorsWriter; private final IndexOutput meta, data; private final Map> rawFields; - private boolean closed, finished; + private boolean finished; public FaissKnnVectorsWriter( String description, @@ -70,31 +72,36 @@ public FaissKnnVectorsWriter( this.indexParams = indexParams; this.rawVectorsWriter = rawVectorsWriter; this.rawFields = new HashMap<>(); - this.closed = false; this.finished = false; - boolean failure = true; try { - this.meta = openOutput(state, META_EXTENSION, META_CODEC_NAME); - this.data = openOutput(state, DATA_EXTENSION, DATA_CODEC_NAME); - failure = false; - } finally { - if (failure) { - IOUtils.closeWhileHandlingException(this); - } + String metaFileName = + IndexFileNames.segmentFileName( + state.segmentInfo.name, state.segmentSuffix, META_EXTENSION); + this.meta = state.directory.createOutput(metaFileName, state.context); + CodecUtil.writeIndexHeader( + this.meta, + META_CODEC_NAME, + VERSION_CURRENT, + state.segmentInfo.getId(), + state.segmentSuffix); + + String dataFileName = + IndexFileNames.segmentFileName( + state.segmentInfo.name, state.segmentSuffix, DATA_EXTENSION); + this.data = state.directory.createOutput(dataFileName, state.context); + CodecUtil.writeIndexHeader( + this.data, + DATA_CODEC_NAME, + VERSION_CURRENT, + state.segmentInfo.getId(), + state.segmentSuffix); + } catch (Throwable t) { + 
IOUtils.closeWhileSuppressingExceptions(t, this); + throw t; } } - private IndexOutput openOutput(SegmentWriteState state, String extension, String codecName) - throws IOException { - String fileName = - IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, extension); - IndexOutput output = state.directory.createOutput(fileName, state.context); - CodecUtil.writeIndexHeader( - output, codecName, VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix); - return output; - } - @Override public void mergeOneField(FieldInfo fieldInfo, MergeState mergeState) throws IOException { rawVectorsWriter.mergeOneField(fieldInfo, mergeState); @@ -161,8 +168,7 @@ private void writeFloatField( // Ensure timely cleanup .reinterpret(temp, LibFaissC::freeIndex); - // See flags defined in c_api/index_io_c.h - int ioFlags = 3; + int ioFlags = FAISS_IO_FLAG_MMAP | FAISS_IO_FLAG_READ_ONLY; // Write index long dataOffset = data.getFilePointer(); @@ -189,10 +195,7 @@ public void finish() throws IOException { @Override public void close() throws IOException { - if (closed == false) { - IOUtils.close(rawVectorsWriter, meta, data); - closed = true; - } + IOUtils.close(rawVectorsWriter, meta, data); } @Override diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/faiss/LibFaissC.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/faiss/LibFaissC.java index b744d74bf994..c521c4c20108 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/faiss/LibFaissC.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/faiss/LibFaissC.java @@ -26,7 +26,6 @@ import java.lang.foreign.Arena; import java.lang.foreign.FunctionDescriptor; import java.lang.foreign.Linker; -import java.lang.foreign.MemoryLayout; import java.lang.foreign.MemorySegment; import java.lang.foreign.SymbolLookup; import java.lang.invoke.MethodHandle; @@ -35,6 +34,7 @@ import java.nio.ByteOrder; import java.nio.FloatBuffer; import java.nio.LongBuffer; 
+import java.util.Arrays; import java.util.Locale; import org.apache.lucene.index.FloatVectorValues; import org.apache.lucene.index.KnnVectorValues; @@ -47,14 +47,9 @@ import org.apache.lucene.util.hnsw.IntToIntFunction; /** - * Utility class to wrap necessary functions of the native C_API of Faiss using Project Panama (install from Conda or build using this guide and - * add to runtime along with all dependencies). - * - *

    Important Note: When installing from Conda, ensure that the license of the distribution and - * channels being used is applicable to you! + * Utility class to wrap necessary functions of the native C API of Faiss + * using Project Panama. * * @lucene.experimental */ @@ -63,6 +58,12 @@ final class LibFaissC { public static final String LIBRARY_NAME = "faiss_c"; public static final String LIBRARY_VERSION = "1.11.0"; + // See flags defined in c_api/index_io_c.h + static final int FAISS_IO_FLAG_MMAP = 1; + static final int FAISS_IO_FLAG_READ_ONLY = 2; + + private static final int BUFFER_SIZE = 256 * 1024 * 1024; // 256 MB + static { System.loadLibrary(LIBRARY_NAME); checkLibraryVersion(); @@ -70,24 +71,24 @@ final class LibFaissC { private LibFaissC() {} - @SuppressWarnings("SameParameterValue") private static MemorySegment getUpcallStub( - Arena arena, MethodHandle target, MemoryLayout resLayout, MemoryLayout... argLayouts) { - return Linker.nativeLinker() - .upcallStub(target, FunctionDescriptor.of(resLayout, argLayouts), arena); + Arena arena, MethodHandle target, FunctionDescriptor descriptor) { + return Linker.nativeLinker().upcallStub(target, descriptor, arena); } private static MethodHandle getDowncallHandle( - String functionName, MemoryLayout resLayout, MemoryLayout... 
argLayouts) { + String functionName, FunctionDescriptor descriptor) { return Linker.nativeLinker() - .downcallHandle( - SymbolLookup.loaderLookup().find(functionName).orElseThrow(), - FunctionDescriptor.of(resLayout, argLayouts)); + .downcallHandle(SymbolLookup.loaderLookup().findOrThrow(functionName), descriptor); } private static void checkLibraryVersion() { - MethodHandle getVersion = getDowncallHandle("faiss_get_version", ADDRESS); - String actualVersion = callAndGetString(getVersion); + MethodHandle getVersion = + getDowncallHandle("faiss_get_version", FunctionDescriptor.of(ADDRESS)); + + MemorySegment nativeString = call(getVersion); + String actualVersion = nativeString.reinterpret(Long.MAX_VALUE).getString(0); + if (LIBRARY_VERSION.equals(actualVersion) == false) { throw new UnsupportedOperationException( String.format( @@ -99,71 +100,80 @@ private static void checkLibraryVersion() { } private static final MethodHandle FREE_INDEX = - getDowncallHandle("faiss_Index_free", JAVA_INT, ADDRESS); + getDowncallHandle("faiss_Index_free", FunctionDescriptor.ofVoid(ADDRESS)); public static void freeIndex(MemorySegment indexPointer) { - callAndHandleError(FREE_INDEX, indexPointer); + call(FREE_INDEX, indexPointer); } private static final MethodHandle FREE_CUSTOM_IO_WRITER = - getDowncallHandle("faiss_CustomIOWriter_free", JAVA_INT, ADDRESS); + getDowncallHandle("faiss_CustomIOWriter_free", FunctionDescriptor.ofVoid(ADDRESS)); public static void freeCustomIOWriter(MemorySegment customIOWriterPointer) { - callAndHandleError(FREE_CUSTOM_IO_WRITER, customIOWriterPointer); + call(FREE_CUSTOM_IO_WRITER, customIOWriterPointer); } private static final MethodHandle FREE_CUSTOM_IO_READER = - getDowncallHandle("faiss_CustomIOReader_free", JAVA_INT, ADDRESS); + getDowncallHandle("faiss_CustomIOReader_free", FunctionDescriptor.ofVoid(ADDRESS)); public static void freeCustomIOReader(MemorySegment customIOReaderPointer) { - callAndHandleError(FREE_CUSTOM_IO_READER, 
customIOReaderPointer); + call(FREE_CUSTOM_IO_READER, customIOReaderPointer); } private static final MethodHandle FREE_PARAMETER_SPACE = - getDowncallHandle("faiss_ParameterSpace_free", JAVA_INT, ADDRESS); + getDowncallHandle("faiss_ParameterSpace_free", FunctionDescriptor.ofVoid(ADDRESS)); private static void freeParameterSpace(MemorySegment parameterSpacePointer) { - callAndHandleError(FREE_PARAMETER_SPACE, parameterSpacePointer); + call(FREE_PARAMETER_SPACE, parameterSpacePointer); } private static final MethodHandle FREE_ID_SELECTOR_BITMAP = - getDowncallHandle("faiss_IDSelectorBitmap_free", JAVA_INT, ADDRESS); + getDowncallHandle("faiss_IDSelectorBitmap_free", FunctionDescriptor.ofVoid(ADDRESS)); private static void freeIDSelectorBitmap(MemorySegment idSelectorBitmapPointer) { - callAndHandleError(FREE_ID_SELECTOR_BITMAP, idSelectorBitmapPointer); + call(FREE_ID_SELECTOR_BITMAP, idSelectorBitmapPointer); } private static final MethodHandle FREE_SEARCH_PARAMETERS = - getDowncallHandle("faiss_SearchParameters_free", JAVA_INT, ADDRESS); + getDowncallHandle("faiss_SearchParameters_free", FunctionDescriptor.ofVoid(ADDRESS)); private static void freeSearchParameters(MemorySegment searchParametersPointer) { - callAndHandleError(FREE_SEARCH_PARAMETERS, searchParametersPointer); + call(FREE_SEARCH_PARAMETERS, searchParametersPointer); } private static final MethodHandle INDEX_FACTORY = - getDowncallHandle("faiss_index_factory", JAVA_INT, ADDRESS, JAVA_INT, ADDRESS, JAVA_INT); + getDowncallHandle( + "faiss_index_factory", + FunctionDescriptor.of(JAVA_INT, ADDRESS, JAVA_INT, ADDRESS, JAVA_INT)); private static final MethodHandle PARAMETER_SPACE_NEW = - getDowncallHandle("faiss_ParameterSpace_new", JAVA_INT, ADDRESS); + getDowncallHandle("faiss_ParameterSpace_new", FunctionDescriptor.of(JAVA_INT, ADDRESS)); private static final MethodHandle SET_INDEX_PARAMETERS = getDowncallHandle( - "faiss_ParameterSpace_set_index_parameters", JAVA_INT, ADDRESS, ADDRESS, ADDRESS); + 
"faiss_ParameterSpace_set_index_parameters", + FunctionDescriptor.of(JAVA_INT, ADDRESS, ADDRESS, ADDRESS)); private static final MethodHandle ID_SELECTOR_BITMAP_NEW = - getDowncallHandle("faiss_IDSelectorBitmap_new", JAVA_INT, ADDRESS, JAVA_LONG, ADDRESS); + getDowncallHandle( + "faiss_IDSelectorBitmap_new", + FunctionDescriptor.of(JAVA_INT, ADDRESS, JAVA_LONG, ADDRESS)); private static final MethodHandle SEARCH_PARAMETERS_NEW = - getDowncallHandle("faiss_SearchParameters_new", JAVA_INT, ADDRESS, ADDRESS); + getDowncallHandle( + "faiss_SearchParameters_new", FunctionDescriptor.of(JAVA_INT, ADDRESS, ADDRESS)); private static final MethodHandle INDEX_IS_TRAINED = - getDowncallHandle("faiss_Index_is_trained", JAVA_INT, ADDRESS); + getDowncallHandle("faiss_Index_is_trained", FunctionDescriptor.of(JAVA_INT, ADDRESS)); private static final MethodHandle INDEX_TRAIN = - getDowncallHandle("faiss_Index_train", JAVA_INT, ADDRESS, JAVA_LONG, ADDRESS); + getDowncallHandle( + "faiss_Index_train", FunctionDescriptor.of(JAVA_INT, ADDRESS, JAVA_LONG, ADDRESS)); private static final MethodHandle INDEX_ADD_WITH_IDS = - getDowncallHandle("faiss_Index_add_with_ids", JAVA_INT, ADDRESS, JAVA_LONG, ADDRESS, ADDRESS); + getDowncallHandle( + "faiss_Index_add_with_ids", + FunctionDescriptor.of(JAVA_INT, ADDRESS, JAVA_LONG, ADDRESS, ADDRESS)); public static MemorySegment createIndex( String description, @@ -224,7 +234,8 @@ public static MemorySegment createIndex( } // Train index - if (callAndGetInt(INDEX_IS_TRAINED, indexPointer) == 0) { + int isTrained = call(INDEX_IS_TRAINED, indexPointer); + if (isTrained == 0) { callAndHandleError(INDEX_TRAIN, indexPointer, size, docs); } @@ -236,30 +247,58 @@ public static MemorySegment createIndex( } @SuppressWarnings("unused") // called using a MethodHandle - private static int writeBytes( - IndexOutput output, MemorySegment inputPointer, int itemSize, int numItems) + private static long writeBytes( + IndexOutput output, MemorySegment inputPointer, 
long itemSize, long numItems) throws IOException { - // TODO: Can we avoid copying to heap? - byte[] bytes = - new byte[(int) (Integer.toUnsignedLong(itemSize) * Integer.toUnsignedLong(numItems))]; - inputPointer.reinterpret(bytes.length).asByteBuffer().order(ByteOrder.nativeOrder()).get(bytes); - output.writeBytes(bytes, 0, bytes.length); + long size = itemSize * numItems; + inputPointer = inputPointer.reinterpret(size); + + if (size <= BUFFER_SIZE) { // simple case, avoid buffering + byte[] bytes = new byte[(int) size]; + inputPointer.asSlice(0, size).asByteBuffer().order(ByteOrder.nativeOrder()).get(bytes); + output.writeBytes(bytes, bytes.length); + } else { // copy buffered number of bytes repeatedly + byte[] bytes = new byte[BUFFER_SIZE]; + for (long offset = 0; offset < size; offset += BUFFER_SIZE) { + int length = (int) Math.min(size - offset, BUFFER_SIZE); + inputPointer + .asSlice(offset, length) + .asByteBuffer() + .order(ByteOrder.nativeOrder()) + .get(bytes, 0, length); + output.writeBytes(bytes, length); + } + } return numItems; } @SuppressWarnings("unused") // called using a MethodHandle - private static int readBytes( - IndexInput input, MemorySegment outputPointer, int itemSize, int numItems) + private static long readBytes( + IndexInput input, MemorySegment outputPointer, long itemSize, long numItems) throws IOException { - // TODO: Can we avoid copying to heap? 
- byte[] bytes = - new byte[(int) (Integer.toUnsignedLong(itemSize) * Integer.toUnsignedLong(numItems))]; - input.readBytes(bytes, 0, bytes.length); - outputPointer - .reinterpret(bytes.length) - .asByteBuffer() - .order(ByteOrder.nativeOrder()) - .put(bytes); + long size = itemSize * numItems; + outputPointer = outputPointer.reinterpret(size); + + if (size <= BUFFER_SIZE) { // simple case, avoid buffering + byte[] bytes = new byte[(int) size]; + input.readBytes(bytes, 0, bytes.length); + outputPointer + .asSlice(0, bytes.length) + .asByteBuffer() + .order(ByteOrder.nativeOrder()) + .put(bytes); + } else { // copy buffered number of bytes repeatedly + byte[] bytes = new byte[BUFFER_SIZE]; + for (long offset = 0; offset < size; offset += BUFFER_SIZE) { + int length = (int) Math.min(size - offset, BUFFER_SIZE); + input.readBytes(bytes, 0, length); + outputPointer + .asSlice(offset, length) + .asByteBuffer() + .order(ByteOrder.nativeOrder()) + .put(bytes, 0, length); + } + } return numItems; } @@ -274,7 +313,7 @@ private static int readBytes( LibFaissC.class, "writeBytes", MethodType.methodType( - int.class, IndexOutput.class, MemorySegment.class, int.class, int.class)); + long.class, IndexOutput.class, MemorySegment.class, long.class, long.class)); READ_BYTES_HANDLE = MethodHandles.lookup() @@ -282,23 +321,26 @@ private static int readBytes( LibFaissC.class, "readBytes", MethodType.methodType( - int.class, IndexInput.class, MemorySegment.class, int.class, int.class)); + long.class, IndexInput.class, MemorySegment.class, long.class, long.class)); } catch (NoSuchMethodException | IllegalAccessException e) { throw new RuntimeException(e); } } private static final MethodHandle CUSTOM_IO_WRITER_NEW = - getDowncallHandle("faiss_CustomIOWriter_new", JAVA_INT, ADDRESS, ADDRESS); + getDowncallHandle( + "faiss_CustomIOWriter_new", FunctionDescriptor.of(JAVA_INT, ADDRESS, ADDRESS)); private static final MethodHandle WRITE_INDEX_CUSTOM = - 
getDowncallHandle("faiss_write_index_custom", JAVA_INT, ADDRESS, ADDRESS, JAVA_INT); + getDowncallHandle( + "faiss_write_index_custom", FunctionDescriptor.of(JAVA_INT, ADDRESS, ADDRESS, JAVA_INT)); public static void indexWrite(MemorySegment indexPointer, IndexOutput output, int ioFlags) { try (Arena temp = Arena.ofConfined()) { MethodHandle writerHandle = WRITE_BYTES_HANDLE.bindTo(output); MemorySegment writerStub = - getUpcallStub(temp, writerHandle, JAVA_INT, ADDRESS, JAVA_INT, JAVA_INT); + getUpcallStub( + temp, writerHandle, FunctionDescriptor.of(JAVA_LONG, ADDRESS, JAVA_LONG, JAVA_LONG)); MemorySegment pointer = temp.allocate(ADDRESS); callAndHandleError(CUSTOM_IO_WRITER_NEW, pointer, writerStub); @@ -313,16 +355,19 @@ public static void indexWrite(MemorySegment indexPointer, IndexOutput output, in } private static final MethodHandle CUSTOM_IO_READER_NEW = - getDowncallHandle("faiss_CustomIOReader_new", JAVA_INT, ADDRESS, ADDRESS); + getDowncallHandle( + "faiss_CustomIOReader_new", FunctionDescriptor.of(JAVA_INT, ADDRESS, ADDRESS)); private static final MethodHandle READ_INDEX_CUSTOM = - getDowncallHandle("faiss_read_index_custom", JAVA_INT, ADDRESS, JAVA_INT, ADDRESS); + getDowncallHandle( + "faiss_read_index_custom", FunctionDescriptor.of(JAVA_INT, ADDRESS, JAVA_INT, ADDRESS)); public static MemorySegment indexRead(IndexInput input, int ioFlags) { try (Arena temp = Arena.ofConfined()) { MethodHandle readerHandle = READ_BYTES_HANDLE.bindTo(input); MemorySegment readerStub = - getUpcallStub(temp, readerHandle, JAVA_INT, ADDRESS, JAVA_INT, JAVA_INT); + getUpcallStub( + temp, readerHandle, FunctionDescriptor.of(JAVA_LONG, ADDRESS, JAVA_LONG, JAVA_LONG)); MemorySegment pointer = temp.allocate(ADDRESS); callAndHandleError(CUSTOM_IO_READER_NEW, pointer, readerStub); @@ -339,19 +384,15 @@ public static MemorySegment indexRead(IndexInput input, int ioFlags) { private static final MethodHandle INDEX_SEARCH = getDowncallHandle( - "faiss_Index_search", JAVA_INT, 
ADDRESS, JAVA_LONG, ADDRESS, JAVA_LONG, ADDRESS, ADDRESS); + "faiss_Index_search", + FunctionDescriptor.of( + JAVA_INT, ADDRESS, JAVA_LONG, ADDRESS, JAVA_LONG, ADDRESS, ADDRESS)); private static final MethodHandle INDEX_SEARCH_WITH_PARAMS = getDowncallHandle( "faiss_Index_search_with_params", - JAVA_INT, - ADDRESS, - JAVA_LONG, - ADDRESS, - JAVA_LONG, - ADDRESS, - ADDRESS, - ADDRESS); + FunctionDescriptor.of( + JAVA_INT, ADDRESS, JAVA_LONG, ADDRESS, JAVA_LONG, ADDRESS, ADDRESS, ADDRESS)); public static void indexSearch( MemorySegment indexPointer, @@ -452,31 +493,20 @@ public static void indexSearch( } } - private static final MethodHandle GET_LAST_ERROR = - getDowncallHandle("faiss_get_last_error", ADDRESS); - - private static int callAndGetInt(MethodHandle handle, Object... args) { + @SuppressWarnings("unchecked") + private static T call(MethodHandle handle, Object... args) { try { - return (int) handle.invokeWithArguments(args); - } catch (Throwable e) { - throw new RuntimeException(e); - } - } - - private static String callAndGetString(MethodHandle handle, Object... args) { - try { - MemorySegment segment = (MemorySegment) handle.invokeWithArguments(args); - return segment.reinterpret(Long.MAX_VALUE).getString(0); + return (T) handle.invokeWithArguments(args); } catch (Throwable e) { throw new RuntimeException(e); } } private static void callAndHandleError(MethodHandle handle, Object... args) { - int returnCode = callAndGetInt(handle, args); + int returnCode = call(handle, args); if (returnCode < 0) { - String error = callAndGetString(GET_LAST_ERROR); - throw new FaissException(error); + // TODO: Surface actual exception in a thread-safe manner? + throw new FaissException(returnCode); } } @@ -486,8 +516,33 @@ private static void callAndHandleError(MethodHandle handle, Object... 
args) { * @lucene.experimental */ public static class FaissException extends RuntimeException { - public FaissException(String message) { - super(message); + // See error codes defined in c_api/error_c.h + enum ErrorCode { + /// No error + OK(0), + /// Any exception other than Faiss or standard C++ library exceptions + UNKNOWN_EXCEPT(-1), + /// Faiss library exception + FAISS_EXCEPT(-2), + /// Standard C++ library exception + STD_EXCEPT(-4); + + private final int code; + + ErrorCode(int code) { + this.code = code; + } + + static ErrorCode fromCode(int code) { + return Arrays.stream(ErrorCode.values()) + .filter(errorCode -> errorCode.code == code) + .findFirst() + .orElseThrow(); + } + } + + public FaissException(int code) { + super(String.format(Locale.ROOT, "Faiss library ran into %s", ErrorCode.fromCode(code))); } } } diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/faiss/package-info.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/faiss/package-info.java index bb8cbd997434..319c904d4030 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/faiss/package-info.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/faiss/package-info.java @@ -15,9 +15,34 @@ * limitations under the License. */ /** - * Provides a Faiss-based vector codec via {@link + * Provides a Faiss-based vector format via {@link * org.apache.lucene.sandbox.codecs.faiss.FaissKnnVectorsFormat}. * + *

    To use this format: Install pytorch/faiss-cpu v{@value + * org.apache.lucene.sandbox.codecs.faiss.LibFaissC#LIBRARY_VERSION} from Conda and place its shared libraries (including + * dependencies) in a directory listed in the {@code $LD_LIBRARY_PATH} environment variable or the {@code -Djava.library.path} + * JVM argument. + * + *

    Important: Ensure that the license terms of the Conda distribution and channels you use are + * acceptable for your use case. The pytorch and conda-forge channels are community-maintained, with + * permissive licenses. + * + *

    Sample setup: + * + *

      + *
    • Install micromamba (an open-source Conda + * package manager) + *
    • Install dependencies using {@code micromamba create -n faiss-env -c pytorch -c conda-forge + * -y faiss-cpu=}{@value org.apache.lucene.sandbox.codecs.faiss.LibFaissC#LIBRARY_VERSION} + *
    • Activate environment using {@code micromamba activate faiss-env} + *
    • Add shared libraries to runtime using {@code export LD_LIBRARY_PATH=$CONDA_PREFIX/lib} + *
    • And you're good to go! (add the {@code -Dtests.faiss.run=true} JVM argument to ensure Faiss + * tests are run) + *
    + * * @lucene.experimental */ package org.apache.lucene.sandbox.codecs.faiss; From 1f38d8dcfa2a8f307992e902a71ea9c7a9a81973 Mon Sep 17 00:00:00 2001 From: Kaival Parikh Date: Tue, 17 Jun 2025 20:23:55 +0000 Subject: [PATCH 17/17] Misc fixes - Use conda-incubator/setup-miniconda to install Mamba - Improve some documentation --- .../workflows/run-special-checks-sandbox.yml | 27 +++++++++------- .../codecs/faiss/FaissKnnVectorsFormat.java | 31 ++++++++++++++++--- .../codecs/faiss/FaissKnnVectorsReader.java | 1 + .../sandbox/codecs/faiss/package-info.java | 7 +++-- 4 files changed, 49 insertions(+), 17 deletions(-) diff --git a/.github/workflows/run-special-checks-sandbox.yml b/.github/workflows/run-special-checks-sandbox.yml index 8c21a7ab0e40..1d18b3a060fe 100644 --- a/.github/workflows/run-special-checks-sandbox.yml +++ b/.github/workflows/run-special-checks-sandbox.yml @@ -26,16 +26,18 @@ jobs: runs-on: ${{ matrix.os }} steps: - - name: Install Faiss - uses: mamba-org/setup-micromamba@b09ef9b599704322748535812ca03efb2625677b #v2.0.5 - id: setup + - name: Install Mamba + uses: conda-incubator/setup-miniconda@835234971496cad1653abb28a638a281cf32541f #v3.2.0 with: - environment-name: faiss-env + miniforge-version: 'latest' + auto-activate-base: 'false' + activate-environment: 'faiss-env' # TODO: Use only conda-forge if possible, see https://github.com/conda-forge/faiss-split-feedstock/pull/88 - create-args: >- - -c pytorch - -c conda-forge - faiss-cpu=${{ matrix.faiss-version }} + channels: 'pytorch,conda-forge' + conda-remove-defaults: 'true' + + - name: Install Faiss + run: mamba install faiss-cpu=${{ matrix.faiss-version }} - name: Checkout Lucene uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 @@ -44,9 +46,12 @@ jobs: uses: ./.github/actions/prepare-for-build - name: Run tests for Faiss codec - env: - LD_LIBRARY_PATH: ${{ steps.setup.outputs.environment-path }}/lib - run: ./gradlew -p lucene/sandbox -Dtests.faiss.run=true test --tests 
"org.apache.lucene.sandbox.codecs.faiss.*" + run: > + LD_LIBRARY_PATH=$CONDA_PREFIX/lib + ./gradlew -p lucene/sandbox + -Dtests.faiss.run=true + test + --tests "org.apache.lucene.sandbox.codecs.faiss.*" defaults: run: diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/faiss/FaissKnnVectorsFormat.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/faiss/FaissKnnVectorsFormat.java index 92f0d5b0a8d0..83beae607dc5 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/faiss/FaissKnnVectorsFormat.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/faiss/FaissKnnVectorsFormat.java @@ -16,6 +16,9 @@ */ package org.apache.lucene.sandbox.codecs.faiss; +import static org.apache.lucene.util.hnsw.HnswGraphBuilder.DEFAULT_BEAM_WIDTH; +import static org.apache.lucene.util.hnsw.HnswGraphBuilder.DEFAULT_MAX_CONN; + import java.io.IOException; import java.util.Locale; import org.apache.lucene.codecs.KnnVectorsFormat; @@ -28,10 +31,14 @@ import org.apache.lucene.index.SegmentWriteState; /** - * A format which uses Faiss to create and - * search vector indexes, using {@link LibFaissC} to interact with the native library. + * A Faiss-based format to create and search vector indexes, using {@link LibFaissC} to interact + * with the native library. * - *

    TODO: There is no guarantee of backwards compatibility! + *

    The Faiss index is configured using its flexible index factory, which + * allows creating arbitrary indexes by "describing" them. These indexes can be tuned by setting + * relevant parameters. * *

    A separate Faiss index is created per-segment, and uses the following files: * @@ -46,6 +53,8 @@ * href="https://github.com/facebookresearch/faiss/wiki/Threads-and-asynchronous-calls">internal * threading. * + *

    TODO: There is no guarantee of backwards compatibility! + * * @lucene.experimental */ public final class FaissKnnVectorsFormat extends KnnVectorsFormat { @@ -61,10 +70,24 @@ public final class FaissKnnVectorsFormat extends KnnVectorsFormat { private final String indexParams; private final FlatVectorsFormat rawVectorsFormat; + /** + * Constructs an HNSW-based format using default {@code maxConn}={@value + * org.apache.lucene.util.hnsw.HnswGraphBuilder#DEFAULT_MAX_CONN} and {@code beamWidth}={@value + * org.apache.lucene.util.hnsw.HnswGraphBuilder#DEFAULT_BEAM_WIDTH}. + */ public FaissKnnVectorsFormat() { - this("IDMap,HNSW32", "efConstruction=200"); + this( + String.format(Locale.ROOT, "IDMap,HNSW%d", DEFAULT_MAX_CONN), + String.format(Locale.ROOT, "efConstruction=%d", DEFAULT_BEAM_WIDTH)); } + /** + * Constructs a format using the specified index factory string and index parameters (see class + * docs for more information). + * + * @param description the index factory string to initialize Faiss indexes. + * @param indexParams the index params to set on Faiss indexes. 
+ */ public FaissKnnVectorsFormat(String description, String indexParams) { super(NAME); this.description = description; diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/faiss/FaissKnnVectorsReader.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/faiss/FaissKnnVectorsReader.java index a85f2b53d4bb..42a95145fbd4 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/faiss/FaissKnnVectorsReader.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/faiss/FaissKnnVectorsReader.java @@ -169,6 +169,7 @@ private static IndexEntry loadField(IndexInput data, Arena arena, FieldMeta fiel @Override public void checkIntegrity() throws IOException { rawVectorsReader.checkIntegrity(); + // TODO: Evaluate if we need an explicit check for validity of Faiss indexes CodecUtil.checksumEntireFile(data); } diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/faiss/package-info.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/faiss/package-info.java index 319c904d4030..e63fa3070f96 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/faiss/package-info.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/faiss/package-info.java @@ -15,7 +15,10 @@ * limitations under the License. */ /** - * Provides a Faiss-based vector format via {@link + * Faiss is "a library for efficient + * similarity search and clustering of dense vectors", with support for various vector + * transforms, indexing algorithms, quantization techniques, etc. This package provides a pluggable + * Faiss-based format to perform vector searches in Lucene, via {@link * org.apache.lucene.sandbox.codecs.faiss.FaissKnnVectorsFormat}. * *

    To use this format: Install *

  • Install micromamba (an open-source Conda - * package manager) + * package manager) or similar *
  • Install dependencies using {@code micromamba create -n faiss-env -c pytorch -c conda-forge * -y faiss-cpu=}{@value org.apache.lucene.sandbox.codecs.faiss.LibFaissC#LIBRARY_VERSION} *
  • Activate environment using {@code micromamba activate faiss-env}