diff --git a/build-tools/build-infra/build.gradle b/build-tools/build-infra/build.gradle index 5cb1426cba97..34d71f7509d3 100644 --- a/build-tools/build-infra/build.gradle +++ b/build-tools/build-infra/build.gradle @@ -22,6 +22,7 @@ plugins { } repositories { + mavenLocal() mavenCentral() } diff --git a/gradle/globals.gradle b/gradle/globals.gradle index bcab6461ea91..25bfddc9bebf 100644 --- a/gradle/globals.gradle +++ b/gradle/globals.gradle @@ -22,6 +22,7 @@ allprojects { // Repositories to fetch dependencies from. repositories { + mavenLocal() mavenCentral() } diff --git a/gradle/testing/defaults-tests.gradle b/gradle/testing/defaults-tests.gradle index 14e64647d667..b636162ea96d 100644 --- a/gradle/testing/defaults-tests.gradle +++ b/gradle/testing/defaults-tests.gradle @@ -143,7 +143,7 @@ allprojects { ':lucene:codecs', ":lucene:distribution.tests", ":lucene:test-framework" - ] ? 'ALL-UNNAMED' : 'org.apache.lucene.core') + ] ? 'ALL-UNNAMED' : 'org.apache.lucene.core,com.nvidia.cuvs') // TODO: make this sandbox only def loggingConfigFile = layout.projectDirectory.file("${resources}/logging.properties") def tempDir = layout.projectDirectory.dir(testsTmpDir.toString()) diff --git a/gradle/testing/randomization/policies/tests.policy b/gradle/testing/randomization/policies/tests.policy index f8e09ba03661..41cb5d60e44e 100644 --- a/gradle/testing/randomization/policies/tests.policy +++ b/gradle/testing/randomization/policies/tests.policy @@ -80,6 +80,12 @@ grant { permission java.io.FilePermission "${hunspell.corpora}${/}-", "read"; permission java.io.FilePermission "${hunspell.dictionaries}", "read"; permission java.io.FilePermission "${hunspell.dictionaries}${/}-", "read"; + + // TODO: these are just temporary to allow testing with cuvs-java + permission java.lang.RuntimePermission "getenv.CUVS_JAVA_SO_PATH"; + permission java.io.FilePermission "${/}-", "read"; + // For temporary files to communicate with cuvs + permission java.io.FilePermission "${/}tmp${/}-", 
"write,delete"; }; // Permissions for jacoco code coverage diff --git a/lucene/licenses/commons-LICENSE-ASL.txt b/lucene/licenses/commons-LICENSE-ASL.txt new file mode 100644 index 000000000000..d64569567334 --- /dev/null +++ b/lucene/licenses/commons-LICENSE-ASL.txt @@ -0,0 +1,202 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). 
+ + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. 
Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative 
Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. 
Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. 
+ + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/lucene/licenses/commons-NOTICE.txt b/lucene/licenses/commons-NOTICE.txt new file mode 100644 index 000000000000..554991d39bcf --- /dev/null +++ b/lucene/licenses/commons-NOTICE.txt @@ -0,0 +1,197 @@ +Apache Lucene +Copyright 2001-2025 The Apache Software Foundation + +This product includes software developed at +The Apache Software Foundation (http://www.apache.org/). + +Includes software from other Apache Software Foundation projects, +including, but not limited to: + - Apache Jakarta Regexp + - Apache Commons + - Apache Xerces + +ICU4J, (under analysis/icu) is licensed under an MIT styles license +and Copyright (c) 1995-2008 International Business Machines Corporation and others + +Some data files (under analysis/icu/src/data) are derived from Unicode data such +as the Unicode Character Database. 
See http://unicode.org/copyright.html for more +details. + +Brics Automaton (under core/src/java/org/apache/lucene/util/automaton) is +BSD-licensed, created by Anders Møller. See http://www.brics.dk/automaton/ + +The levenshtein automata tables (under core/src/java/org/apache/lucene/util/automaton) were +automatically generated with the moman/finenight FSA library, created by +Jean-Philippe Barrette-LaPierre. This library is available under an MIT license, +see http://sites.google.com/site/rrettesite/moman and +http://bitbucket.org/jpbarrette/moman/overview/ + +The class org.apache.lucene.util.WeakIdentityMap was derived from +the Apache CXF project and is Apache License 2.0. + +The class org.apache.lucene.util.compress.LZ4 is a Java rewrite of the LZ4 +compression library (https://github.com/lz4/lz4/tree/dev/lib) that is licensed +under the 2-clause BSD license. +(https://opensource.org/licenses/bsd-license.php) + +The Google Code Prettify is Apache License 2.0. +See http://code.google.com/p/google-code-prettify/ + +This product includes code (JaspellTernarySearchTrie) from Java Spelling Checkin +g Package (jaspell): http://jaspell.sourceforge.net/ +License: The BSD License (http://www.opensource.org/licenses/bsd-license.php) + +The snowball stemmers in + analysis/common/src/java/net/sf/snowball +were developed by Martin Porter and Richard Boulton. +The snowball stopword lists in + analysis/common/src/resources/org/apache/lucene/analysis/snowball +were developed by Martin Porter and Richard Boulton. +The full snowball package is available from + https://snowballstem.org/ + +The KStem stemmer in + analysis/common/src/org/apache/lucene/analysis/en +was developed by Bob Krovetz and Sergio Guzman-Lara (CIIR-UMass Amherst) +under the BSD-license. + +The Arabic,Persian,Romanian,Bulgarian, Hindi and Bengali analyzers (common) come with a default +stopword list that is BSD-licensed created by Jacques Savoy. 
These files reside in: +analysis/common/src/resources/org/apache/lucene/analysis/ar/stopwords.txt, +analysis/common/src/resources/org/apache/lucene/analysis/fa/stopwords.txt, +analysis/common/src/resources/org/apache/lucene/analysis/ro/stopwords.txt, +analysis/common/src/resources/org/apache/lucene/analysis/bg/stopwords.txt, +analysis/common/src/resources/org/apache/lucene/analysis/hi/stopwords.txt, +analysis/common/src/resources/org/apache/lucene/analysis/bn/stopwords.txt +See http://members.unine.ch/jacques.savoy/clef/index.html. + +The German,Spanish,Finnish,French,Hungarian,Italian,Portuguese,Russian and Swedish light stemmers +(common) are based on BSD-licensed reference implementations created by Jacques Savoy and +Ljiljana Dolamic. These files reside in: +analysis/common/src/java/org/apache/lucene/analysis/de/GermanLightStemmer.java +analysis/common/src/java/org/apache/lucene/analysis/de/GermanMinimalStemmer.java +analysis/common/src/java/org/apache/lucene/analysis/es/SpanishLightStemmer.java +analysis/common/src/java/org/apache/lucene/analysis/fi/FinnishLightStemmer.java +analysis/common/src/java/org/apache/lucene/analysis/fr/FrenchLightStemmer.java +analysis/common/src/java/org/apache/lucene/analysis/fr/FrenchMinimalStemmer.java +analysis/common/src/java/org/apache/lucene/analysis/hu/HungarianLightStemmer.java +analysis/common/src/java/org/apache/lucene/analysis/it/ItalianLightStemmer.java +analysis/common/src/java/org/apache/lucene/analysis/pt/PortugueseLightStemmer.java +analysis/common/src/java/org/apache/lucene/analysis/ru/RussianLightStemmer.java +analysis/common/src/java/org/apache/lucene/analysis/sv/SwedishLightStemmer.java + +The Stempel analyzer (stempel) includes BSD-licensed software developed +by the Egothor project http://egothor.sf.net/, created by Leo Galambos, Martin Kvapil, +and Edmond Nolan. + +The Polish analyzer (stempel) comes with a default +stopword list that is BSD-licensed created by the Carrot2 project. 
The file resides +in stempel/src/resources/org/apache/lucene/analysis/pl/stopwords.txt. +See https://github.com/carrot2/carrot2. + +The SmartChineseAnalyzer source code (smartcn) was +provided by Xiaoping Gao and copyright 2009 by www.imdict.net. + +WordBreakTestUnicode_*.java (under modules/analysis/common/src/test/) +is derived from Unicode data such as the Unicode Character Database. +See http://unicode.org/copyright.html for more details. + +The Morfologik analyzer (morfologik) includes BSD-licensed software +developed by Dawid Weiss and Marcin Miłkowski +(https://github.com/morfologik/morfologik-stemming) and uses +data from the BSD-licensed dictionary of Polish (SGJP, http://sgjp.pl/morfeusz/). + +=========================================================================== +Kuromoji Japanese Morphological Analyzer - Apache Lucene Integration +=========================================================================== + +This software includes a binary and/or source version of data from + + mecab-ipadic-2.7.0-20070801 + +which can be obtained from + + http://atilika.com/releases/mecab-ipadic/mecab-ipadic-2.7.0-20070801.tar.gz + +or + + http://jaist.dl.sourceforge.net/project/mecab/mecab-ipadic/2.7.0-20070801/mecab-ipadic-2.7.0-20070801.tar.gz + +=========================================================================== +mecab-ipadic-2.7.0-20070801 Notice +=========================================================================== + +Nara Institute of Science and Technology (NAIST), +the copyright holders, disclaims all warranties with regard to this +software, including all implied warranties of merchantability and +fitness, in no event shall NAIST be liable for +any special, indirect or consequential damages or any damages +whatsoever resulting from loss of use, data or profits, whether in an +action of contract, negligence or other tortuous action, arising out +of or in connection with the use or performance of this software. 
+ +A large portion of the dictionary entries +originate from ICOT Free Software. The following conditions for ICOT +Free Software applies to the current dictionary as well. + +Each User may also freely distribute the Program, whether in its +original form or modified, to any third party or parties, PROVIDED +that the provisions of Section 3 ("NO WARRANTY") will ALWAYS appear +on, or be attached to, the Program, which is distributed substantially +in the same form as set out herein and that such intended +distribution, if actually made, will neither violate or otherwise +contravene any of the laws and regulations of the countries having +jurisdiction over the User or the intended distribution itself. + +NO WARRANTY + +The program was produced on an experimental basis in the course of the +research and development conducted during the project and is provided +to users as so produced on an experimental basis. Accordingly, the +program is provided without any warranty whatsoever, whether express, +implied, statutory or otherwise. The term "warranty" used herein +includes, but is not limited to, any warranty of the quality, +performance, merchantability and fitness for a particular purpose of +the program and the nonexistence of any infringement or violation of +any right of any third party. + +Each user of the program will agree and understand, and be deemed to +have agreed and understood, that there is no warranty whatsoever for +the program and, accordingly, the entire risk arising from or +otherwise connected with the program is assumed by the user. 
+ +Therefore, neither ICOT, the copyright holder, or any other +organization that participated in or was otherwise related to the +development of the program and their respective officials, directors, +officers and other employees shall be held liable for any and all +damages, including, without limitation, general, special, incidental +and consequential damages, arising out of or otherwise in connection +with the use or inability to use the program or any product, material +or result produced or otherwise obtained by using the program, +regardless of whether they have been advised of, or otherwise had +knowledge of, the possibility of such damages at any time during the +project or thereafter. Each user will be deemed to have agreed to the +foregoing by his or her commencement of use of the program. The term +"use" as used herein includes, but is not limited to, the use, +modification, copying and distribution of the program and the +production of secondary products from the program. + +In the case where the program, whether in its original form or +modified, was distributed or delivered to or received by a user from +any person, organization or entity other than ICOT, unless it makes or +grants independently of ICOT any specific warranty to the user in +writing, such person, organization or entity, will also be exempted +from and not be held liable to the user for any such damages as noted +above as far as the program is concerned. 
+ +=========================================================================== +Nori Korean Morphological Analyzer - Apache Lucene Integration +=========================================================================== + +This software includes a binary and/or source version of data from + + mecab-ko-dic-2.1.1-20180720 + +which can be obtained from + + https://bitbucket.org/eunjeon/mecab-ko-dic/downloads/mecab-ko-dic-2.1.1-20180720.tar.gz diff --git a/lucene/licenses/commons-lang3-3.17.0.jar.sha1 b/lucene/licenses/commons-lang3-3.17.0.jar.sha1 new file mode 100644 index 000000000000..f64174593b1c --- /dev/null +++ b/lucene/licenses/commons-lang3-3.17.0.jar.sha1 @@ -0,0 +1 @@ +b17d2136f0460dcc0d2016ceefca8723bdf4ee70 diff --git a/lucene/licenses/cuvs-java-25.02.0.jar.sha1 b/lucene/licenses/cuvs-java-25.02.0.jar.sha1 new file mode 100644 index 000000000000..ccb02e86aa8c --- /dev/null +++ b/lucene/licenses/cuvs-java-25.02.0.jar.sha1 @@ -0,0 +1 @@ +0086126edbd145e5d0be65e6157e96e3e8a2ebca diff --git a/lucene/licenses/cuvs-java-LICENSE-ASL.txt b/lucene/licenses/cuvs-java-LICENSE-ASL.txt new file mode 100644 index 000000000000..d64569567334 --- /dev/null +++ b/lucene/licenses/cuvs-java-LICENSE-ASL.txt @@ -0,0 +1,202 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. 
For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. 
For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. 
If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. 
You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. 
Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. 
+ + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/lucene/licenses/cuvs-java-NOTICE.txt b/lucene/licenses/cuvs-java-NOTICE.txt new file mode 100644 index 000000000000..554991d39bcf --- /dev/null +++ b/lucene/licenses/cuvs-java-NOTICE.txt @@ -0,0 +1,197 @@ +Apache Lucene +Copyright 2001-2025 The Apache Software Foundation + +This product includes software developed at +The Apache Software Foundation (http://www.apache.org/). + +Includes software from other Apache Software Foundation projects, +including, but not limited to: + - Apache Jakarta Regexp + - Apache Commons + - Apache Xerces + +ICU4J, (under analysis/icu) is licensed under an MIT styles license +and Copyright (c) 1995-2008 International Business Machines Corporation and others + +Some data files (under analysis/icu/src/data) are derived from Unicode data such +as the Unicode Character Database. See http://unicode.org/copyright.html for more +details. + +Brics Automaton (under core/src/java/org/apache/lucene/util/automaton) is +BSD-licensed, created by Anders Møller. See http://www.brics.dk/automaton/ + +The levenshtein automata tables (under core/src/java/org/apache/lucene/util/automaton) were +automatically generated with the moman/finenight FSA library, created by +Jean-Philippe Barrette-LaPierre. 
This library is available under an MIT license, +see http://sites.google.com/site/rrettesite/moman and +http://bitbucket.org/jpbarrette/moman/overview/ + +The class org.apache.lucene.util.WeakIdentityMap was derived from +the Apache CXF project and is Apache License 2.0. + +The class org.apache.lucene.util.compress.LZ4 is a Java rewrite of the LZ4 +compression library (https://github.com/lz4/lz4/tree/dev/lib) that is licensed +under the 2-clause BSD license. +(https://opensource.org/licenses/bsd-license.php) + +The Google Code Prettify is Apache License 2.0. +See http://code.google.com/p/google-code-prettify/ + +This product includes code (JaspellTernarySearchTrie) from Java Spelling Checkin +g Package (jaspell): http://jaspell.sourceforge.net/ +License: The BSD License (http://www.opensource.org/licenses/bsd-license.php) + +The snowball stemmers in + analysis/common/src/java/net/sf/snowball +were developed by Martin Porter and Richard Boulton. +The snowball stopword lists in + analysis/common/src/resources/org/apache/lucene/analysis/snowball +were developed by Martin Porter and Richard Boulton. +The full snowball package is available from + https://snowballstem.org/ + +The KStem stemmer in + analysis/common/src/org/apache/lucene/analysis/en +was developed by Bob Krovetz and Sergio Guzman-Lara (CIIR-UMass Amherst) +under the BSD-license. + +The Arabic,Persian,Romanian,Bulgarian, Hindi and Bengali analyzers (common) come with a default +stopword list that is BSD-licensed created by Jacques Savoy. 
These files reside in: +analysis/common/src/resources/org/apache/lucene/analysis/ar/stopwords.txt, +analysis/common/src/resources/org/apache/lucene/analysis/fa/stopwords.txt, +analysis/common/src/resources/org/apache/lucene/analysis/ro/stopwords.txt, +analysis/common/src/resources/org/apache/lucene/analysis/bg/stopwords.txt, +analysis/common/src/resources/org/apache/lucene/analysis/hi/stopwords.txt, +analysis/common/src/resources/org/apache/lucene/analysis/bn/stopwords.txt +See http://members.unine.ch/jacques.savoy/clef/index.html. + +The German,Spanish,Finnish,French,Hungarian,Italian,Portuguese,Russian and Swedish light stemmers +(common) are based on BSD-licensed reference implementations created by Jacques Savoy and +Ljiljana Dolamic. These files reside in: +analysis/common/src/java/org/apache/lucene/analysis/de/GermanLightStemmer.java +analysis/common/src/java/org/apache/lucene/analysis/de/GermanMinimalStemmer.java +analysis/common/src/java/org/apache/lucene/analysis/es/SpanishLightStemmer.java +analysis/common/src/java/org/apache/lucene/analysis/fi/FinnishLightStemmer.java +analysis/common/src/java/org/apache/lucene/analysis/fr/FrenchLightStemmer.java +analysis/common/src/java/org/apache/lucene/analysis/fr/FrenchMinimalStemmer.java +analysis/common/src/java/org/apache/lucene/analysis/hu/HungarianLightStemmer.java +analysis/common/src/java/org/apache/lucene/analysis/it/ItalianLightStemmer.java +analysis/common/src/java/org/apache/lucene/analysis/pt/PortugueseLightStemmer.java +analysis/common/src/java/org/apache/lucene/analysis/ru/RussianLightStemmer.java +analysis/common/src/java/org/apache/lucene/analysis/sv/SwedishLightStemmer.java + +The Stempel analyzer (stempel) includes BSD-licensed software developed +by the Egothor project http://egothor.sf.net/, created by Leo Galambos, Martin Kvapil, +and Edmond Nolan. + +The Polish analyzer (stempel) comes with a default +stopword list that is BSD-licensed created by the Carrot2 project. 
The file resides +in stempel/src/resources/org/apache/lucene/analysis/pl/stopwords.txt. +See https://github.com/carrot2/carrot2. + +The SmartChineseAnalyzer source code (smartcn) was +provided by Xiaoping Gao and copyright 2009 by www.imdict.net. + +WordBreakTestUnicode_*.java (under modules/analysis/common/src/test/) +is derived from Unicode data such as the Unicode Character Database. +See http://unicode.org/copyright.html for more details. + +The Morfologik analyzer (morfologik) includes BSD-licensed software +developed by Dawid Weiss and Marcin Miłkowski +(https://github.com/morfologik/morfologik-stemming) and uses +data from the BSD-licensed dictionary of Polish (SGJP, http://sgjp.pl/morfeusz/). + +=========================================================================== +Kuromoji Japanese Morphological Analyzer - Apache Lucene Integration +=========================================================================== + +This software includes a binary and/or source version of data from + + mecab-ipadic-2.7.0-20070801 + +which can be obtained from + + http://atilika.com/releases/mecab-ipadic/mecab-ipadic-2.7.0-20070801.tar.gz + +or + + http://jaist.dl.sourceforge.net/project/mecab/mecab-ipadic/2.7.0-20070801/mecab-ipadic-2.7.0-20070801.tar.gz + +=========================================================================== +mecab-ipadic-2.7.0-20070801 Notice +=========================================================================== + +Nara Institute of Science and Technology (NAIST), +the copyright holders, disclaims all warranties with regard to this +software, including all implied warranties of merchantability and +fitness, in no event shall NAIST be liable for +any special, indirect or consequential damages or any damages +whatsoever resulting from loss of use, data or profits, whether in an +action of contract, negligence or other tortuous action, arising out +of or in connection with the use or performance of this software. 
+ +A large portion of the dictionary entries +originate from ICOT Free Software. The following conditions for ICOT +Free Software applies to the current dictionary as well. + +Each User may also freely distribute the Program, whether in its +original form or modified, to any third party or parties, PROVIDED +that the provisions of Section 3 ("NO WARRANTY") will ALWAYS appear +on, or be attached to, the Program, which is distributed substantially +in the same form as set out herein and that such intended +distribution, if actually made, will neither violate or otherwise +contravene any of the laws and regulations of the countries having +jurisdiction over the User or the intended distribution itself. + +NO WARRANTY + +The program was produced on an experimental basis in the course of the +research and development conducted during the project and is provided +to users as so produced on an experimental basis. Accordingly, the +program is provided without any warranty whatsoever, whether express, +implied, statutory or otherwise. The term "warranty" used herein +includes, but is not limited to, any warranty of the quality, +performance, merchantability and fitness for a particular purpose of +the program and the nonexistence of any infringement or violation of +any right of any third party. + +Each user of the program will agree and understand, and be deemed to +have agreed and understood, that there is no warranty whatsoever for +the program and, accordingly, the entire risk arising from or +otherwise connected with the program is assumed by the user. 
+ +Therefore, neither ICOT, the copyright holder, or any other +organization that participated in or was otherwise related to the +development of the program and their respective officials, directors, +officers and other employees shall be held liable for any and all +damages, including, without limitation, general, special, incidental +and consequential damages, arising out of or otherwise in connection +with the use or inability to use the program or any product, material +or result produced or otherwise obtained by using the program, +regardless of whether they have been advised of, or otherwise had +knowledge of, the possibility of such damages at any time during the +project or thereafter. Each user will be deemed to have agreed to the +foregoing by his or her commencement of use of the program. The term +"use" as used herein includes, but is not limited to, the use, +modification, copying and distribution of the program and the +production of secondary products from the program. + +In the case where the program, whether in its original form or +modified, was distributed or delivered to or received by a user from +any person, organization or entity other than ICOT, unless it makes or +grants independently of ICOT any specific warranty to the user in +writing, such person, organization or entity, will also be exempted +from and not be held liable to the user for any such damages as noted +above as far as the program is concerned. 
+ +=========================================================================== +Nori Korean Morphological Analyzer - Apache Lucene Integration +=========================================================================== + +This software includes a binary and/or source version of data from + + mecab-ko-dic-2.1.1-20180720 + +which can be obtained from + + https://bitbucket.org/eunjeon/mecab-ko-dic/downloads/mecab-ko-dic-2.1.1-20180720.tar.gz diff --git a/lucene/sandbox/build.gradle b/lucene/sandbox/build.gradle index 72762fe1c3d2..6d225fd78ba4 100644 --- a/lucene/sandbox/build.gradle +++ b/lucene/sandbox/build.gradle @@ -19,9 +19,16 @@ apply plugin: 'java-library' description = 'Various third party contributions and new ideas' +repositories { + mavenLocal() +} + + dependencies { moduleApi project(':lucene:core') moduleApi project(':lucene:queries') moduleApi project(':lucene:facet') moduleTestImplementation project(':lucene:test-framework') + moduleImplementation deps.commons.lang3 + moduleImplementation deps.cuvs } diff --git a/lucene/sandbox/src/java/module-info.java b/lucene/sandbox/src/java/module-info.java index f40a05af433a..59e89cfd0bf0 100644 --- a/lucene/sandbox/src/java/module-info.java +++ b/lucene/sandbox/src/java/module-info.java @@ -20,6 +20,8 @@ requires org.apache.lucene.core; requires org.apache.lucene.queries; requires org.apache.lucene.facet; + requires java.logging; + requires com.nvidia.cuvs; exports org.apache.lucene.payloads; exports org.apache.lucene.sandbox.codecs.idversion; @@ -34,7 +36,12 @@ exports org.apache.lucene.sandbox.facet.iterators; exports org.apache.lucene.sandbox.facet.cutters; exports org.apache.lucene.sandbox.facet.labels; + exports org.apache.lucene.sandbox.vectorsearch; provides org.apache.lucene.codecs.PostingsFormat with org.apache.lucene.sandbox.codecs.idversion.IDVersionPostingsFormat; + provides org.apache.lucene.codecs.KnnVectorsFormat with + org.apache.lucene.sandbox.vectorsearch.CuVSVectorsFormat; + provides 
com.nvidia.cuvs.spi.CuVSServiceProvider with + org.apache.lucene.sandbox.vectorsearch.FilterCuVSServiceProvider; } diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSCodec.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSCodec.java new file mode 100644 index 000000000000..c3ddc809c4d3 --- /dev/null +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSCodec.java @@ -0,0 +1,57 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.lucene.sandbox.vectorsearch; + +import com.nvidia.cuvs.LibraryException; +import java.util.logging.Logger; +import org.apache.lucene.codecs.Codec; +import org.apache.lucene.codecs.FilterCodec; +import org.apache.lucene.codecs.KnnVectorsFormat; +import org.apache.lucene.codecs.lucene101.Lucene101Codec; +import org.apache.lucene.sandbox.vectorsearch.CuVSVectorsWriter.IndexType; +import org.apache.lucene.sandbox.vectorsearch.CuVSVectorsWriter.MergeStrategy; + +/** CuVS based codec for GPU based vector search */ +public class CuVSCodec extends FilterCodec { + + public CuVSCodec() { + this("CuVSCodec", new Lucene101Codec()); + } + + public CuVSCodec(String name, Codec delegate) { + super(name, delegate); + KnnVectorsFormat format; + try { + format = new CuVSVectorsFormat(1, 128, 64, MergeStrategy.NON_TRIVIAL_MERGE, IndexType.CAGRA); + setKnnFormat(format); + } catch (LibraryException ex) { + Logger log = Logger.getLogger(CuVSCodec.class.getName()); + log.severe("Couldn't load native library, possible classloader issue. " + ex.getMessage()); + } + } + + KnnVectorsFormat knnFormat = null; + + @Override + public KnnVectorsFormat knnVectorsFormat() { + return knnFormat; + } + + public void setKnnFormat(KnnVectorsFormat format) { + this.knnFormat = format; + } +} diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSFieldWriter.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSFieldWriter.java new file mode 100644 index 000000000000..61b8f0879202 --- /dev/null +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSFieldWriter.java @@ -0,0 +1,80 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.sandbox.vectorsearch; + +import java.io.IOException; +import java.util.List; +import org.apache.lucene.codecs.KnnFieldVectorsWriter; +import org.apache.lucene.codecs.hnsw.FlatFieldVectorsWriter; +import org.apache.lucene.index.DocsWithFieldSet; +import org.apache.lucene.index.FieldInfo; +import org.apache.lucene.util.RamUsageEstimator; + +/** CuVS based fields writer */ +/*package-private*/ class CuVSFieldWriter extends KnnFieldVectorsWriter { + + private static final long SHALLOW_SIZE = + RamUsageEstimator.shallowSizeOfInstance(CuVSFieldWriter.class); + + private final FieldInfo fieldInfo; + private final FlatFieldVectorsWriter flatFieldVectorsWriter; + private int lastDocID = -1; + + public CuVSFieldWriter( + FieldInfo fieldInfo, FlatFieldVectorsWriter flatFieldVectorsWriter) { + this.fieldInfo = fieldInfo; + this.flatFieldVectorsWriter = flatFieldVectorsWriter; + } + + @Override + public void addValue(int docID, float[] vectorValue) throws IOException { + if (docID == lastDocID) { + throw new IllegalArgumentException( + "VectorValuesField \"" + + fieldInfo.name + + "\" appears more than once in this document (only one value is allowed per field)"); + } + flatFieldVectorsWriter.addValue(docID, vectorValue); + } + + List getVectors() { + return flatFieldVectorsWriter.getVectors(); + } + + FieldInfo fieldInfo() { + return fieldInfo; + } + + DocsWithFieldSet 
getDocsWithFieldSet() { + return flatFieldVectorsWriter.getDocsWithFieldSet(); + } + + @Override + public float[] copyValue(float[] vectorValue) { + throw new UnsupportedOperationException(); + } + + @Override + public long ramBytesUsed() { + return SHALLOW_SIZE + flatFieldVectorsWriter.ramBytesUsed(); + } + + @Override + public String toString() { + return "CuVSFieldWriter[field name=" + fieldInfo.name + ", number=" + fieldInfo.number + "]"; + } +} diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSIndex.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSIndex.java new file mode 100644 index 000000000000..d0cfe86d708e --- /dev/null +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSIndex.java @@ -0,0 +1,119 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.lucene.sandbox.vectorsearch; + +import static org.apache.lucene.sandbox.vectorsearch.CuVSVectorsReader.handleThrowable; + +import com.nvidia.cuvs.BruteForceIndex; +import com.nvidia.cuvs.CagraIndex; +import com.nvidia.cuvs.HnswIndex; +import java.io.Closeable; +import java.io.IOException; +import java.util.Objects; + +/** This class holds references to the actual CuVS Index (Cagra, Brute force, etc.) */ +public class CuVSIndex implements Closeable { + private final CagraIndex cagraIndex; + private final BruteForceIndex bruteforceIndex; + private final HnswIndex hnswIndex; + + private int maxDocs; + private String fieldName; + private String segmentName; + private volatile boolean closed; + + public CuVSIndex( + String segmentName, + String fieldName, + CagraIndex cagraIndex, + int maxDocs, + BruteForceIndex bruteforceIndex) { + this.cagraIndex = Objects.requireNonNull(cagraIndex); + this.bruteforceIndex = Objects.requireNonNull(bruteforceIndex); + this.fieldName = Objects.requireNonNull(fieldName); + this.segmentName = Objects.requireNonNull(segmentName); + if (maxDocs < 0) { + throw new IllegalArgumentException("negative maxDocs:" + maxDocs); + } + this.maxDocs = maxDocs; + this.hnswIndex = null; // TODO: + } + + public CuVSIndex(CagraIndex cagraIndex, BruteForceIndex bruteforceIndex, HnswIndex hnswIndex) { + this.cagraIndex = cagraIndex; + this.bruteforceIndex = bruteforceIndex; + this.hnswIndex = hnswIndex; + } + + public CagraIndex getCagraIndex() { + ensureOpen(); + return cagraIndex; + } + + public BruteForceIndex getBruteforceIndex() { + ensureOpen(); + return bruteforceIndex; + } + + public HnswIndex getHNSWIndex() { + ensureOpen(); + return hnswIndex; + } + + public String getFieldName() { + return fieldName; + } + + public String getSegmentName() { + return segmentName; + } + + public int getMaxDocs() { + return maxDocs; + } + + private void ensureOpen() { + if (closed) { + throw new IllegalStateException("index is closed"); + } + 
} + + @Override + public void close() throws IOException { + if (closed) { + return; + } + closed = true; + destroyIndices(); + } + + private void destroyIndices() throws IOException { + try { + if (cagraIndex != null) { + cagraIndex.destroyIndex(); + } + if (bruteforceIndex != null) { + bruteforceIndex.destroyIndex(); + } + if (hnswIndex != null) { + hnswIndex.destroyIndex(); + } + } catch (Throwable t) { + handleThrowable(t); + } + } +} diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSKnnFloatVectorQuery.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSKnnFloatVectorQuery.java new file mode 100644 index 000000000000..2f6c636590ef --- /dev/null +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSKnnFloatVectorQuery.java @@ -0,0 +1,53 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.lucene.sandbox.vectorsearch; + +import java.io.IOException; +import org.apache.lucene.index.LeafReader; +import org.apache.lucene.index.LeafReaderContext; +import org.apache.lucene.search.KnnFloatVectorQuery; +import org.apache.lucene.search.TopDocs; +import org.apache.lucene.search.knn.KnnCollectorManager; +import org.apache.lucene.util.Bits; + +/** Query for CuVS */ +public class CuVSKnnFloatVectorQuery extends KnnFloatVectorQuery { + + private final int iTopK; + private final int searchWidth; + + public CuVSKnnFloatVectorQuery(String field, float[] target, int k, int iTopK, int searchWidth) { + super(field, target, k); + this.iTopK = iTopK; + this.searchWidth = searchWidth; + } + + @Override + protected TopDocs approximateSearch( + LeafReaderContext context, + Bits acceptDocs, + int visitedLimit, + KnnCollectorManager knnCollectorManager) + throws IOException { + + PerLeafCuVSKnnCollector results = new PerLeafCuVSKnnCollector(k, iTopK, searchWidth); + + LeafReader reader = context.reader(); + reader.searchNearestVectors(field, this.getTargetCopy(), results, null); + return results.topDocs(); + } +} diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSSegmentFile.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSSegmentFile.java new file mode 100644 index 000000000000..9b12cdf61012 --- /dev/null +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSSegmentFile.java @@ -0,0 +1,63 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
/**
 * Methods to deal with a CuVS composite file inside a segment.
 *
 * <p>The composite file is a zip stream written with the compression level set to {@link
 * Deflater#NO_COMPRESSION}; every added file becomes one zip entry.
 */
/*package-private*/ class CuVSSegmentFile implements AutoCloseable {
  private final ZipOutputStream zos;

  /** Names of the entries written so far. */
  private final Set<String> filesAdded = new HashSet<>();

  protected Logger log = Logger.getLogger(getClass().getName());

  /**
   * Starts a composite file on top of the given stream.
   *
   * @param out the stream that receives the zip data; it is closed by {@link #close()}
   */
  public CuVSSegmentFile(OutputStream out) {
    zos = new ZipOutputStream(out);
    zos.setLevel(Deflater.NO_COMPRESSION);
  }

  /**
   * Writes one file as a zip entry.
   *
   * @param name the entry name
   * @param bytes the complete content of the entry
   * @throws IOException if the entry cannot be written, for example because an entry with the
   *     same name already exists
   */
  public void addFile(String name, byte[] bytes) throws IOException {
    zos.putNextEntry(new ZipEntry(name));
    zos.write(bytes, 0, bytes.length);
    zos.closeEntry();
    // Only record the name once the entry was written completely.
    filesAdded.add(name);
  }

  /** Returns a read-only view of the names of all entries added so far. */
  public Set<String> getFilesAdded() {
    return Collections.unmodifiableSet(filesAdded);
  }

  /** Finishes the zip stream and closes the underlying output stream. */
  @Override
  public void close() throws IOException {
    zos.close();
  }
}
b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsFormat.java @@ -0,0 +1,162 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.sandbox.vectorsearch; + +import com.nvidia.cuvs.CuVSResources; +import com.nvidia.cuvs.LibraryException; +import java.io.IOException; +import java.util.logging.Logger; +import org.apache.lucene.codecs.KnnVectorsFormat; +import org.apache.lucene.codecs.hnsw.DefaultFlatVectorScorer; +import org.apache.lucene.codecs.hnsw.FlatVectorsFormat; +import org.apache.lucene.codecs.lucene99.Lucene99FlatVectorsFormat; +import org.apache.lucene.index.SegmentReadState; +import org.apache.lucene.index.SegmentWriteState; +import org.apache.lucene.sandbox.vectorsearch.CuVSVectorsWriter.IndexType; +import org.apache.lucene.sandbox.vectorsearch.CuVSVectorsWriter.MergeStrategy; + +/** CuVS based KnnVectorsFormat for GPU acceleration */ +public class CuVSVectorsFormat extends KnnVectorsFormat { + + private static final Logger LOG = Logger.getLogger(CuVSVectorsFormat.class.getName()); + + // TODO: fix Lucene version in name, to the final targeted release, if any + static final String CUVS_META_CODEC_NAME = "Lucene102CuVSVectorsFormatMeta"; + static final String 
CUVS_META_CODEC_EXT = "vemc"; // ""cagmf"; + static final String CUVS_INDEX_CODEC_NAME = "Lucene102CuVSVectorsFormatIndex"; + static final String CUVS_INDEX_EXT = "vcag"; + + static final int VERSION_START = 0; + static final int VERSION_CURRENT = VERSION_START; + + public static final int DEFAULT_WRITER_THREADS = 32; + public static final int DEFAULT_INTERMEDIATE_GRAPH_DEGREE = 128; + public static final int DEFAULT_GRAPH_DEGREE = 64; + public static final MergeStrategy DEFAULT_MERGE_STRATEGY = MergeStrategy.NON_TRIVIAL_MERGE; + public static final IndexType DEFAULT_INDEX_TYPE = IndexType.CAGRA; + + static CuVSResources resources = cuVSResourcesOrNull(); + + /** The format for storing, reading, and merging raw vectors on disk. */ + private static final FlatVectorsFormat flatVectorsFormat = + new Lucene99FlatVectorsFormat(DefaultFlatVectorScorer.INSTANCE); + + final int maxDimensions = 4096; + final int cuvsWriterThreads; + final int intGraphDegree; + final int graphDegree; + final MergeStrategy mergeStrategy; + final CuVSVectorsWriter.IndexType indexType; // the index type to build, when writing + + /** + * Creates a CuVSVectorsFormat, with default values. + * + * @throws LibraryException if the native library fails to load + */ + public CuVSVectorsFormat() { + this( + DEFAULT_WRITER_THREADS, + DEFAULT_INTERMEDIATE_GRAPH_DEGREE, + DEFAULT_GRAPH_DEGREE, + DEFAULT_MERGE_STRATEGY, + DEFAULT_INDEX_TYPE); + } + + /** + * Creates a CuVSVectorsFormat, with the given threads, graph degree, etc. 
+ * + * @throws LibraryException if the native library fails to load + */ + public CuVSVectorsFormat( + int cuvsWriterThreads, + int intGraphDegree, + int graphDegree, + MergeStrategy mergeStrategy, + IndexType indexType) { + super("CuVSVectorsFormat"); + this.mergeStrategy = mergeStrategy; + this.cuvsWriterThreads = cuvsWriterThreads; + this.intGraphDegree = intGraphDegree; + this.graphDegree = graphDegree; + this.indexType = indexType; + } + + private static CuVSResources cuVSResourcesOrNull() { + try { + resources = CuVSResources.create(); + return resources; + } catch (UnsupportedOperationException uoe) { + LOG.warning("cuvs is not supported on this platform or java version: " + uoe.getMessage()); + } catch (Throwable t) { + if (t instanceof ExceptionInInitializerError ex) { + t = ex.getCause(); + } + LOG.warning("Exception occurred during creation of cuvs resources. " + t); + } + return null; + } + + /** Tells whether the platform supports cuvs. */ + public static boolean supported() { + return resources != null; + } + + private static void checkSupported() { + if (!supported()) { + throw new UnsupportedOperationException(); + } + } + + @Override + public CuVSVectorsWriter fieldsWriter(SegmentWriteState state) throws IOException { + checkSupported(); + var flatWriter = flatVectorsFormat.fieldsWriter(state); + return new CuVSVectorsWriter( + state, + cuvsWriterThreads, + intGraphDegree, + graphDegree, + mergeStrategy, + indexType, + resources, + flatWriter); + } + + @Override + public CuVSVectorsReader fieldsReader(SegmentReadState state) throws IOException { + checkSupported(); + var flatReader = flatVectorsFormat.fieldsReader(state); + return new CuVSVectorsReader(state, resources, flatReader); + } + + @Override + public int getMaxDimensions(String fieldName) { + return maxDimensions; + } + + @Override + public String toString() { + StringBuilder sb = new StringBuilder("CuVSVectorsFormat("); + sb.append("cuvsWriterThreads=").append(cuvsWriterThreads); + 
sb.append("intGraphDegree=").append(intGraphDegree); + sb.append("graphDegree=").append(graphDegree); + sb.append("mergeStrategy=").append(mergeStrategy); + sb.append("resources=").append(resources); + sb.append(")"); + return sb.toString(); + } +} diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsReader.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsReader.java new file mode 100644 index 000000000000..cfb59121e36e --- /dev/null +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsReader.java @@ -0,0 +1,487 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+package org.apache.lucene.sandbox.vectorsearch;
+
+import static org.apache.lucene.sandbox.vectorsearch.CuVSVectorsFormat.CUVS_INDEX_CODEC_NAME;
+import static org.apache.lucene.sandbox.vectorsearch.CuVSVectorsFormat.CUVS_INDEX_EXT;
+import static org.apache.lucene.sandbox.vectorsearch.CuVSVectorsFormat.CUVS_META_CODEC_EXT;
+import static org.apache.lucene.sandbox.vectorsearch.CuVSVectorsFormat.CUVS_META_CODEC_NAME;
+import static org.apache.lucene.sandbox.vectorsearch.CuVSVectorsFormat.VERSION_CURRENT;
+import static org.apache.lucene.sandbox.vectorsearch.CuVSVectorsFormat.VERSION_START;
+
+import com.nvidia.cuvs.BruteForceIndex;
+import com.nvidia.cuvs.BruteForceQuery;
+import com.nvidia.cuvs.CagraIndex;
+import com.nvidia.cuvs.CagraQuery;
+import com.nvidia.cuvs.CagraSearchParams;
+import com.nvidia.cuvs.CuVSResources;
+import com.nvidia.cuvs.HnswIndex;
+import com.nvidia.cuvs.HnswIndexParams;
+import java.io.IOException;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Map;
+import java.util.logging.Logger;
+import java.util.stream.Stream;
+import java.util.stream.StreamSupport;
+import org.apache.lucene.codecs.CodecUtil;
+import org.apache.lucene.codecs.KnnVectorsReader;
+import org.apache.lucene.codecs.hnsw.FlatVectorsReader;
+import org.apache.lucene.index.ByteVectorValues;
+import org.apache.lucene.index.CorruptIndexException;
+import org.apache.lucene.index.FieldInfo;
+import org.apache.lucene.index.FieldInfos;
+import org.apache.lucene.index.FloatVectorValues;
+import org.apache.lucene.index.IndexFileNames;
+import org.apache.lucene.index.SegmentReadState;
+import org.apache.lucene.index.VectorEncoding;
+import org.apache.lucene.index.VectorSimilarityFunction;
+import org.apache.lucene.internal.hppc.IntObjectHashMap;
+import org.apache.lucene.search.KnnCollector;
+import org.apache.lucene.store.ChecksumIndexInput;
+import org.apache.lucene.store.DataInput;
+import org.apache.lucene.store.IOContext;
+import org.apache.lucene.store.IndexInput;
+import org.apache.lucene.store.ReadAdvice;
+import org.apache.lucene.util.Bits;
+import org.apache.lucene.util.FixedBitSet;
+import org.apache.lucene.util.IOUtils;
+import org.apache.lucene.util.hnsw.IntToIntFunction;
+
+/**
+ * KnnVectorsReader instance associated with CuVS format.
+ *
+ * <p>On construction the reader parses the per-field metadata file, opens the cuVS index file and
+ * eagerly loads the serialized cuVS indices of every field. Raw float vectors are served by the
+ * wrapped {@link FlatVectorsReader}. Byte vectors are not supported.
+ */
+public class CuVSVectorsReader extends KnnVectorsReader {
+
+  @SuppressWarnings("unused")
+  private static final Logger log = Logger.getLogger(CuVSVectorsReader.class.getName());
+
+  private final CuVSResources resources;
+  private final FlatVectorsReader flatVectorsReader; // for reading the raw vectors
+  private final FieldInfos fieldInfos;
+  private final IntObjectHashMap<FieldEntry> fields; // field number -> metadata entry
+  private final IntObjectHashMap<CuVSIndex> cuvsIndices; // field number -> loaded indices
+  private final IndexInput cuvsIndexInput;
+
+  /**
+   * Opens the metadata and index files of the given segment and loads all cuVS indices.
+   *
+   * <p>If any step fails, this reader is closed (suppressing secondary exceptions) before the
+   * original exception propagates.
+   *
+   * @param state the segment read state naming the directory, segment and suffix
+   * @param resources the cuVS resources handed to the cuVS index builders and search params
+   * @param flatReader the reader used for the raw vectors; closed together with this reader
+   * @throws IOException if a file cannot be opened or fails header/footer validation
+   */
+  public CuVSVectorsReader(
+      SegmentReadState state, CuVSResources resources, FlatVectorsReader flatReader)
+      throws IOException {
+    this.resources = resources;
+    this.flatVectorsReader = flatReader;
+    this.fieldInfos = state.fieldInfos;
+    this.fields = new IntObjectHashMap<>();
+
+    String metaFileName =
+        IndexFileNames.segmentFileName(
+            state.segmentInfo.name, state.segmentSuffix, CUVS_META_CODEC_EXT);
+    boolean success = false;
+    int versionMeta = -1;
+    try (ChecksumIndexInput meta = state.directory.openChecksumInput(metaFileName)) {
+      Throwable priorException = null;
+      try {
+        versionMeta =
+            CodecUtil.checkIndexHeader(
+                meta,
+                CUVS_META_CODEC_NAME,
+                VERSION_START,
+                VERSION_CURRENT,
+                state.segmentInfo.getId(),
+                state.segmentSuffix);
+        readFields(meta);
+      } catch (Throwable exception) {
+        priorException = exception;
+      } finally {
+        // verifies the checksum, or rethrows priorException if reading already failed
+        CodecUtil.checkFooter(meta, priorException);
+      }
+      var ioContext = state.context.withReadAdvice(ReadAdvice.SEQUENTIAL);
+      cuvsIndexInput = openCuVSInput(state, versionMeta, ioContext);
+      cuvsIndices = loadCuVSIndices();
+      success = true;
+    } finally {
+      if (success == false) {
+        IOUtils.closeWhileHandlingException(this);
+      }
+    }
+  }
+
+  /**
+   * Opens the cuVS index file, validates its header against the metadata version and retrieves its
+   * checksum. The input is closed again if any of these checks fails.
+   */
+  private static IndexInput openCuVSInput(
+      SegmentReadState state, int versionMeta, IOContext context) throws IOException {
+    String fileName =
+        IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, CUVS_INDEX_EXT);
+    IndexInput in = state.directory.openInput(fileName, context);
+    boolean success = false;
+    try {
+      int versionVectorData =
+          CodecUtil.checkIndexHeader(
+              in,
+              CUVS_INDEX_CODEC_NAME,
+              VERSION_START,
+              VERSION_CURRENT,
+              state.segmentInfo.getId(),
+              state.segmentSuffix);
+      checkVersion(versionMeta, versionVectorData, in);
+      CodecUtil.retrieveChecksum(in);
+      success = true;
+      return in;
+    } finally {
+      if (success == false) {
+        IOUtils.closeWhileHandlingException(in);
+      }
+    }
+  }
+
+  /** Checks that the dimension stored in the metadata matches the one declared by the field. */
+  private void validateFieldEntry(FieldInfo info, FieldEntry fieldEntry) {
+    int dimension = info.getVectorDimension();
+    if (dimension != fieldEntry.dims()) {
+      throw new IllegalStateException(
+          "Inconsistent vector dimension for field=\""
+              + info.name
+              + "\"; "
+              + dimension
+              + " != "
+              + fieldEntry.dims());
+    }
+  }
+
+  /** Reads field entries from the metadata until the end-of-fields marker (-1) is reached. */
+  private void readFields(ChecksumIndexInput meta) throws IOException {
+    for (int fieldNumber = meta.readInt(); fieldNumber != -1; fieldNumber = meta.readInt()) {
+      FieldInfo info = fieldInfos.fieldInfo(fieldNumber);
+      if (info == null) {
+        throw new CorruptIndexException("Invalid field number: " + fieldNumber, meta);
+      }
+      FieldEntry fieldEntry = readField(meta, info);
+      validateFieldEntry(info, fieldEntry);
+      fields.put(info.number, fieldEntry);
+    }
+  }
+
+  // List of vector similarity functions. This list is defined here, in order
+  // to avoid an undesirable dependency on the declaration and order of values
+  // in VectorSimilarityFunction. The list values and order must be identical
+  // to that of {@link o.a.l.c.l.Lucene94FieldInfosFormat#SIMILARITY_FUNCTIONS}.
+  static final List<VectorSimilarityFunction> SIMILARITY_FUNCTIONS =
+      List.of(
+          VectorSimilarityFunction.EUCLIDEAN,
+          VectorSimilarityFunction.DOT_PRODUCT,
+          VectorSimilarityFunction.COSINE,
+          VectorSimilarityFunction.MAXIMUM_INNER_PRODUCT);
+
+  /** Reads an int and maps it to the similarity function at that position in the list above. */
+  static VectorSimilarityFunction readSimilarityFunction(DataInput input) throws IOException {
+    int i = input.readInt();
+    if (i < 0 || i >= SIMILARITY_FUNCTIONS.size()) {
+      throw new IllegalArgumentException("invalid distance function: " + i);
+    }
+    return SIMILARITY_FUNCTIONS.get(i);
+  }
+
+  /** Reads an int and maps it to the {@link VectorEncoding} with that ordinal. */
+  static VectorEncoding readVectorEncoding(DataInput input) throws IOException {
+    int encodingId = input.readInt();
+    if (encodingId < 0 || encodingId >= VectorEncoding.values().length) {
+      throw new CorruptIndexException("Invalid vector encoding id: " + encodingId, input);
+    }
+    return VectorEncoding.values()[encodingId];
+  }
+
+  /**
+   * Reads one field entry, rejecting it if the stored similarity function differs from the one
+   * declared by the field.
+   */
+  private FieldEntry readField(IndexInput input, FieldInfo info) throws IOException {
+    VectorEncoding vectorEncoding = readVectorEncoding(input);
+    VectorSimilarityFunction similarityFunction = readSimilarityFunction(input);
+    if (similarityFunction != info.getVectorSimilarityFunction()) {
+      throw new IllegalStateException(
+          "Inconsistent vector similarity function for field=\""
+              + info.name
+              + "\"; "
+              + similarityFunction
+              + " != "
+              + info.getVectorSimilarityFunction());
+    }
+    return FieldEntry.readEntry(input, vectorEncoding, info.getVectorSimilarityFunction());
+  }
+
+  /**
+   * Returns the metadata entry of the named field.
+   *
+   * @throws IllegalArgumentException if the field is unknown, has no entry, or is stored with an
+   *     encoding other than {@code expectedEncoding}
+   */
+  private FieldEntry getFieldEntry(String field, VectorEncoding expectedEncoding) {
+    final FieldInfo info = fieldInfos.fieldInfo(field);
+    final FieldEntry fieldEntry;
+    if (info == null || (fieldEntry = fields.get(info.number)) == null) {
+      throw new IllegalArgumentException("field=\"" + field + "\" not found");
+    }
+    if (fieldEntry.vectorEncoding != expectedEncoding) {
+      throw new IllegalArgumentException(
+          "field=\""
+              + field
+              + "\" is encoded as: "
+              + fieldEntry.vectorEncoding
+              + " expected: "
+              + expectedEncoding);
+    }
+    return fieldEntry;
+  }
+
+  /** Loads the cuVS indices of every field read from the metadata, keyed by field number. */
+  private IntObjectHashMap<CuVSIndex> loadCuVSIndices() throws IOException {
+    var indices = new IntObjectHashMap<CuVSIndex>();
+    for (var e : fields) {
+      var fieldEntry = e.value;
+      int fieldNumber = e.key;
+      var cuvsIndex = loadCuVSIndex(fieldEntry);
+      indices.put(fieldNumber, cuvsIndex);
+    }
+    return indices;
+  }
+
+  /**
+   * Deserializes the CAGRA, brute-force and HNSW indices of one field from their slices of the
+   * index file. An index whose recorded length is zero is left as {@code null}.
+   */
+  private CuVSIndex loadCuVSIndex(FieldEntry fieldEntry) throws IOException {
+    CagraIndex cagraIndex = null;
+    BruteForceIndex bruteForceIndex = null;
+    HnswIndex hnswIndex = null;
+
+    try {
+      long len = fieldEntry.cagraIndexLength();
+      if (len > 0) {
+        long off = fieldEntry.cagraIndexOffset();
+        try (var slice = cuvsIndexInput.slice("cagra index", off, len);
+            var in = new IndexInputInputStream(slice)) {
+          cagraIndex = CagraIndex.newBuilder(resources).from(in).build();
+        }
+      }
+
+      len = fieldEntry.bruteForceIndexLength();
+      if (len > 0) {
+        long off = fieldEntry.bruteForceIndexOffset();
+        try (var slice = cuvsIndexInput.slice("bf index", off, len);
+            var in = new IndexInputInputStream(slice)) {
+          bruteForceIndex = BruteForceIndex.newBuilder(resources).from(in).build();
+        }
+      }
+
+      len = fieldEntry.hnswIndexLength();
+      if (len > 0) {
+        long off = fieldEntry.hnswIndexOffset();
+        try (var slice = cuvsIndexInput.slice("hnsw index", off, len);
+            var in = new IndexInputInputStream(slice)) {
+          var params = new HnswIndexParams.Builder().build();
+          hnswIndex = HnswIndex.newBuilder(resources).withIndexParams(params).from(in).build();
+        }
+      }
+    } catch (Throwable t) {
+      handleThrowable(t);
+    }
+    return new CuVSIndex(cagraIndex, bruteForceIndex, hnswIndex);
+  }
+
+  /** Closes the flat vectors reader, the index input and every loaded cuVS index. */
+  @Override
+  public void close() throws IOException {
+    // The constructor closes this reader when it fails, possibly before cuvsIndices has been
+    // assigned. Fall back to an empty map so the inputs opened so far are still released.
+    var indices = cuvsIndices != null ? cuvsIndices : new IntObjectHashMap<CuVSIndex>();
+    var closeableStream =
+        Stream.concat(
+            Stream.of(flatVectorsReader, cuvsIndexInput),
+            stream(indices.values().iterator()).map(cursor -> cursor.value));
+    IOUtils.close(closeableStream::iterator);
+  }
+
+  /** Adapts an iterator to a sequential stream. */
+  static <T> Stream<T> stream(Iterator<T> iterator) {
+    return StreamSupport.stream(((Iterable<T>) () -> iterator).spliterator(), false);
+  }
+
+  @Override
+  public void checkIntegrity() throws IOException {
+    // TODO: Pending implementation
+  }
+
+  @Override
+  public FloatVectorValues getFloatVectorValues(String field) throws IOException {
+    return flatVectorsReader.getFloatVectorValues(field);
+  }
+
+  @Override
+  public ByteVectorValues getByteVectorValues(String field) {
+    throw new UnsupportedOperationException("byte vectors not supported");
+  }
+
+  /** Native float to float function */
+  public interface FloatToFloatFunction {
+    float apply(float v);
+  }
+
+  /** Returns the backing long words of {@code bits}, copying into a FixedBitSet if needed. */
+  static long[] bitsToLongArray(Bits bits) {
+    if (bits instanceof FixedBitSet fixedBitSet) {
+      return fixedBitSet.getBits();
+    } else {
+      return FixedBitSet.copyOf(bits).getBits();
+    }
+  }
+
+  /** Returns the function mapping a raw cuVS score {@code s} to {@code 1 / (1 + s)}. */
+  static FloatToFloatFunction getScoreNormalizationFunc(VectorSimilarityFunction sim) {
+    // TODO: check for different similarities
+    return score -> (1f / (1f + score));
+  }
+
+  // This is a hack - https://github.com/rapidsai/cuvs/issues/696
+  static final int FILTER_OVER_SAMPLE = 10;
+
+  /**
+   * Searches the cuVS index of {@code field} for the nearest neighbours of {@code target}.
+   *
+   * <p>The CAGRA index is used when it exists and {@code k <= 1024}; otherwise the brute-force
+   * index is used. When {@code acceptDocs} is given, {@code k * FILTER_OVER_SAMPLE} candidates are
+   * requested (capped at the field's vector count) and filtered afterwards.
+   */
+  @Override
+  public void search(String field, float[] target, KnnCollector knnCollector, Bits acceptDocs)
+      throws IOException {
+    var fieldEntry = getFieldEntry(field, VectorEncoding.FLOAT32);
+    if (fieldEntry.count() == 0 || knnCollector.k() == 0) {
+      return;
+    }
+
+    var fieldNumber = fieldInfos.fieldInfo(field).number;
+    // log.info("fieldNumber=" + fieldNumber + ", fieldEntry.count()=" + fieldEntry.count());
+
+    CuVSIndex cuvsIndex = cuvsIndices.get(fieldNumber);
+    if (cuvsIndex == null) {
+      throw new IllegalStateException("no index found for field:" + field);
+    }
+
+    int collectorTopK = knnCollector.k();
+    if (acceptDocs != null) {
+      collectorTopK = knnCollector.k() * FILTER_OVER_SAMPLE;
+    }
+    final int topK = Math.min(collectorTopK, fieldEntry.count());
+    assert topK > 0 : "Expected topK > 0, got:" + topK;
+
+    Map<Integer, Float> result;
+    if (knnCollector.k() <= 1024 && cuvsIndex.getCagraIndex() != null) {
+      // log.info("searching cagra index");
+      CagraSearchParams searchParams =
+          new CagraSearchParams.Builder(resources)
+              .withItopkSize(topK) // TODO: params
+              .withSearchWidth(1)
+              .build();
+
+      var query =
+          new CagraQuery.Builder()
+              .withTopK(topK)
+              .withSearchParams(searchParams)
+              // we don't use ord to doc mapping, https://github.com/rapidsai/cuvs/issues/699
+              .withMapping(null)
+              .withQueryVectors(new float[][] {target})
+              .build();
+
+      CagraIndex cagraIndex = cuvsIndex.getCagraIndex();
+      List<Map<Integer, Float>> searchResult = null;
+      try {
+        searchResult = cagraIndex.search(query).getResults();
+      } catch (Throwable t) {
+        handleThrowable(t); // always throws
+      }
+      // List expected to have only one entry because of single query "target".
+      assert searchResult.size() == 1;
+      result = searchResult.getFirst();
+    } else {
+      BruteForceIndex bruteforceIndex = cuvsIndex.getBruteforceIndex();
+      assert bruteforceIndex != null;
+      // log.info("searching brute index, with actual topK=" + topK);
+      var queryBuilder =
+          new BruteForceQuery.Builder().withQueryVectors(new float[][] {target}).withTopK(topK);
+      BruteForceQuery query = queryBuilder.build();
+
+      List<Map<Integer, Float>> searchResult = null;
+      try {
+        searchResult = bruteforceIndex.search(query).getResults();
+      } catch (Throwable t) {
+        handleThrowable(t); // always throws
+      }
+      assert searchResult.size() == 1;
+      result = searchResult.getFirst();
+    }
+    assert result != null;
+
+    final var rawValues = flatVectorsReader.getFloatVectorValues(field);
+    final Bits acceptedOrds = rawValues.getAcceptOrds(acceptDocs);
+    final var ordToDocFunction = (IntToIntFunction) rawValues::ordToDoc;
+    final var scoreCorrectionFunction = getScoreNormalizationFunc(fieldEntry.similarityFunction);
+
+    for (var entry : result.entrySet()) {
+      int ord = entry.getKey();
+      float score = entry.getValue();
+      if (acceptedOrds == null || acceptedOrds.get(ord)) {
+        if (knnCollector.earlyTerminated()) {
+          break;
+        }
+        assert ord >= 0 : "unexpected ord: " + ord;
+        int doc = ordToDocFunction.apply(ord);
+        float correctedScore = scoreCorrectionFunction.apply(score);
+        knnCollector.incVisitedCount(1);
+        knnCollector.collect(doc, correctedScore);
+      }
+    }
+  }
+
+  @Override
+  public void search(String field, byte[] target, KnnCollector knnCollector, Bits acceptDocs)
+      throws IOException {
+    throw new UnsupportedOperationException("byte vectors not supported");
+  }
+
+  /**
+   * Per-field metadata: encoding, similarity, dimension, vector count, and the offset/length of
+   * each serialized index inside the cuVS index file.
+   */
+  record FieldEntry(
+      VectorEncoding vectorEncoding,
+      VectorSimilarityFunction similarityFunction,
+      int dims,
+      int count,
+      long cagraIndexOffset,
+      long cagraIndexLength,
+      long bruteForceIndexOffset,
+      long bruteForceIndexLength,
+      long hnswIndexOffset,
+      long hnswIndexLength) {
+
+    /** Reads dims, count and the three offset/length pairs, in that order, from the input. */
+    static FieldEntry readEntry(
+        IndexInput input,
+        VectorEncoding vectorEncoding,
+        VectorSimilarityFunction similarityFunction)
+        throws IOException {
+      var dims = input.readInt();
+      var count = input.readInt();
+      var cagraIndexOffset = input.readVLong();
+      var cagraIndexLength = input.readVLong();
+      var bruteForceIndexOffset = input.readVLong();
+      var bruteForceIndexLength = input.readVLong();
+      var hnswIndexOffset = input.readVLong();
+      var hnswIndexLength = input.readVLong();
+      return new FieldEntry(
+          vectorEncoding,
+          similarityFunction,
+          dims,
+          count,
+          cagraIndexOffset,
+          cagraIndexLength,
+          bruteForceIndexOffset,
+          bruteForceIndexLength,
+          hnswIndexOffset,
+          hnswIndexLength);
+    }
+  }
+
+  /**
+   * Verifies that the metadata file and the index file were written with the same format version.
+   *
+   * @throws CorruptIndexException if the two versions differ
+   */
+  static void checkVersion(int versionMeta, int versionVectorData, IndexInput in)
+      throws CorruptIndexException {
+    if (versionMeta != versionVectorData) {
+      throw new CorruptIndexException(
+          "Format versions mismatch: meta="
+              + versionMeta
+              + ", "
+              + CUVS_INDEX_CODEC_NAME // the second version belongs to the index file
+              + "="
+              + versionVectorData,
+          in);
+    }
+  }
+
+  /**
+   * Rethrows {@code t}: IOExceptions, Errors and RuntimeExceptions are thrown unchanged, anything
+   * else (including {@code null}) is wrapped in a RuntimeException. Never returns normally.
+   */
+  static void handleThrowable(Throwable t) throws IOException {
+    switch (t) {
+      case IOException ioe -> throw ioe;
+      case Error error -> throw error;
+      case RuntimeException re -> throw re;
+      case null, default -> throw new RuntimeException("UNEXPECTED: exception type", t);
+    }
+  }
+}
diff --git
a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsWriter.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsWriter.java
new file mode 100644
index 000000000000..61f77ee26e7c
--- /dev/null
+++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/CuVSVectorsWriter.java
@@ -0,0 +1,505 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.sandbox.vectorsearch;
+
+import static org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsReader.SIMILARITY_FUNCTIONS;
+import static org.apache.lucene.index.VectorEncoding.FLOAT32;
+import static org.apache.lucene.sandbox.vectorsearch.CuVSVectorsFormat.CUVS_INDEX_CODEC_NAME;
+import static org.apache.lucene.sandbox.vectorsearch.CuVSVectorsFormat.CUVS_INDEX_EXT;
+import static org.apache.lucene.sandbox.vectorsearch.CuVSVectorsFormat.CUVS_META_CODEC_EXT;
+import static org.apache.lucene.sandbox.vectorsearch.CuVSVectorsFormat.CUVS_META_CODEC_NAME;
+import static org.apache.lucene.sandbox.vectorsearch.CuVSVectorsFormat.VERSION_CURRENT;
+import static org.apache.lucene.sandbox.vectorsearch.CuVSVectorsReader.handleThrowable;
+import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS;
+import static org.apache.lucene.util.RamUsageEstimator.shallowSizeOfInstance;
+
+import com.nvidia.cuvs.BruteForceIndex;
+import com.nvidia.cuvs.BruteForceIndexParams;
+import com.nvidia.cuvs.CagraIndex;
+import com.nvidia.cuvs.CagraIndexParams;
+import com.nvidia.cuvs.CagraIndexParams.CagraGraphBuildAlgo;
+import com.nvidia.cuvs.CuVSResources;
+import java.io.IOException;
+import java.io.OutputStream;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.time.Duration;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Objects;
+import java.util.logging.Logger;
+import org.apache.lucene.codecs.CodecUtil;
+import org.apache.lucene.codecs.KnnFieldVectorsWriter;
+import org.apache.lucene.codecs.KnnVectorsWriter;
+import org.apache.lucene.codecs.hnsw.FlatFieldVectorsWriter;
+import org.apache.lucene.codecs.hnsw.FlatVectorsWriter;
+import org.apache.lucene.index.DocsWithFieldSet;
+import org.apache.lucene.index.FieldInfo;
+import org.apache.lucene.index.FloatVectorValues;
+import org.apache.lucene.index.IndexFileNames;
+import org.apache.lucene.index.KnnVectorValues;
+import org.apache.lucene.index.MergeState;
+import org.apache.lucene.index.SegmentWriteState;
+import org.apache.lucene.index.Sorter;
+import org.apache.lucene.index.Sorter.DocMap;
+import org.apache.lucene.index.VectorSimilarityFunction;
+import org.apache.lucene.store.IndexOutput;
+import org.apache.lucene.util.ArrayUtil;
+import org.apache.lucene.util.IOUtils;
+import org.apache.lucene.util.InfoStream;
+
+/**
+ * KnnVectorsWriter for CuVS, responsible for merge and flush of vectors into GPU.
+ *
+ * <p>Raw vectors are written by the wrapped {@link FlatVectorsWriter}. For every field this writer
+ * additionally builds the cuVS indices selected by the {@link IndexType}, appends them to the
+ * index file and records their offsets and lengths in the metadata file.
+ */
+public class CuVSVectorsWriter extends KnnVectorsWriter {
+
+  private static final long SHALLOW_RAM_BYTES_USED = shallowSizeOfInstance(CuVSVectorsWriter.class);
+
+  @SuppressWarnings("unused")
+  private static final Logger log = Logger.getLogger(CuVSVectorsWriter.class.getName());
+
+  /** The name of the CUVS component for the info-stream. */
+  public static final String CUVS_COMPONENT = "CUVS";
+
+  // The minimum number of vectors in the dataset required before
+  // we attempt to build a Cagra index
+  static final int MIN_CAGRA_INDEX_SIZE = 2;
+
+  private final int cuvsWriterThreads;
+  private final int intGraphDegree;
+  private final int graphDegree;
+
+  private final CuVSResources resources;
+  private final IndexType indexType;
+
+  @SuppressWarnings("unused")
+  private final MergeStrategy mergeStrategy;
+
+  private final FlatVectorsWriter flatVectorsWriter; // for writing the raw vectors
+  private final List<CuVSFieldWriter> fields = new ArrayList<>();
+  private final IndexOutput meta, cuvsIndex;
+  private final InfoStream infoStream;
+  private boolean finished;
+
+  /** Merge strategy used for CuVS */
+  public enum MergeStrategy {
+    TRIVIAL_MERGE,
+    NON_TRIVIAL_MERGE
+  }
+
+  /** The CuVS index Type. */
+  public enum IndexType {
+    /** Builds a Cagra index. */
+    CAGRA(true, false, false),
+    /** Builds a Brute Force index. */
+    BRUTE_FORCE(false, true, false),
+    /** Builds an HNSW index - suitable for searching on CPU. */
+    HNSW(false, false, true),
+    /** Builds a Cagra and a Brute Force index. */
+    CAGRA_AND_BRUTE_FORCE(true, true, false);
+    private final boolean cagra, bruteForce, hnsw;
+
+    IndexType(boolean cagra, boolean bruteForce, boolean hnsw) {
+      this.cagra = cagra;
+      this.bruteForce = bruteForce;
+      this.hnsw = hnsw;
+    }
+
+    /** Whether this type includes a Cagra index. */
+    public boolean cagra() {
+      return cagra;
+    }
+
+    /** Whether this type includes a Brute Force index. */
+    public boolean bruteForce() {
+      return bruteForce;
+    }
+
+    /** Whether this type includes an HNSW index. */
+    public boolean hnsw() {
+      return hnsw;
+    }
+  }
+
+  /**
+   * Creates the metadata and index outputs of the segment and writes their headers.
+   *
+   * <p>If creating an output or writing a header fails, this writer is closed (suppressing
+   * secondary exceptions) before the original exception propagates.
+   *
+   * @throws IOException if an output cannot be created or a header cannot be written
+   */
+  public CuVSVectorsWriter(
+      SegmentWriteState state,
+      int cuvsWriterThreads,
+      int intGraphDegree,
+      int graphDegree,
+      MergeStrategy mergeStrategy,
+      IndexType indexType,
+      CuVSResources resources,
+      FlatVectorsWriter flatVectorsWriter)
+      throws IOException {
+    super();
+    this.mergeStrategy = mergeStrategy;
+    this.indexType = indexType;
+    this.cuvsWriterThreads = cuvsWriterThreads;
+    this.intGraphDegree = intGraphDegree;
+    this.graphDegree = graphDegree;
+    this.resources = resources;
+    this.flatVectorsWriter = flatVectorsWriter;
+    this.infoStream = state.infoStream;
+
+    String metaFileName =
+        IndexFileNames.segmentFileName(
+            state.segmentInfo.name, state.segmentSuffix, CUVS_META_CODEC_EXT);
+    String cagraFileName =
+        IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, CUVS_INDEX_EXT);
+
+    boolean success = false;
+    try {
+      meta = state.directory.createOutput(metaFileName, state.context);
+      cuvsIndex = state.directory.createOutput(cagraFileName, state.context);
+      CodecUtil.writeIndexHeader(
+          meta,
+          CUVS_META_CODEC_NAME,
+          VERSION_CURRENT,
+          state.segmentInfo.getId(),
+          state.segmentSuffix);
+      CodecUtil.writeIndexHeader(
+          cuvsIndex,
+          CUVS_INDEX_CODEC_NAME,
+          VERSION_CURRENT,
+          state.segmentInfo.getId(),
+          state.segmentSuffix);
+      success = true;
+    } finally {
+      if (success == false) {
+        IOUtils.closeWhileHandlingException(this);
+      }
+    }
+  }
+
+  /**
+   * Registers a float32 vector field. The returned writer is the flat vectors writer's field
+   * writer; this writer keeps a wrapper around it to build the cuVS indices at flush time.
+   *
+   * @throws IllegalArgumentException if the field's encoding is not float32
+   */
+  @Override
+  public KnnFieldVectorsWriter<?> addField(FieldInfo fieldInfo) throws IOException {
+    var encoding = fieldInfo.getVectorEncoding();
+    if (encoding != FLOAT32) {
+      throw new IllegalArgumentException("expected float32, got:" + encoding);
+    }
+    var writer = Objects.requireNonNull(flatVectorsWriter.addField(fieldInfo));
+    @SuppressWarnings("unchecked")
+    var flatWriter = (FlatFieldVectorsWriter<float[]>) writer;
+    var cuvsFieldWriter = new CuVSFieldWriter(fieldInfo, flatWriter);
+    fields.add(cuvsFieldWriter);
+    return writer;
+  }
+
+  /**
+   * Formats a diagnostic message; {@code args} must hold, in order: intGraphDegree,
+   * actualIntGraphDegree, graphDegree, actualGraphDegree.
+   */
+  static String indexMsg(int size, int... args) {
+    StringBuilder sb = new StringBuilder("cagra index params");
+    sb.append(": size=").append(size);
+    sb.append(", intGraphDegree=").append(args[0]);
+    sb.append(", actualIntGraphDegree=").append(args[1]);
+    sb.append(", graphDegree=").append(args[2]);
+    sb.append(", actualGraphDegree=").append(args[3]);
+    return sb.toString();
+  }
+
+  /**
+   * Builds the Cagra parameters for a dataset of {@code size} vectors. The intermediate graph
+   * degree is capped at {@code size - 1} and the graph degree at the intermediate degree.
+   *
+   * @throws IllegalArgumentException if {@code size} is less than {@link #MIN_CAGRA_INDEX_SIZE}
+   */
+  private CagraIndexParams cagraIndexParams(int size) {
+    if (size < MIN_CAGRA_INDEX_SIZE) {
+      // https://github.com/rapidsai/cuvs/issues/666
+      throw new IllegalArgumentException(
+          "cagra index requires at least " + MIN_CAGRA_INDEX_SIZE + " vectors, got: " + size);
+    }
+    var minIntGraphDegree = Math.min(intGraphDegree, size - 1);
+    var minGraphDegree = Math.min(graphDegree, minIntGraphDegree);
+    // log.info(indexMsg(size, intGraphDegree, minIntGraphDegree, graphDegree, minGraphDegree));
+
+    return new CagraIndexParams.Builder()
+        .withNumWriterThreads(cuvsWriterThreads)
+        .withIntermediateGraphDegree(minIntGraphDegree)
+        .withGraphDegree(minGraphDegree)
+        .withCagraGraphBuildAlgo(CagraGraphBuildAlgo.NN_DESCENT)
+        .build();
+  }
+
+  static long nanosToMillis(long nanos) {
+    return Duration.ofNanos(nanos).toMillis();
+  }
+
+  /** Sends {@code msg} to the info stream if the CUVS component is enabled. */
+  private void info(String msg) {
+    if (infoStream.isEnabled(CUVS_COMPONENT)) {
+      infoStream.message(CUVS_COMPONENT, msg);
+    }
+  }
+
+  /** Builds a Cagra index over {@code vectors} and serializes it to {@code os}. */
+  private void writeCagraIndex(OutputStream os, float[][] vectors) throws Throwable {
+    if (vectors.length < 2) {
+      throw new IllegalArgumentException(vectors.length + " vectors, less than min [2] required");
+    }
+    CagraIndexParams params = cagraIndexParams(vectors.length);
+    long startTime = System.nanoTime();
+    var index =
+        CagraIndex.newBuilder(resources).withDataset(vectors).withIndexParams(params).build();
+    try {
+      long elapsedMillis = nanosToMillis(System.nanoTime() - startTime);
+      info("Cagra index created in " + elapsedMillis + "ms, with " + vectors.length + " vectors");
+      Path tmpFile = Files.createTempFile(resources.tempDirectory(), "tmpindex", "cag");
+      index.serialize(os, tmpFile);
+    } finally {
+      // release the built index even when serialization fails
+      index.destroyIndex();
+    }
+  }
+
+  /** Builds a Brute Force index over {@code vectors} and serializes it to {@code os}. */
+  private void writeBruteForceIndex(OutputStream os, float[][] vectors) throws Throwable {
+    BruteForceIndexParams params =
+        new BruteForceIndexParams.Builder()
+            .withNumWriterThreads(32) // TODO: Make this configurable later.
+            .build();
+    long startTime = System.nanoTime();
+    var index =
+        BruteForceIndex.newBuilder(resources).withIndexParams(params).withDataset(vectors).build();
+    try {
+      long elapsedMillis = nanosToMillis(System.nanoTime() - startTime);
+      info("bf index created in " + elapsedMillis + "ms, with " + vectors.length + " vectors");
+      index.serialize(os);
+    } finally {
+      // release the built index even when serialization fails
+      index.destroyIndex();
+    }
+  }
+
+  /** Builds a Cagra index over {@code vectors} and serializes it to {@code os} in HNSW form. */
+  private void writeHNSWIndex(OutputStream os, float[][] vectors) throws Throwable {
+    if (vectors.length < 2) {
+      throw new IllegalArgumentException(vectors.length + " vectors, less than min [2] required");
+    }
+    CagraIndexParams indexParams = cagraIndexParams(vectors.length);
+    long startTime = System.nanoTime();
+    var index =
+        CagraIndex.newBuilder(resources).withDataset(vectors).withIndexParams(indexParams).build();
+    try {
+      long elapsedMillis = nanosToMillis(System.nanoTime() - startTime);
+      info("HNSW index created in " + elapsedMillis + "ms, with " + vectors.length + " vectors");
+      Path tmpFile = Files.createTempFile("tmpindex", "hnsw");
+      index.serializeToHNSW(os, tmpFile);
+    } finally {
+      // release the built index even when serialization fails
+      index.destroyIndex();
+    }
+  }
+
+  /** Flushes the raw vectors, then writes the cuVS indices of every registered field. */
+  @Override
+  public void flush(int maxDoc, DocMap sortMap) throws IOException {
+    flatVectorsWriter.flush(maxDoc, sortMap);
+    for (var field : fields) {
+      if (sortMap == null) {
+        writeField(field);
+      } else {
+        writeSortingField(field, sortMap);
+      }
+    }
+  }
+
+  private void writeField(CuVSFieldWriter fieldData) throws IOException {
+    // TODO: Argh! https://github.com/rapidsai/cuvs/issues/698
+    float[][] vectors = fieldData.getVectors().toArray(float[][]::new);
+    writeFieldInternal(fieldData.fieldInfo(), vectors);
+  }
+
+  /** Reorders the field's vectors according to {@code sortMap} before writing them. */
+  private void writeSortingField(CuVSFieldWriter fieldData, Sorter.DocMap sortMap)
+      throws IOException {
+    DocsWithFieldSet oldDocsWithFieldSet = fieldData.getDocsWithFieldSet();
+    final int[] new2OldOrd = new int[oldDocsWithFieldSet.cardinality()]; // new ord to old ord
+
+    mapOldOrdToNewOrd(oldDocsWithFieldSet, sortMap, null, new2OldOrd, null);
+
+    // TODO: Argh! https://github.com/rapidsai/cuvs/issues/698
+    // Also will be replaced with the cuVS merge api
+    float[][] oldVectors = fieldData.getVectors().toArray(float[][]::new);
+    float[][] newVectors = new float[oldVectors.length][];
+    for (int i = 0; i < oldVectors.length; i++) {
+      newVectors[i] = oldVectors[new2OldOrd[i]];
+    }
+    writeFieldInternal(fieldData.fieldInfo(), newVectors);
+  }
+
+  /**
+   * Writes the indices selected by the index type for one field, followed by its metadata entry.
+   * An index that is not written is recorded with length zero.
+   */
+  private void writeFieldInternal(FieldInfo fieldInfo, float[][] vectors) throws IOException {
+    if (vectors.length == 0) {
+      writeEmpty(fieldInfo);
+      return;
+    }
+    long cagraIndexOffset, cagraIndexLength = 0L;
+    long bruteForceIndexOffset, bruteForceIndexLength = 0L;
+    long hnswIndexOffset, hnswIndexLength = 0L;
+
+    // workaround for the minimum number of vectors for Cagra
+    IndexType indexType =
+        this.indexType.cagra() && vectors.length < MIN_CAGRA_INDEX_SIZE
+            ? IndexType.BRUTE_FORCE
+            : this.indexType;
+
+    try {
+      cagraIndexOffset = cuvsIndex.getFilePointer();
+      if (indexType.cagra()) {
+        try {
+          var cagraIndexOutputStream = new IndexOutputOutputStream(cuvsIndex);
+          writeCagraIndex(cagraIndexOutputStream, vectors);
+        } catch (Throwable t) {
+          handleThrowableWithIgnore(t, CANNOT_GENERATE_CAGRA);
+          // workaround for cuVS issue
+          indexType = IndexType.BRUTE_FORCE;
+        }
+        cagraIndexLength = cuvsIndex.getFilePointer() - cagraIndexOffset;
+      }
+
+      bruteForceIndexOffset = cuvsIndex.getFilePointer();
+      if (indexType.bruteForce()) {
+        var bruteForceIndexOutputStream = new IndexOutputOutputStream(cuvsIndex);
+        writeBruteForceIndex(bruteForceIndexOutputStream, vectors);
+        bruteForceIndexLength = cuvsIndex.getFilePointer() - bruteForceIndexOffset;
+      }
+
+      hnswIndexOffset = cuvsIndex.getFilePointer();
+      if (indexType.hnsw()) {
+        var hnswIndexOutputStream = new IndexOutputOutputStream(cuvsIndex);
+        // NOTE(review): this requires strictly more than MIN_CAGRA_INDEX_SIZE vectors, while the
+        // Cagra path above accepts exactly MIN_CAGRA_INDEX_SIZE - confirm whether ">" is intended.
+        if (vectors.length > MIN_CAGRA_INDEX_SIZE) {
+          try {
+            writeHNSWIndex(hnswIndexOutputStream, vectors);
+          } catch (Throwable t) {
+            handleThrowableWithIgnore(t, CANNOT_GENERATE_CAGRA);
+          }
+        }
+        hnswIndexLength = cuvsIndex.getFilePointer() - hnswIndexOffset;
+      }
+
+      // StringBuilder sb = new StringBuilder("writeField ");
+      // sb.append(": fieldInfo.name=").append(fieldInfo.name);
+      // sb.append(", fieldInfo.number=").append(fieldInfo.number);
+      // sb.append(", size=").append(vectors.length);
+      // sb.append(", cagraIndexLength=").append(cagraIndexLength);
+      // sb.append(", bruteForceIndexLength=").append(bruteForceIndexLength);
+      // sb.append(", hnswIndexLength=").append(hnswIndexLength);
+      // log.info(sb.toString());
+
+      writeMeta(
+          fieldInfo,
+          vectors.length,
+          cagraIndexOffset,
+          cagraIndexLength,
+          bruteForceIndexOffset,
+          bruteForceIndexLength,
+          hnswIndexOffset,
+          hnswIndexLength);
+    } catch (Throwable t) {
+      handleThrowable(t);
+    }
+  }
+
+  /** Writes a metadata entry with zero vectors and zero-length indices. */
+  private void writeEmpty(FieldInfo fieldInfo) throws IOException {
+    writeMeta(fieldInfo, 0, 0L, 0L, 0L, 0L, 0L, 0L);
+  }
+
+  /**
+   * Writes one metadata entry: field number, encoding ordinal, similarity ordinal, dimension,
+   * vector count, then the offset/length pair of the Cagra, Brute Force and HNSW index.
+   */
+  private void writeMeta(
+      FieldInfo field,
+      int count,
+      long cagraIndexOffset,
+      long cagraIndexLength,
+      long bruteForceIndexOffset,
+      long bruteForceIndexLength,
+      long hnswIndexOffset,
+      long hnswIndexLength)
+      throws IOException {
+    meta.writeInt(field.number);
+    meta.writeInt(field.getVectorEncoding().ordinal());
+    meta.writeInt(distFuncToOrd(field.getVectorSimilarityFunction()));
+    meta.writeInt(field.getVectorDimension());
+    meta.writeInt(count);
+    meta.writeVLong(cagraIndexOffset);
+    meta.writeVLong(cagraIndexLength);
+    meta.writeVLong(bruteForceIndexOffset);
+    meta.writeVLong(bruteForceIndexLength);
+    meta.writeVLong(hnswIndexOffset);
+    meta.writeVLong(hnswIndexLength);
+  }
+
+  /**
+   * Returns the position of {@code func} in {@code SIMILARITY_FUNCTIONS}.
+   *
+   * @throws IllegalArgumentException if the function is not in the list
+   */
+  static int distFuncToOrd(VectorSimilarityFunction func) {
+    for (int i = 0; i < SIMILARITY_FUNCTIONS.size(); i++) {
+      if (SIMILARITY_FUNCTIONS.get(i).equals(func)) {
+        return i;
+      }
+    }
+    throw new IllegalArgumentException("invalid distance function: " + func);
+  }
+
+  // We currently ignore this, until cuVS supports tiered indices
+  private static final String CANNOT_GENERATE_CAGRA =
+      """
+      Could not generate an intermediate CAGRA graph because the initial \
+      kNN graph contains too many invalid or duplicated neighbor nodes. \
+      This error can occur, for example, if too many overflows occur \
+      during the norm computation between the dataset vectors\
+      """;
+
+  /**
+   * Rethrows {@code t} via {@code handleThrowable} unless its message contains {@code msg}, in
+   * which case the throwable is ignored. A throwable without a message is always rethrown.
+   */
+  static void handleThrowableWithIgnore(Throwable t, String msg) throws IOException {
+    String message = t.getMessage();
+    if (message != null && message.contains(msg)) {
+      return;
+    }
+    handleThrowable(t);
+  }
+
+  /** Copies the vector values into dst. Returns the actual number of vectors copied. */
+  private static int getVectorData(FloatVectorValues floatVectorValues, float[][] dst)
+      throws IOException {
+    DocsWithFieldSet docsWithField = new DocsWithFieldSet();
+    int count = 0;
+    KnnVectorValues.DocIndexIterator iter = floatVectorValues.iterator();
+    for (int docV = iter.nextDoc(); docV != NO_MORE_DOCS; docV = iter.nextDoc()) {
+      assert iter.index() == count;
+      dst[iter.index()] = floatVectorValues.vectorValue(iter.index());
+      docsWithField.add(docV);
+      count++;
+    }
+    return docsWithField.cardinality();
+  }
+
+  /** Merges the raw vectors of the field, then builds and writes its indices from scratch. */
+  @Override
+  public void mergeOneField(FieldInfo fieldInfo, MergeState mergeState) throws IOException {
+    flatVectorsWriter.mergeOneField(fieldInfo, mergeState);
+    try {
+      final FloatVectorValues mergedVectorValues =
+          switch (fieldInfo.getVectorEncoding()) {
+            case BYTE -> throw new AssertionError("bytes not supported");
+            case FLOAT32 ->
+                KnnVectorsWriter.MergedVectorValues.mergeFloatVectorValues(fieldInfo, mergeState);
+          };
+
+      float[][] vectors = new float[mergedVectorValues.size()][mergedVectorValues.dimension()];
+      int ret = getVectorData(mergedVectorValues, vectors);
+      if (ret < vectors.length) {
+        // fewer vectors were copied than size() reported; drop the unused tail
+        vectors = ArrayUtil.copyOfSubArray(vectors, 0, ret);
+      }
+      writeFieldInternal(fieldInfo, vectors);
+    } catch (Throwable t) {
+      handleThrowable(t);
+    }
+  }
+
+  /**
+   * Finishes the flat vectors writer and writes the end-of-fields marker and the footers.
+   *
+   * @throws IllegalStateException if called more than once
+   */
+  @Override
+  public void finish() throws IOException {
+    if (finished) {
+      throw new IllegalStateException("already finished");
+    }
+    finished = true;
+    flatVectorsWriter.finish();
+
+    if (meta != null) {
+      // write end of fields marker
+      meta.writeInt(-1);
+      CodecUtil.writeFooter(meta);
+    }
+    if (cuvsIndex != null) {
+      CodecUtil.writeFooter(cuvsIndex);
+    }
+  }
+
+  @Override
+  public void close() throws IOException {
+    IOUtils.close(meta, cuvsIndex, flatVectorsWriter);
+  }
+
+  @Override
+  public long ramBytesUsed() {
+    long total = SHALLOW_RAM_BYTES_USED;
+    for (var field : fields) {
+      total += field.ramBytesUsed();
+    }
+    return total;
+  }
+}
diff --git
a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/FilterCuVSProvider.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/FilterCuVSProvider.java new file mode 100644 index 000000000000..842fdde65dd2 --- /dev/null +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/FilterCuVSProvider.java @@ -0,0 +1,61 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.lucene.sandbox.vectorsearch; + +import com.nvidia.cuvs.BruteForceIndex; +import com.nvidia.cuvs.CagraIndex; +import com.nvidia.cuvs.CuVSResources; +import com.nvidia.cuvs.HnswIndex; +import com.nvidia.cuvs.spi.CuVSProvider; +import java.nio.file.Path; + +/*package-private*/ class FilterCuVSProvider implements CuVSProvider { + + private final CuVSProvider delegate; + + FilterCuVSProvider(CuVSProvider delegate) { + this.delegate = delegate; + } + + @Override + public Path nativeLibraryPath() { + return CuVSProvider.TMPDIR; + } + + @Override + public CuVSResources newCuVSResources(Path tempPath) throws Throwable { + return delegate.newCuVSResources(tempPath); + } + + @Override + public BruteForceIndex.Builder newBruteForceIndexBuilder(CuVSResources cuVSResources) + throws UnsupportedOperationException { + return delegate.newBruteForceIndexBuilder(cuVSResources); + } + + @Override + public CagraIndex.Builder newCagraIndexBuilder(CuVSResources cuVSResources) + throws UnsupportedOperationException { + return delegate.newCagraIndexBuilder(cuVSResources); + } + + @Override + public HnswIndex.Builder newHnswIndexBuilder(CuVSResources cuVSResources) + throws UnsupportedOperationException { + return delegate.newHnswIndexBuilder(cuVSResources); + } +} diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/FilterCuVSServiceProvider.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/FilterCuVSServiceProvider.java new file mode 100644 index 000000000000..eeb7b6895aa3 --- /dev/null +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/FilterCuVSServiceProvider.java @@ -0,0 +1,28 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.sandbox.vectorsearch; + +import com.nvidia.cuvs.spi.CuVSProvider; +import com.nvidia.cuvs.spi.CuVSServiceProvider; + +/** A provider that creates instances of FilterCuVSProvider. */ +public class FilterCuVSServiceProvider extends CuVSServiceProvider { + @Override + public CuVSProvider get(CuVSProvider builtinProvider) { + return new FilterCuVSProvider(builtinProvider); + } +} diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/IndexInputInputStream.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/IndexInputInputStream.java new file mode 100644 index 000000000000..4eb8ed558f70 --- /dev/null +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/IndexInputInputStream.java @@ -0,0 +1,60 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.sandbox.vectorsearch; + +import java.io.IOException; +import java.io.InputStream; +import org.apache.lucene.store.IndexInput; + +/** InputStream for reading from an IndexInput. */ +final class IndexInputInputStream extends InputStream { + + final IndexInput in; + long pos = 0; + final long limit; + + IndexInputInputStream(IndexInput in) { + this.in = in; + this.limit = in.length(); + } + + @Override + public int read() throws IOException { + if (pos >= limit) { + return -1; + } + pos++; + return in.readByte(); + } + + @Override + public int read(byte[] b, int off, int len) throws IOException { + if (len <= 0) { + return 0; + } + if (pos >= limit) { + return -1; + } + long avail = limit - pos; + if (len > avail) { + len = (int) avail; + } + in.readBytes(b, off, len); + pos += len; + return len; + } +} diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/IndexOutputOutputStream.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/IndexOutputOutputStream.java new file mode 100644 index 000000000000..ffb2b922e4b5 --- /dev/null +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/IndexOutputOutputStream.java @@ -0,0 +1,70 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.sandbox.vectorsearch; + +import java.io.IOException; +import java.io.OutputStream; +import org.apache.lucene.store.IndexOutput; + +/** OutputStream for writing into an IndexOutput */ +final class IndexOutputOutputStream extends OutputStream { + + static final int DEFAULT_BUFFER_SIZE = 8192; + + final IndexOutput out; + final int bufferSize; + final byte[] buffer; + int idx; + + IndexOutputOutputStream(IndexOutput out) { + this(out, DEFAULT_BUFFER_SIZE); + } + + IndexOutputOutputStream(IndexOutput out, int bufferSize) { + this.out = out; + this.bufferSize = bufferSize; + this.buffer = new byte[bufferSize]; + } + + @Override + public void write(int b) throws IOException { + buffer[idx] = (byte) b; + idx++; + if (idx == bufferSize) { + flush(); + } + } + + @Override + public void write(byte[] b, int offset, int length) throws IOException { + if (idx != 0) { + flush(); + } + out.writeBytes(b, offset, length); + } + + @Override + public void flush() throws IOException { + out.writeBytes(buffer, 0, idx); + idx = 0; + } + + @Override + public void close() throws IOException { + this.flush(); + } +} diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/PerLeafCuVSKnnCollector.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/PerLeafCuVSKnnCollector.java new file mode 100644 index 000000000000..caf9566064e9 --- /dev/null +++ 
b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/PerLeafCuVSKnnCollector.java @@ -0,0 +1,90 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.sandbox.vectorsearch; + +import java.util.ArrayList; +import java.util.List; +import org.apache.lucene.search.KnnCollector; +import org.apache.lucene.search.ScoreDoc; +import org.apache.lucene.search.TopDocs; +import org.apache.lucene.search.TotalHits; + +/** KnnCollector for CuVS */ +/*package-private*/ class PerLeafCuVSKnnCollector implements KnnCollector { + + public List scoreDocs; + public int topK = 0; + public int iTopK = topK; // TODO getter, no setter + public int searchWidth = 1; // TODO getter, no setter + public int results = 0; + + public PerLeafCuVSKnnCollector(int topK, int iTopK, int searchWidth) { + super(); + this.topK = topK; + this.iTopK = iTopK; + this.searchWidth = searchWidth; + scoreDocs = new ArrayList(); + } + + @Override + public boolean earlyTerminated() { + // TODO: may need implementation + return false; + } + + @Override + public void incVisitedCount(int count) { + // TODO: may need implementation + } + + @Override + public long visitedCount() { + // TODO: may need implementation + return 0; + } + + 
@Override + public long visitLimit() { + // TODO: may need implementation + return 0; + } + + @Override + public int k() { + return topK; + } + + @Override + @SuppressWarnings("cast") + public boolean collect(int docId, float similarity) { + scoreDocs.add(new ScoreDoc(docId, similarity)); + return true; + } + + @Override + public float minCompetitiveSimilarity() { + // TODO: may need implementation + return 0; + } + + @Override + public TopDocs topDocs() { + return new TopDocs( + new TotalHits(scoreDocs.size(), TotalHits.Relation.EQUAL_TO), + scoreDocs.toArray(new ScoreDoc[scoreDocs.size()])); + } +} diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/package-info.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/package-info.java new file mode 100644 index 000000000000..86c56b909dd1 --- /dev/null +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/vectorsearch/package-info.java @@ -0,0 +1,19 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/** CuVS based fast vector search */ +package org.apache.lucene.sandbox.vectorsearch; diff --git a/lucene/sandbox/src/resources/META-INF/services/com.nvidia.cuvs.spi.CuVSServiceProvider b/lucene/sandbox/src/resources/META-INF/services/com.nvidia.cuvs.spi.CuVSServiceProvider new file mode 100644 index 000000000000..5e7ceba19343 --- /dev/null +++ b/lucene/sandbox/src/resources/META-INF/services/com.nvidia.cuvs.spi.CuVSServiceProvider @@ -0,0 +1,16 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +org.apache.lucene.sandbox.vectorsearch.FilterCuVSServiceProvider \ No newline at end of file diff --git a/lucene/sandbox/src/resources/META-INF/services/org.apache.lucene.codecs.KnnVectorsFormat b/lucene/sandbox/src/resources/META-INF/services/org.apache.lucene.codecs.KnnVectorsFormat new file mode 100644 index 000000000000..666ee726f986 --- /dev/null +++ b/lucene/sandbox/src/resources/META-INF/services/org.apache.lucene.codecs.KnnVectorsFormat @@ -0,0 +1,16 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. 
+# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +org.apache.lucene.sandbox.vectorsearch.CuVSVectorsFormat diff --git a/lucene/sandbox/src/test/org/apache/lucene/sandbox/vectorsearch/TestCuVS.java b/lucene/sandbox/src/test/org/apache/lucene/sandbox/vectorsearch/TestCuVS.java new file mode 100644 index 000000000000..a20a49be6f53 --- /dev/null +++ b/lucene/sandbox/src/test/org/apache/lucene/sandbox/vectorsearch/TestCuVS.java @@ -0,0 +1,208 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.lucene.sandbox.vectorsearch; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.Map; +import java.util.Random; +import java.util.TreeMap; +import java.util.logging.Logger; +import org.apache.lucene.codecs.Codec; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.document.KnnFloatVectorField; +import org.apache.lucene.document.StringField; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.VectorSimilarityFunction; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.KnnFloatVectorQuery; +import org.apache.lucene.search.Query; +import org.apache.lucene.search.ScoreDoc; +import org.apache.lucene.store.Directory; +import org.apache.lucene.tests.analysis.MockAnalyzer; +import org.apache.lucene.tests.analysis.MockTokenizer; +import org.apache.lucene.tests.index.RandomIndexWriter; +import org.apache.lucene.tests.util.English; +import org.apache.lucene.tests.util.LuceneTestCase; +import org.apache.lucene.tests.util.LuceneTestCase.SuppressSysoutChecks; +import org.apache.lucene.tests.util.TestUtil; +import org.junit.AfterClass; +import org.junit.BeforeClass; +import org.junit.Test; + +@SuppressSysoutChecks(bugUrl = "prints info from within cuvs") +public class TestCuVS extends LuceneTestCase { + + protected static Logger log = Logger.getLogger(TestCuVS.class.getName()); + + static final Codec codec = TestUtil.alwaysKnnVectorsFormat(new CuVSVectorsFormat()); + static IndexSearcher searcher; + static IndexReader reader; + static Directory directory; + + static int DATASET_SIZE_LIMIT = 1000; + static int DIMENSIONS_LIMIT = 2048; + static int NUM_QUERIES_LIMIT = 10; + static int TOP_K_LIMIT = 64; // TODO This fails beyond 64 + + public static float[][] dataset; + + @BeforeClass + public static void beforeClass() throws Exception { + assumeTrue("cuvs not 
supported", CuVSVectorsFormat.supported()); + directory = newDirectory(); + + RandomIndexWriter writer = + new RandomIndexWriter( + random(), + directory, + newIndexWriterConfig(new MockAnalyzer(random(), MockTokenizer.SIMPLE, true)) + .setMaxBufferedDocs(TestUtil.nextInt(random(), 100, 1000)) + .setCodec(codec) + .setMergePolicy(newTieredMergePolicy())); + + log.info("Merge Policy: " + writer.w.getConfig().getMergePolicy()); + + Random random = random(); + int datasetSize = random.nextInt(DATASET_SIZE_LIMIT) + 1; + int dimensions = random.nextInt(DIMENSIONS_LIMIT) + 1; + dataset = generateDataset(random, datasetSize, dimensions); + for (int i = 0; i < datasetSize; i++) { + Document doc = new Document(); + doc.add(new StringField("id", String.valueOf(i), Field.Store.YES)); + doc.add(newTextField("field", English.intToEnglish(i), Field.Store.YES)); + boolean skipVector = + random.nextInt(10) < 0; // disable testing with holes for now, there's some bug. + if (!skipVector + || datasetSize < 100) { // about 10th of the documents shouldn't have a single vector + doc.add(new KnnFloatVectorField("vector", dataset[i], VectorSimilarityFunction.EUCLIDEAN)); + doc.add(new KnnFloatVectorField("vector2", dataset[i], VectorSimilarityFunction.EUCLIDEAN)); + } + + writer.addDocument(doc); + } + + reader = writer.getReader(); + searcher = newSearcher(reader); + writer.close(); + } + + @AfterClass + public static void afterClass() throws Exception { + if (reader != null) reader.close(); + if (directory != null) directory.close(); + searcher = null; + reader = null; + directory = null; + log.info("Test finished"); + } + + @Test + public void testVectorSearch() throws IOException { + Random random = random(); + int numQueries = random.nextInt(NUM_QUERIES_LIMIT) + 1; + int topK = Math.min(random.nextInt(TOP_K_LIMIT) + 1, dataset.length); + + if (dataset.length < topK) topK = dataset.length; + + float[][] queries = generateQueries(random, dataset[0].length, numQueries); + List> expected 
= generateExpectedResults(topK, dataset, queries); + + log.info("Dataset size: " + dataset.length + "x" + dataset[0].length); + log.info("Query size: " + numQueries + "x" + queries[0].length); + log.info("TopK: " + topK); + + // Query query = new CuVSKnnFloatVectorQuery("vector", queries[0], topK, topK, 1); + Query query = new KnnFloatVectorQuery("vector", queries[0], topK); + int correct[] = new int[topK]; + for (int i = 0; i < topK; i++) correct[i] = expected.get(0).get(i); + + ScoreDoc[] hits = searcher.search(query, topK).scoreDocs; + log.info("RESULTS: " + Arrays.toString(hits)); + log.info("EXPECTD: " + expected.get(0)); + + for (ScoreDoc hit : hits) { + log.info("\t" + reader.storedFields().document(hit.doc).get("id") + ": " + hit.score); + } + + for (ScoreDoc hit : hits) { + int doc = Integer.parseInt(reader.storedFields().document(hit.doc).get("id")); + assertTrue("Result returned was not in topk*2: " + doc, expected.get(0).contains(doc)); + } + } + + private static float[][] generateQueries(Random random, int dimensions, int numQueries) { + // Generate random query vectors + float[][] queries = new float[numQueries][dimensions]; + for (int i = 0; i < numQueries; i++) { + for (int j = 0; j < dimensions; j++) { + queries[i][j] = random.nextFloat() * 100; + } + } + return queries; + } + + private static float[][] generateDataset(Random random, int datasetSize, int dimensions) { + // Generate a random dataset + float[][] dataset = new float[datasetSize][dimensions]; + for (int i = 0; i < datasetSize; i++) { + for (int j = 0; j < dimensions; j++) { + dataset[i][j] = random.nextFloat() * 100; + } + } + return dataset; + } + + private static List> generateExpectedResults( + int topK, float[][] dataset, float[][] queries) { + List> neighborsResult = new ArrayList<>(); + int dimensions = dataset[0].length; + + for (float[] query : queries) { + Map distances = new TreeMap<>(); + for (int j = 0; j < dataset.length; j++) { + double distance = 0; + for (int k = 0; k < 
dimensions; k++) { + distance += (query[k] - dataset[j][k]) * (query[k] - dataset[j][k]); + } + distances.put(j, (distance)); + } + + Map sorted = new TreeMap(distances); + log.info("EXPECTED: " + sorted); + + // Sort by distance and select the topK nearest neighbors + List neighbors = + distances.entrySet().stream() + .sorted(Map.Entry.comparingByValue()) + .map(Map.Entry::getKey) + .toList(); + neighborsResult.add( + neighbors.subList( + 0, + Math.min( + topK * 3, + dataset.length))); // generate double the topK results in the expected array + } + + log.info("Expected results generated successfully."); + return neighborsResult; + } +} diff --git a/lucene/sandbox/src/test/org/apache/lucene/sandbox/vectorsearch/TestCuVSVectorsFormat.java b/lucene/sandbox/src/test/org/apache/lucene/sandbox/vectorsearch/TestCuVSVectorsFormat.java new file mode 100644 index 000000000000..dbbdecf82ec9 --- /dev/null +++ b/lucene/sandbox/src/test/org/apache/lucene/sandbox/vectorsearch/TestCuVSVectorsFormat.java @@ -0,0 +1,136 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.lucene.sandbox.vectorsearch; + +import static org.apache.lucene.index.VectorSimilarityFunction.EUCLIDEAN; + +import java.util.List; +import org.apache.lucene.codecs.Codec; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.document.KnnFloatVectorField; +import org.apache.lucene.document.StringField; +import org.apache.lucene.index.DirectoryReader; +import org.apache.lucene.index.FloatVectorValues; +import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.LeafReader; +import org.apache.lucene.index.LeafReaderContext; +import org.apache.lucene.index.VectorEncoding; +import org.apache.lucene.store.Directory; +import org.apache.lucene.tests.index.BaseKnnVectorsFormatTestCase; +import org.apache.lucene.tests.util.TestUtil; +import org.junit.BeforeClass; + +public class TestCuVSVectorsFormat extends BaseKnnVectorsFormatTestCase { + + @BeforeClass + public static void beforeClass() { + assumeTrue("cuvs is not supported", CuVSVectorsFormat.supported()); + } + + @Override + protected Codec getCodec() { + return TestUtil.alwaysKnnVectorsFormat(new CuVSVectorsFormat()); + // For convenience, to sanitize the test code, one can comment out + // the supported check and use another format, e.g. 
+ // return TestUtil.alwaysKnnVectorsFormat(new Lucene99HnswVectorsFormat()); + } + + @Override + protected List supportedVectorEncodings() { + return List.of(VectorEncoding.FLOAT32); + } + + public void testMergeTwoSegsWithASingleDocPerSeg() throws Exception { + float[][] f = new float[][] {randomVector(384), randomVector(384)}; + try (Directory dir = newDirectory(); + IndexWriter w = new IndexWriter(dir, newIndexWriterConfig())) { + Document doc1 = new Document(); + doc1.add(new StringField("id", "0", Field.Store.NO)); + doc1.add(new KnnFloatVectorField("f", f[0], EUCLIDEAN)); + w.addDocument(doc1); + w.commit(); + Document doc2 = new Document(); + doc2.add(new StringField("id", "1", Field.Store.NO)); + doc2.add(new KnnFloatVectorField("f", f[1], EUCLIDEAN)); + w.addDocument(doc2); + w.flush(); + w.commit(); + + // sanity - verify one doc per leaf + try (DirectoryReader reader = DirectoryReader.open(w)) { + List subReaders = reader.leaves(); + assertEquals(2, subReaders.size()); + assertEquals(1, subReaders.get(0).reader().getFloatVectorValues("f").size()); + assertEquals(1, subReaders.get(1).reader().getFloatVectorValues("f").size()); + } + + // now merge to a single segment + w.forceMerge(1); + + // verify merged content + try (DirectoryReader reader = DirectoryReader.open(w)) { + LeafReader r = getOnlyLeafReader(reader); + FloatVectorValues values = r.getFloatVectorValues("f"); + assertNotNull(values); + assertEquals(2, values.size()); + assertArrayEquals(f[0], values.vectorValue(0), 0.0f); + assertArrayEquals(f[1], values.vectorValue(1), 0.0f); + } + } + } + + // Basic test for multiple vectors fields per document + public void testTwoVectorFieldsPerDoc() throws Exception { + float[][] f1 = new float[][] {randomVector(384), randomVector(384)}; + float[][] f2 = new float[][] {randomVector(384), randomVector(384)}; + try (Directory dir = newDirectory(); + IndexWriter w = new IndexWriter(dir, newIndexWriterConfig())) { + Document doc1 = new Document(); + 
doc1.add(new StringField("id", "0", Field.Store.NO)); + doc1.add(new KnnFloatVectorField("f1", f1[0], EUCLIDEAN)); + doc1.add(new KnnFloatVectorField("f2", f2[0], EUCLIDEAN)); + w.addDocument(doc1); + Document doc2 = new Document(); + doc2.add(new StringField("id", "1", Field.Store.NO)); + doc2.add(new KnnFloatVectorField("f1", f1[1], EUCLIDEAN)); + doc2.add(new KnnFloatVectorField("f2", f2[1], EUCLIDEAN)); + w.addDocument(doc2); + w.forceMerge(1); + + try (DirectoryReader reader = DirectoryReader.open(w)) { + LeafReader r = getOnlyLeafReader(reader); + FloatVectorValues values = r.getFloatVectorValues("f1"); + assertNotNull(values); + assertEquals(2, values.size()); + assertArrayEquals(f1[0], values.vectorValue(0), 0.0f); + assertArrayEquals(f1[1], values.vectorValue(1), 0.0f); + + values = r.getFloatVectorValues("f2"); + assertNotNull(values); + assertEquals(2, values.size()); + assertArrayEquals(f2[0], values.vectorValue(0), 0.0f); + assertArrayEquals(f2[1], values.vectorValue(1), 0.0f); + + // opportunistically check boundary condition - search with a 0 topK + var topDocs = r.searchNearestVectors("f1", randomVector(384), 0, null, 10); + assertEquals(0, topDocs.scoreDocs.length); + assertEquals(0, topDocs.totalHits.value()); + } + } + } +} diff --git a/lucene/sandbox/src/test/org/apache/lucene/sandbox/vectorsearch/TestIndexOutputOutputStream.java b/lucene/sandbox/src/test/org/apache/lucene/sandbox/vectorsearch/TestIndexOutputOutputStream.java new file mode 100644 index 000000000000..e2e2b7600e9d --- /dev/null +++ b/lucene/sandbox/src/test/org/apache/lucene/sandbox/vectorsearch/TestIndexOutputOutputStream.java @@ -0,0 +1,102 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.sandbox.vectorsearch; + +import static org.apache.lucene.util.ArrayUtil.copyOfSubArray; + +import java.io.IOException; +import java.util.Random; +import org.apache.lucene.store.IOContext; +import org.apache.lucene.tests.util.LuceneTestCase; + +public class TestIndexOutputOutputStream extends LuceneTestCase { + + public void testBasic() throws IOException { + try (var dir = newDirectory()) { + try (var indexOut = dir.createOutput("test", IOContext.DEFAULT)) { + var out = new IndexOutputOutputStream(indexOut); + out.write(0x56); + out.write(new byte[] {0x10, 0x11, 0x12, 0x13, 0x14}); + out.close(); + } + + try (var indexIn = dir.openInput("test", IOContext.DEFAULT)) { + var in = new IndexInputInputStream(indexIn); + // One bulk read returns the single-byte write followed by the five-byte array. + byte[] ba = new byte[6]; + assertEquals(6, in.read(ba)); + assertArrayEquals(new byte[] {0x56, 0x10, 0x11, 0x12, 0x13, 0x14}, ba); + } + } + } + + public void testGetFilePointer() throws IOException { + try (var dir = newDirectory()) { + try (var indexOut = dir.createOutput("test", IOContext.DEFAULT)) { + var out = new IndexOutputOutputStream(indexOut); + out.write(0x56); + out.write(new byte[] {0x10, 0x11, 0x12}); + assertEquals(4, indexOut.getFilePointer()); + out.close(); + } + } + } + + public void testWithRandom() throws IOException { + byte[] data = new byte[Math.min(atLeast(10_000), 20_000)]; + Random
random = random(); + random.nextBytes(data); + + try (var dir = newDirectory()) { + try (var indexOut = dir.createOutput("test", IOContext.DEFAULT)) { + var out = new IndexOutputOutputStream(indexOut); + int i = 0; + while (i < data.length) { + if (random.nextBoolean()) { + out.write(data[i]); + i++; + } else { + int numBytes = random.nextInt(Math.min(data.length - i, 100)); + out.write(data, i, numBytes); + i += numBytes; + } + } + out.close(); + } + + try (var indexIn = dir.openInput("test", IOContext.DEFAULT)) { + var in = new IndexInputInputStream(indexIn); + int i = 0; + while (i < data.length) { + if (random.nextBoolean()) { + int b = in.read(); + assertEquals(data[i], b); + i++; + } else { + int numBytes = random.nextInt(Math.min(data.length - i, 100)); + byte[] ba = new byte[numBytes]; + assertEquals(numBytes, in.read(ba, 0, numBytes)); + assertArrayEquals(copyOfSubArray(data, i, i + numBytes), ba); + i += numBytes; + } + } + assertEquals(-1, in.read()); + assertEquals(-1, in.read(new byte[2])); + } + } + } +} diff --git a/lucene/test-framework/src/java/org/apache/lucene/tests/index/BaseKnnVectorsFormatTestCase.java b/lucene/test-framework/src/java/org/apache/lucene/tests/index/BaseKnnVectorsFormatTestCase.java index 97b578e7c5cd..ed1a76133968 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/tests/index/BaseKnnVectorsFormatTestCase.java +++ b/lucene/test-framework/src/java/org/apache/lucene/tests/index/BaseKnnVectorsFormatTestCase.java @@ -296,6 +296,7 @@ public KnnVectorsFormat knnVectorsFormat() { } public void testMergingWithDifferentByteKnnFields() throws Exception { + assumeTrue("bytes not supported", supportedVectorEncodings().contains(VectorEncoding.BYTE)); try (var dir = newDirectory()) { IndexWriterConfig iwc = new IndexWriterConfig(); Codec codec = getCodec(); @@ -994,6 +995,7 @@ public void testFloatVectorScorerIteration() throws Exception { } public void testByteVectorScorerIteration() throws Exception { + assumeTrue("bytes not supported",
supportedVectorEncodings().contains(VectorEncoding.BYTE)); IndexWriterConfig iwc = newIndexWriterConfig(); if (random().nextBoolean()) { iwc.setIndexSort(new Sort(new SortField("sortkey", SortField.Type.INT))); @@ -1081,6 +1083,7 @@ public void testEmptyFloatVectorData() throws Exception { } public void testEmptyByteVectorData() throws Exception { + assumeTrue("bytes not supported", supportedVectorEncodings().contains(VectorEncoding.BYTE)); try (Directory dir = newDirectory(); IndexWriter w = new IndexWriter(dir, newIndexWriterConfig())) { var doc1 = new Document(); @@ -1112,11 +1115,16 @@ protected VectorSimilarityFunction randomSimilarity() { } /** - * This method is overrideable since old codec versions only support {@link - * VectorEncoding#FLOAT32}. + * The vector encodings supported by the format. Defaults to all VectorEncoding.values(). Override + * if the format only supports a subset of these encodings. */ + protected List<VectorEncoding> supportedVectorEncodings() { + return Arrays.stream(VectorEncoding.values()).toList(); + } + protected VectorEncoding randomVectorEncoding() { - return VectorEncoding.values()[random().nextInt(VectorEncoding.values().length)]; + var encodings = supportedVectorEncodings(); + return encodings.get(random().nextInt(encodings.size())); } public void testIndexedValueNotAliased() throws Exception { @@ -1193,6 +1201,7 @@ public void testSortedIndex() throws Exception { } public void testSortedIndexBytes() throws Exception { + assumeTrue("bytes not supported", supportedVectorEncodings().contains(VectorEncoding.BYTE)); IndexWriterConfig iwc = newIndexWriterConfig(); iwc.setIndexSort(new Sort(new SortField("sortkey", SortField.Type.INT))); String fieldName = "field"; @@ -1361,6 +1370,7 @@ public void testRandom() throws Exception { * back consistently.
*/ public void testRandomBytes() throws Exception { + assumeTrue("bytes not supported", supportedVectorEncodings().contains(VectorEncoding.BYTE)); IndexWriterConfig iwc = newIndexWriterConfig(); if (random().nextBoolean()) { iwc.setIndexSort(new Sort(new SortField("sortkey", SortField.Type.INT))); @@ -1875,6 +1885,7 @@ public void testVectorValuesReportCorrectDocs() throws Exception { } public void testMismatchedFields() throws Exception { + assumeTrue("bytes not supported", supportedVectorEncodings().contains(VectorEncoding.BYTE)); Directory dir1 = newDirectory(); IndexWriter w1 = new IndexWriter(dir1, newIndexWriterConfig()); Document doc = new Document(); diff --git a/versions.lock b/versions.lock index 07f8ff30543d..a98d277acf2c 100644 --- a/versions.lock +++ b/versions.lock @@ -4,6 +4,7 @@ "main_dependencies" : { "com.carrotsearch.randomizedtesting:randomizedtesting-runner:2.8.1" : "fa9ef26b,refs=4", "com.ibm.icu:icu4j:74.2" : "47ea4550,refs=6", + "com.nvidia.cuvs:cuvs-java:25.02.0" : "0129b4f0,refs=6", "commons-codec:commons-codec:1.17.2" : "e9962aab,refs=4", "io.sgr:s2-geometry-library-java:1.0.0" : "cbc357ab,refs=4", "junit:junit:4.13.1" : "fa9ef26b,refs=4", @@ -11,6 +12,7 @@ "net.sourceforge.nekohtml:nekohtml:1.9.17" : "5ce8cdc6,refs=2", "org.antlr:antlr4-runtime:4.11.1" : "d9953130,refs=4", "org.apache.commons:commons-compress:1.19" : "5ce8cdc6,refs=2", + "org.apache.commons:commons-lang3:3.17.0" : "0129b4f0,refs=6", "org.apache.commons:commons-math3:3.6.1" : "85a1e4c6,refs=2", "org.apache.opennlp:opennlp-tools:2.5.3" : "2f760bab,refs=4", "org.carrot2:morfologik-fsa:2.1.9" : "79af844b,refs=4", @@ -46,6 +48,7 @@ "com.google.j2objc:j2objc-annotations:1.3" : "6897bc09,refs=38", "com.google.protobuf:protobuf-java:3.19.2" : "6897bc09,refs=38", "com.ibm.icu:icu4j:74.2" : "ffa00415,refs=8", + "com.nvidia.cuvs:cuvs-java:25.02.0" : "7ac6f8d9,refs=9", "commons-codec:commons-codec:1.17.2" : "733734f0,refs=6", "io.github.java-diff-utils:java-diff-utils:4.0" : 
"6897bc09,refs=38", "io.sgr:s2-geometry-library-java:1.0.0" : "1d5a4b2b,refs=4", @@ -55,6 +58,7 @@ "net.sourceforge.nekohtml:nekohtml:1.9.17" : "6f16ff86,refs=2", "org.antlr:antlr4-runtime:4.11.1" : "6fbc4021,refs=5", "org.apache.commons:commons-compress:1.19" : "6f16ff86,refs=2", + "org.apache.commons:commons-lang3:3.17.0" : "7ac6f8d9,refs=9", "org.apache.commons:commons-math3:3.6.1" : "152d9f78,refs=3", "org.apache.opennlp:opennlp-tools:2.5.3" : "b91715f0,refs=6", "org.assertj:assertj-core:3.21.0" : "b7ba1646,refs=2", @@ -79,6 +83,32 @@ } }, "because" : { + "0129b4f0" : [ + { + "configuration" : "runtimeClasspath", + "projectPath" : ":lucene:benchmark" + }, + { + "configuration" : "runtimeClasspath", + "projectPath" : ":lucene:demo" + }, + { + "configuration" : "runtimeClasspath", + "projectPath" : ":lucene:luke" + }, + { + "configuration" : "runtimeClasspath", + "projectPath" : ":lucene:queryparser" + }, + { + "configuration" : "compileClasspath", + "projectPath" : ":lucene:sandbox" + }, + { + "configuration" : "runtimeClasspath", + "projectPath" : ":lucene:sandbox" + } + ], "152d9f78" : [ { "configuration" : "annotationProcessor", @@ -405,6 +435,44 @@ "projectPath" : ":lucene:analysis:morfologik" } ], + "7ac6f8d9" : [ + { + "configuration" : "testRuntimeClasspath", + "projectPath" : ":lucene:benchmark" + }, + { + "configuration" : "testRuntimeClasspath", + "projectPath" : ":lucene:demo" + }, + { + "configuration" : "testRuntimeClasspath", + "projectPath" : ":lucene:highlighter" + }, + { + "configuration" : "testRuntimeClasspath", + "projectPath" : ":lucene:luke" + }, + { + "configuration" : "testRuntimeClasspath", + "projectPath" : ":lucene:memory" + }, + { + "configuration" : "testRuntimeClasspath", + "projectPath" : ":lucene:monitor" + }, + { + "configuration" : "testRuntimeClasspath", + "projectPath" : ":lucene:queryparser" + }, + { + "configuration" : "testCompileClasspath", + "projectPath" : ":lucene:sandbox" + }, + { + "configuration" : 
"testRuntimeClasspath", + "projectPath" : ":lucene:sandbox" + } + ], "85a1e4c6" : [ { "configuration" : "compileClasspath", @@ -932,4 +1000,4 @@ } ] } -} \ No newline at end of file +} diff --git a/versions.toml b/versions.toml index 679287f9d7db..7688f235a691 100644 --- a/versions.toml +++ b/versions.toml @@ -4,6 +4,8 @@ asm = "9.6" assertj = "3.21.0" commons-codec = "1.17.2" commons-compress = "1.19" +commons-lang3 = "3.17.0" +cuvs = "25.02.0" ecj = "3.36.0" errorprone = "2.18.0" flexmark = "0.61.24" @@ -42,6 +44,8 @@ asm-core = { module = "org.ow2.asm:asm", version.ref = "asm" } assertj = { module = "org.assertj:assertj-core", version.ref = "assertj" } commons-codec = { module = "commons-codec:commons-codec", version.ref = "commons-codec" } commons-compress = { module = "org.apache.commons:commons-compress", version.ref = "commons-compress" } +commons-lang3 = { module = "org.apache.commons:commons-lang3", version.ref = "commons-lang3" } +cuvs = { module = "com.nvidia.cuvs:cuvs-java", version.ref = "cuvs" } ecj = { module = "org.eclipse.jdt:ecj", version.ref = "ecj" } errorprone = { module = "com.google.errorprone:error_prone_core", version.ref = "errorprone" } flexmark-core = { module = "com.vladsch.flexmark:flexmark", version.ref = "flexmark" }