Skip to content

Commit 5255eb3

Browse files
Forbid empty doc values on vector functions (#43944)
Currently, when a document is missing a vector value, a vector function returns 0 as the score for that document. We think this is incorrect behaviour. With this change, an error is thrown if vector functions are used with documents that are missing vector doc values. Also, VectorScriptDocValues is modified to support the size() function, which can be used to check whether a document has a value for the vector field.
1 parent 4db27c1 commit 5255eb3

File tree

5 files changed

+54
-10
lines changed

5 files changed

+54
-10
lines changed

docs/reference/query-dsl/script-score-query.asciidoc

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -195,8 +195,16 @@ between a given query vector and document vectors.
195195
// NOTCONSOLE
196196

197197
NOTE: If a document doesn't have a value for a vector field on which
198-
a vector function is executed, 0 is returned as a result
199-
for this document.
198+
a vector function is executed, an error will be thrown.
199+
200+
You can check whether a document is missing a value for the field `my_vector`
201+
with `doc['my_vector'].size() == 0`. Your overall script can look like this:
202+
203+
[source,js]
204+
--------------------------------------------------
205+
"source": "doc['my_vector'].size() == 0 ? 0 : cosineSimilarity(params.queryVector, doc['my_vector'])"
206+
--------------------------------------------------
207+
// NOTCONSOLE
200208

201209
NOTE: If a document's dense vector field has a number of dimensions
202210
different from the query's vector, an error will be thrown.

x-pack/plugin/src/test/resources/rest-api-spec/test/vectors/20_dense_vector_special_cases.yml

Lines changed: 19 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -131,7 +131,7 @@ setup:
131131
- match: { error.root_cause.0.type: "script_exception" }
132132

133133
---
134-
"Distance functions for documents missing vector field should return 0":
134+
"Documents missing a vector field":
135135
- do:
136136
index:
137137
index: test-index
@@ -149,7 +149,9 @@ setup:
149149
- do:
150150
indices.refresh: {}
151151

152+
# expect an error when a document is missing the vector field
152153
- do:
154+
catch: bad_request
153155
headers:
154156
Content-Type: application/json
155157
search:
@@ -162,6 +164,22 @@ setup:
162164
source: "cosineSimilarity(params.query_vector, doc['my_dense_vector'])"
163165
params:
164166
query_vector: [10.0, 10.0, 10.0]
167+
- match: { error.root_cause.0.type: "script_exception" }
168+
169+
# guard against missing values by checking size()
170+
- do:
171+
headers:
172+
Content-Type: application/json
173+
search:
174+
rest_total_hits_as_int: true
175+
body:
176+
query:
177+
script_score:
178+
query: {match_all: {} }
179+
script:
180+
source: "doc['my_dense_vector'].size() == 0 ? 0 : cosineSimilarity(params.query_vector, doc['my_dense_vector'])"
181+
params:
182+
query_vector: [10.0, 10.0, 10.0]
165183

166184
- match: {hits.total: 2}
167185
- match: {hits.hits.0._id: "1"}

x-pack/plugin/src/test/resources/rest-api-spec/test/vectors/40_sparse_vector_special_cases.yml

Lines changed: 19 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -87,7 +87,7 @@ setup:
8787
- match: {hits.hits.2._id: "3"}
8888

8989
---
90-
"Distance functions for documents missing vector field should return 0":
90+
"Documents missing a vector field":
9191
- do:
9292
index:
9393
index: test-index
@@ -105,7 +105,9 @@ setup:
105105
- do:
106106
indices.refresh: {}
107107

108+
# expect an error when a document is missing the vector field
108109
- do:
110+
catch: bad_request
109111
headers:
110112
Content-Type: application/json
111113
search:
@@ -118,6 +120,22 @@ setup:
118120
source: "cosineSimilaritySparse(params.query_vector, doc['my_sparse_vector'])"
119121
params:
120122
query_vector: {"1": 10.0}
123+
- match: { error.root_cause.0.type: "script_exception" }
124+
125+
# guard against missing values by checking size()
126+
- do:
127+
headers:
128+
Content-Type: application/json
129+
search:
130+
rest_total_hits_as_int: true
131+
body:
132+
query:
133+
script_score:
134+
query: {match_all: {} }
135+
script:
136+
source: "doc['my_sparse_vector'].size() == 0 ? 0 : cosineSimilaritySparse(params.query_vector, doc['my_sparse_vector'])"
137+
params:
138+
query_vector: {"1": 10.0}
121139

122140
- match: {hits.total: 2}
123141
- match: {hits.hits.0._id: "1"}

x-pack/plugin/vectors/src/main/java/org/elasticsearch/xpack/vectors/query/ScoreScriptUtils.java

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,6 @@ public class ScoreScriptUtils {
2828
*/
2929
public static double dotProduct(List<Number> queryVector, VectorScriptDocValues.DenseVectorScriptDocValues dvs){
3030
BytesRef value = dvs.getEncodedValue();
31-
if (value == null) return 0;
3231
float[] docVector = VectorEncoderDecoder.decodeDenseVector(value);
3332
if (queryVector.size() != docVector.length) {
3433
throw new IllegalArgumentException("Can't calculate dotProduct! The number of dimensions of the query vector [" +
@@ -63,7 +62,6 @@ public CosineSimilarity(List<Number> queryVector) {
6362

6463
public double cosineSimilarity(VectorScriptDocValues.DenseVectorScriptDocValues dvs) {
6564
BytesRef value = dvs.getEncodedValue();
66-
if (value == null) return 0;
6765
float[] docVector = VectorEncoderDecoder.decodeDenseVector(value);
6866
if (queryVector.size() != docVector.length) {
6967
throw new IllegalArgumentException("Can't calculate cosineSimilarity! The number of dimensions of the query vector [" +
@@ -129,7 +127,6 @@ public DotProductSparse(Map<String, Number> queryVector) {
129127

130128
public double dotProductSparse(VectorScriptDocValues.SparseVectorScriptDocValues dvs) {
131129
BytesRef value = dvs.getEncodedValue();
132-
if (value == null) return 0;
133130
int[] docDims = VectorEncoderDecoder.decodeSparseVectorDims(value);
134131
float[] docValues = VectorEncoderDecoder.decodeSparseVector(value);
135132
return intDotProductSparse(queryValues, queryDims, docValues, docDims);
@@ -174,7 +171,6 @@ public CosineSimilaritySparse(Map<String, Number> queryVector) {
174171

175172
public double cosineSimilaritySparse(VectorScriptDocValues.SparseVectorScriptDocValues dvs) {
176173
BytesRef value = dvs.getEncodedValue();
177-
if (value == null) return 0;
178174
int[] docDims = VectorEncoderDecoder.decodeSparseVectorDims(value);
179175
float[] docValues = VectorEncoderDecoder.decodeSparseVector(value);
180176

x-pack/plugin/vectors/src/main/java/org/elasticsearch/xpack/vectors/query/VectorScriptDocValues.java

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -41,12 +41,16 @@ BytesRef getEncodedValue() {
4141

4242
@Override
4343
public BytesRef get(int index) {
44-
throw new UnsupportedOperationException("vector fields may only be used via vector functions in scripts");
44+
throw new UnsupportedOperationException("accessing a vector field's value through 'get' or 'value' is not supported");
4545
}
4646

4747
@Override
4848
public int size() {
49-
throw new UnsupportedOperationException("vector fields may only be used via vector functions in scripts");
49+
if (value == null) {
50+
return 0;
51+
} else {
52+
return 1;
53+
}
5054
}
5155

5256
// not final, as it needs to be extended by Mockito for tests

0 commit comments

Comments
 (0)