opensearch-project · jainankitk · Apr 21, 2025 · Apr 11, 2025 · Apr 12, 2025 · Apr 11, 2025
@@ -12,6 +12,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 - [Rule Based Auto-tagging] Add in-memory attribute value store ([#17342](https://github.com/opensearch-project/OpenSearch/pull/17342))
 - [Rule Based Auto-tagging] Add in-memory rule processing service ([#17365](https://github.com/opensearch-project/OpenSearch/pull/17365))
 - Change priority for scheduling reroute during timeout([#16445](https://github.com/opensearch-project/OpenSearch/pull/16445))
+- Faster `terms_query` with already sorted terms ([#17714](https://github.com/opensearch-project/OpenSearch/pull/17714))
 
 ### Dependencies
 - Bump `dnsjava:dnsjava` from 3.6.2 to 3.6.3 ([#17231](https://github.com/opensearch-project/OpenSearch/pull/17231))

@@ -0,0 +1,182 @@
+/*
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * The OpenSearch Contributors require contributions made to
+ * this file be licensed under the Apache-2.0 license or a
+ * compatible open source license.
+ */
+
+package org.opensearch.index.mapper;
+
+import org.apache.lucene.search.TermInSetQuery;
+import org.apache.lucene.util.BytesRef;
+
+import java.util.AbstractSet;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.Comparator;
+import java.util.Iterator;
+import java.util.List;
+import java.util.SortedSet;
+import java.util.function.Consumer;
+import java.util.function.Function;
+import java.util.function.Supplier;
+
+/**
+ * Purposed for passing terms into {@link TermInSetQuery}.
+ * If the given terms are sorted already, it wrap it with a SortedSet stub.
+ * Otherwise, it passes terms as list.
+ */
+public class BytesRefsCollectionBuilder implements Consumer<BytesRef>, Supplier<Collection<BytesRef>> {
+
+    /**
+     * Strategy for building BytesRef collection.
+     * */
+    protected interface ConsumerStrategy extends Function<BytesRef, ConsumerStrategy>, Supplier<Collection<BytesRef>> {}
+
+    public BytesRefsCollectionBuilder(int sizeExpected) {
+        terms = new ArrayList<>(sizeExpected);
+    }
+
+    protected final List<BytesRef> terms;
+    protected ConsumerStrategy delegate = createStartStrategy();
+
+    @Override
+    public void accept(BytesRef bytesRef) {
+        delegate = delegate.apply(bytesRef);
+    }
+
+    @Override
+    public Collection<BytesRef> get() {
+        Collection<BytesRef> result = delegate.get();
+        delegate = createFrozenStrategy(result);
+        return result;
+    }
+
+    protected ConsumerStrategy createStartStrategy() {
+        return new ConsumerStrategy() {
+            @Override
+            public ConsumerStrategy apply(BytesRef firstBytes) {
+                terms.add(firstBytes); // firstly, just store
+                return createSortedStrategy(firstBytes);
+            }
+
+            @Override
+            public Collection<BytesRef> get() {
+                return terms; // empty list
+            }
+        };
+    }
+
+    protected ConsumerStrategy createSortedStrategy(BytesRef firstBytes) {
+        return new ConsumerStrategy() {
+            BytesRef prev = firstBytes;
+
+            @Override
+            public ConsumerStrategy apply(BytesRef bytesRef) {
+                terms.add(bytesRef);
+                if (bytesRef.compareTo(prev) >= 0) { // keep checking sorted
+                    prev = bytesRef;
+                    return this;
+                } else { // isn't sorted
+                    return createUnsortedStrategy();
+                }
+            }
+
+            @Override
+            public Collection<BytesRef> get() {
+                return new SortedBytesSet(terms);
+            }
+        };
+    }
+
+    protected ConsumerStrategy createUnsortedStrategy() {
+        return new ConsumerStrategy() {
+            @Override
+            public ConsumerStrategy apply(BytesRef bytesRef) { // just storing
+                terms.add(bytesRef);
+                return this;
+            }
+
+            @Override
+            public Collection<BytesRef> get() {
+                return terms;
+            }
+        };
+    }
+
+    protected ConsumerStrategy createFrozenStrategy(Collection<BytesRef> result) {
+        return new ConsumerStrategy() {
+
+            @Override
+            public ConsumerStrategy apply(BytesRef bytesRef) {
+                throw new IllegalStateException("already build");
+            }
+
+            @Override
+            public Collection<BytesRef> get() {
+                return result;
+            }
+        };
+    }
+
+    /**
+     * {@link SortedSet<BytesRef>} for passing into TermInSetQuery()
+     * */
+    protected static class SortedBytesSet extends AbstractSet<BytesRef> implements SortedSet<BytesRef> {
+
+        private final List<BytesRef> bytesRefs;
+
+        public SortedBytesSet(List<BytesRef> bytesRefs) {
+            this.bytesRefs = bytesRefs;
+        }
+
+        @Override
+        public Iterator<BytesRef> iterator() {
+            return bytesRefs.iterator();
+        }
+
+        @Override
+        public int size() {
+            return bytesRefs.size();
+        }
+
+        @Override
+        public Comparator<? super BytesRef> comparator() {
+            return null;
+        }
+
+        @Override
+        public SortedSet<BytesRef> subSet(BytesRef fromElement, BytesRef toElement) {
+            throw new UnsupportedOperationException();
+        }
+
+        @Override
+        public SortedSet<BytesRef> headSet(BytesRef toElement) {
+            throw new UnsupportedOperationException();
+        }
+
+        @Override
+        public SortedSet<BytesRef> tailSet(BytesRef fromElement) {
+            throw new UnsupportedOperationException();
+        }
+
+        @Override
+        public BytesRef first() {
+            throw new UnsupportedOperationException();
+        }
+
+        @Override
+        public BytesRef last() {
+            throw new UnsupportedOperationException();
+        }
+
+        /**
+         * Dedicated for {@link TermInSetQuery#TermInSetQuery(String, Collection)}.
+         */
+        @Override
+        public <T> T[] toArray(T[] a) {
+            return bytesRefs.toArray(a);
+        }
+    }
+}
@@ -163,15 +163,15 @@ public Query existsQuery(QueryShardContext context) {
         @Override
         public Query termsQuery(List<?> values, QueryShardContext context) {
             failIfNotIndexed();
-            BytesRef[] bytesRefs = new BytesRef[values.size()];
-            for (int i = 0; i < bytesRefs.length; i++) {
+            BytesRefsCollectionBuilder bytesRefs = new BytesRefsCollectionBuilder(values.size());
+            for (int i = 0; i < values.size(); i++) {
                 Object idObject = values.get(i);
                 if (idObject instanceof BytesRef) {
                     idObject = ((BytesRef) idObject).utf8ToString();
                 }
-                bytesRefs[i] = Uid.encodeId(idObject.toString());
+                bytesRefs.accept(Uid.encodeId(idObject.toString()));
             }
-            return new TermInSetQuery(name(), bytesRefs);
+            return new TermInSetQuery(name(), bytesRefs.get());
         }
 
         @Override

@@ -447,23 +447,26 @@ public Query termsQuery(List<?> values, QueryShardContext context) {
                 if (!context.keywordFieldIndexOrDocValuesEnabled()) {
                     return super.termsQuery(values, context);
                 }
-                BytesRef[] iBytesRefs = new BytesRef[values.size()];
-                BytesRef[] dVByteRefs = new BytesRef[values.size()];
-                for (int i = 0; i < iBytesRefs.length; i++) {
-                    iBytesRefs[i] = indexedValueForSearch(values.get(i));
-                    dVByteRefs[i] = indexedValueForSearch(rewriteForDocValue(values.get(i)));
+                BytesRefsCollectionBuilder iBytesRefs = new BytesRefsCollectionBuilder(values.size());
+                BytesRefsCollectionBuilder dVByteRefs = new BytesRefsCollectionBuilder(values.size());
+                for (int i = 0; i < values.size(); i++) {
+                    BytesRef idxBytes = indexedValueForSearch(values.get(i));
+                    iBytesRefs.accept(idxBytes);
+                    BytesRef dvBytes = indexedValueForSearch(rewriteForDocValue(values.get(i)));
+                    dVByteRefs.accept(dvBytes);
                 }
-                Query indexQuery = new TermInSetQuery(name(), iBytesRefs);
-                Query dvQuery = new TermInSetQuery(MultiTermQuery.DOC_VALUES_REWRITE, name(), dVByteRefs);
+                Query indexQuery = new TermInSetQuery(name(), iBytesRefs.get());
+                Query dvQuery = new TermInSetQuery(MultiTermQuery.DOC_VALUES_REWRITE, name(), dVByteRefs.get());
                 return new IndexOrDocValuesQuery(indexQuery, dvQuery);
             }
             // if we only have doc_values enabled, we construct a new query with doc_values re-written
             if (hasDocValues()) {
-                BytesRef[] bytesRefs = new BytesRef[values.size()];
-                for (int i = 0; i < bytesRefs.length; i++) {
-                    bytesRefs[i] = indexedValueForSearch(rewriteForDocValue(values.get(i)));
+                BytesRefsCollectionBuilder bytesCollector = new BytesRefsCollectionBuilder(values.size());
+                for (int i = 0; i < values.size(); i++) {
+                    BytesRef dvBytes = indexedValueForSearch(rewriteForDocValue(values.get(i)));
+                    bytesCollector.accept(dvBytes);
                 }
-                return new TermInSetQuery(MultiTermQuery.DOC_VALUES_REWRITE, name(), bytesRefs);
+                return new TermInSetQuery(MultiTermQuery.DOC_VALUES_REWRITE, name(), bytesCollector.get());
             }
             // has index enabled, we're going to return the query as is
             return super.termsQuery(values, context);

@@ -93,11 +93,12 @@ public Query termQuery(Object value, QueryShardContext context) {
     @Override
     public Query termsQuery(List<?> values, QueryShardContext context) {
         failIfNotIndexed();
-        BytesRef[] bytesRefs = new BytesRef[values.size()];
-        for (int i = 0; i < bytesRefs.length; i++) {
-            bytesRefs[i] = indexedValueForSearch(values.get(i));
+        BytesRefsCollectionBuilder bytesCollector = new BytesRefsCollectionBuilder(values.size());
+        for (int i = 0; i < values.size(); i++) {
+            BytesRef elem = indexedValueForSearch(values.get(i));
+            bytesCollector.accept(elem);
         }
-        return new TermInSetQuery(name(), bytesRefs);
+        return new TermInSetQuery(name(), bytesCollector.get());
     }
 
 }
@@ -0,0 +1,85 @@
+/*
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * The OpenSearch Contributors require contributions made to
+ * this file be licensed under the Apache-2.0 license or a
+ * compatible open source license.
+ */
+
+package org.opensearch.index.mapper;
+
+import org.apache.lucene.util.BytesRef;
+import org.opensearch.test.OpenSearchTestCase;
+
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.Iterator;
+import java.util.List;
+import java.util.SortedSet;
+import java.util.stream.Collectors;
+import java.util.stream.Stream;
+
+public class BytesRefsCollectionBuilderTests extends OpenSearchTestCase {
+
+    public void testBuildSortedNotSorted() {
+        String[] seedStrings = generateRandomStringArray(10, 10, false, true);
+        List<BytesRef> bytesRefList = Arrays.stream(seedStrings).map(BytesRef::new).collect(Collectors.toList());
+        List<BytesRef> sortedBytesRefs = bytesRefList.stream().sorted().collect(Collectors.toList());
+
+        Collection<BytesRef> sortedSet = assertCollectionBuilt(sortedBytesRefs);
+        assertCollectionBuilt(bytesRefList);
+
+        assertTrue(sortedSet.isEmpty() || sortedSet instanceof SortedSet);
+        if (!sortedSet.isEmpty()) {
+            assertNull(((SortedSet<BytesRef>) sortedSet).comparator());
+        }
+    }
+
+    public void testBuildFooBar() {
+        String[] reverseOrderStrings = new String[] { "foo", "bar" };
+        List<BytesRef> bytesRefList = Arrays.stream(reverseOrderStrings).map(BytesRef::new).collect(Collectors.toList());
+        List<BytesRef> sortedBytesRefs = bytesRefList.stream().sorted().collect(Collectors.toList());
+
+        Collection<BytesRef> sortedSet = assertCollectionBuilt(sortedBytesRefs);
+        Collection<BytesRef> reverseList = assertCollectionBuilt(bytesRefList);
+
+        assertTrue(sortedSet instanceof SortedSet);
+        assertNull(((SortedSet<BytesRef>) sortedSet).comparator());
+
+        assertTrue(reverseList instanceof List);
+    }
+
+    public void testFrozen() {
+        BytesRefsCollectionBuilder builder = new BytesRefsCollectionBuilder(1);
+        String[] seedStrings = generateRandomStringArray(5, 10, false, true);
+        Arrays.stream(seedStrings).map(BytesRef::new).forEachOrdered(builder);
+        Collection<BytesRef> bytesRefCollection = builder.get();
+        assertNotNull(bytesRefCollection);
+        assertEquals(seedStrings.length, bytesRefCollection.size());
+        assertThrows(IllegalStateException.class, () -> builder.accept(new BytesRef("illegal state")));
+        assertSame(bytesRefCollection, builder.get());
+    }
+
+    private static Collection<BytesRef> assertCollectionBuilt(List<BytesRef> sortedBytesRefs) {
+        BytesRefsCollectionBuilder builder = new BytesRefsCollectionBuilder(1);
+        sortedBytesRefs.stream().forEachOrdered(builder);
+        Collection<BytesRef> bytesRefCollection = builder.get();
+        assertEquals(bytesRefCollection.size(), sortedBytesRefs.size());
+        for (Iterator<BytesRef> iterator = bytesRefCollection.iterator(), iterator2 = sortedBytesRefs.iterator(); iterator.hasNext()
+            || iterator2.hasNext();) {
+            assertTrue(iterator.next().bytesEquals(iterator2.next()));
+        }
+        return bytesRefCollection;
+    }
+
+    public void testCoverUnsupported() {
+        BytesRefsCollectionBuilder builder = new BytesRefsCollectionBuilder(1);
+        Stream.of("in", "order").map(BytesRef::new).forEachOrdered(builder);
+        SortedSet<BytesRef> bytesRefCollection = (SortedSet<BytesRef>) builder.get();
+        assertThrows(UnsupportedOperationException.class, () -> bytesRefCollection.subSet(new BytesRef("a"), new BytesRef("z")));
+        assertThrows(UnsupportedOperationException.class, () -> bytesRefCollection.headSet(new BytesRef("a")));
+        assertThrows(UnsupportedOperationException.class, () -> bytesRefCollection.tailSet(new BytesRef("a")));
+        assertThrows(UnsupportedOperationException.class, bytesRefCollection::first);
+        assertThrows(UnsupportedOperationException.class, bytesRefCollection::last);
+    }
+}
@@ -81,6 +81,7 @@
 import java.util.Collections;
 import java.util.List;
 import java.util.Map;
+import java.util.stream.Collectors;
 
 public class KeywordFieldTypeTests extends FieldTypeTestCase {
 
@@ -198,6 +199,27 @@ public void testTermsQuery() {
         );
     }
 
+    public void testTermsSortedQuery() {
+        String[] seedStrings = generateRandomStringArray(10, 10, false, true);
+        List<BytesRef> bytesRefList = Arrays.stream(seedStrings).map(BytesRef::new).collect(Collectors.toList());
+        List<String> sortedStrings = bytesRefList.stream().sorted().map(BytesRef::utf8ToString).collect(Collectors.toList());
+
+        MappedFieldType ft = new KeywordFieldType("field");
+        Query expected = new IndexOrDocValuesQuery(
+            new TermInSetQuery("field", bytesRefList),
+            new TermInSetQuery(MultiTermQuery.DOC_VALUES_REWRITE, "field", bytesRefList)
+        );
+        assertEquals(expected, ft.termsQuery(sortedStrings, MOCK_QSC_ENABLE_INDEX_DOC_VALUES));
+
+        MappedFieldType onlyIndexed = new KeywordFieldType("field", true, false, Collections.emptyMap());
+        Query expectedIndex = new TermInSetQuery("field", bytesRefList);
+        assertEquals(expectedIndex, onlyIndexed.termsQuery(sortedStrings, null));
+
+        MappedFieldType onlyDocValues = new KeywordFieldType("field", false, true, Collections.emptyMap());
+        Query expectedDocValues = new TermInSetQuery(MultiTermQuery.DOC_VALUES_REWRITE, "field", bytesRefList);
+        assertEquals(expectedDocValues, onlyDocValues.termsQuery(sortedStrings, null));
+    }
+
     public void testExistsQuery() {
         {
             KeywordFieldType ft = new KeywordFieldType("field");