1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -20,6 +20,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
- [Stats] Add stats tracking for semantic highlighting ([#1327](https://github.com/opensearch-project/neural-search/pull/1327))
- [Stats] Add stats for text embedding processor with different settings ([#1332](https://github.com/opensearch-project/neural-search/pull/1332))
- Validate model id and analyzer should not be provided at the same time for the neural sparse query ([#1359](https://github.com/opensearch-project/neural-search/pull/1359))
- [Stats] Add stats for score based and rank based normalization processors ([#1326](https://github.com/opensearch-project/neural-search/pull/1326))

### Bug Fixes
- Fix score value as null for single shard when sorting is not done on score field ([#1277](https://github.com/opensearch-project/neural-search/pull/1277))
NormalizationProcessor.java
@@ -7,6 +7,7 @@
import static org.opensearch.neuralsearch.search.util.HybridSearchResultFormatUtil.isHybridQueryStartStopElement;

import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.Optional;
import java.util.stream.Collectors;
@@ -15,8 +16,16 @@
import org.opensearch.action.search.SearchPhaseContext;
import org.opensearch.action.search.SearchPhaseName;
import org.opensearch.action.search.SearchPhaseResults;
import org.opensearch.neuralsearch.processor.combination.ArithmeticMeanScoreCombinationTechnique;
import org.opensearch.neuralsearch.processor.combination.GeometricMeanScoreCombinationTechnique;
import org.opensearch.neuralsearch.processor.combination.HarmonicMeanScoreCombinationTechnique;
import org.opensearch.neuralsearch.processor.combination.ScoreCombinationTechnique;
import org.opensearch.neuralsearch.processor.normalization.L2ScoreNormalizationTechnique;
import org.opensearch.neuralsearch.processor.normalization.MinMaxScoreNormalizationTechnique;
import org.opensearch.neuralsearch.processor.normalization.ScoreNormalizationTechnique;
import org.opensearch.neuralsearch.processor.normalization.ZScoreNormalizationTechnique;
import org.opensearch.neuralsearch.stats.events.EventStatName;
import org.opensearch.neuralsearch.stats.events.EventStatsManager;
import org.opensearch.search.SearchPhaseResult;
import org.opensearch.search.fetch.FetchSearchResult;
import org.opensearch.search.pipeline.PipelineProcessingContext;
@@ -40,6 +49,24 @@ public class NormalizationProcessor extends AbstractScoreHybridizationProcessor
private final ScoreCombinationTechnique combinationTechnique;
private final NormalizationProcessorWorkflow normalizationWorkflow;

private final Map<String, Runnable> normTechniqueIncrementers = Map.of(
Contributor: looks like we have switch/case to increment for some processors, and Runnable as map values for others. We should have one standard approach.
Member: +1. I am more inclined towards using map.
Contributor Author: Sure, can refactor the others as maps.
Member: this map can be static, same for the second map combTechniqueIncrementers.

L2ScoreNormalizationTechnique.TECHNIQUE_NAME,
() -> EventStatsManager.increment(EventStatName.NORM_TECHNIQUE_L2_EXECUTIONS),
MinMaxScoreNormalizationTechnique.TECHNIQUE_NAME,
() -> EventStatsManager.increment(EventStatName.NORM_TECHNIQUE_MINMAX_EXECUTIONS),
ZScoreNormalizationTechnique.TECHNIQUE_NAME,
() -> EventStatsManager.increment(EventStatName.NORM_TECHNIQUE_NORM_ZSCORE_EXECUTIONS)
);

private final Map<String, Runnable> combTechniqueIncrementers = Map.of(
ArithmeticMeanScoreCombinationTechnique.TECHNIQUE_NAME,
() -> EventStatsManager.increment(EventStatName.COMB_TECHNIQUE_ARITHMETIC_EXECUTIONS),
HarmonicMeanScoreCombinationTechnique.TECHNIQUE_NAME,
() -> EventStatsManager.increment(EventStatName.COMB_TECHNIQUE_HARMONIC_EXECUTIONS),
GeometricMeanScoreCombinationTechnique.TECHNIQUE_NAME,
() -> EventStatsManager.increment(EventStatName.COMB_TECHNIQUE_GEOMETRIC_EXECUTIONS)
);
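
A note on the review thread above: since these entries reference no instance state, both lookup tables can be made static, as the reviewer suggests. A minimal sketch of that refactor, assuming only the modifier and naming convention change:

// Static variant of the incrementer maps suggested in review: the entries are
// constants, so one immutable map can be shared across processor instances.
private static final Map<String, Runnable> NORM_TECHNIQUE_INCREMENTERS = Map.of(
    L2ScoreNormalizationTechnique.TECHNIQUE_NAME,
    () -> EventStatsManager.increment(EventStatName.NORM_TECHNIQUE_L2_EXECUTIONS),
    MinMaxScoreNormalizationTechnique.TECHNIQUE_NAME,
    () -> EventStatsManager.increment(EventStatName.NORM_TECHNIQUE_MINMAX_EXECUTIONS),
    ZScoreNormalizationTechnique.TECHNIQUE_NAME,
    () -> EventStatsManager.increment(EventStatName.NORM_TECHNIQUE_NORM_ZSCORE_EXECUTIONS)
);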

@Override
<Result extends SearchPhaseResult> void hybridizeScores(
SearchPhaseResults<Result> searchPhaseResult,
@@ -54,6 +81,7 @@ <Result extends SearchPhaseResult> void hybridizeScores(
Optional<FetchSearchResult> fetchSearchResult = getFetchSearchResults(searchPhaseResult);
boolean explain = Objects.nonNull(searchPhaseContext.getRequest().source().explain())
&& searchPhaseContext.getRequest().source().explain();
recordStats(normalizationTechnique, combinationTechnique);
NormalizationProcessorWorkflowExecuteRequest request = NormalizationProcessorWorkflowExecuteRequest.builder()
.querySearchResults(querySearchResults)
.fetchSearchResultOptional(fetchSearchResult)
@@ -135,4 +163,10 @@ private <Result extends SearchPhaseResult> Optional<FetchSearchResult> getFetchSearchResults(SearchPhaseResults<Result> searchPhaseResults) {
Optional<Result> optionalFirstSearchPhaseResult = searchPhaseResults.getAtomicArray().asList().stream().findFirst();
return optionalFirstSearchPhaseResult.map(SearchPhaseResult::fetchResult);
}

private void recordStats(ScoreNormalizationTechnique normalizationTechnique, ScoreCombinationTechnique combinationTechnique) {
EventStatsManager.increment(EventStatName.NORMALIZATION_PROCESSOR_EXECUTIONS);
Optional.ofNullable(normTechniqueIncrementers.get(normalizationTechnique.techniqueName())).ifPresent(Runnable::run);
Optional.ofNullable(combTechniqueIncrementers.get(combinationTechnique.techniqueName())).ifPresent(Runnable::run);
}
}
RRFProcessor.java
@@ -6,6 +6,7 @@

import static org.opensearch.neuralsearch.search.util.HybridSearchResultFormatUtil.isHybridQueryStartStopElement;

import java.util.Map;
import java.util.stream.Collectors;

import java.util.List;
@@ -14,8 +15,11 @@

import com.google.common.annotations.VisibleForTesting;
import lombok.Getter;
import org.opensearch.neuralsearch.processor.combination.RRFScoreCombinationTechnique;
import org.opensearch.neuralsearch.processor.combination.ScoreCombinationTechnique;
import org.opensearch.neuralsearch.processor.normalization.ScoreNormalizationTechnique;
import org.opensearch.neuralsearch.stats.events.EventStatName;
import org.opensearch.neuralsearch.stats.events.EventStatsManager;
import org.opensearch.search.fetch.FetchSearchResult;
import org.opensearch.search.pipeline.PipelineProcessingContext;
import org.opensearch.search.query.QuerySearchResult;
@@ -50,6 +54,11 @@ public class RRFProcessor extends AbstractScoreHybridizationProcessor {
private final ScoreCombinationTechnique combinationTechnique;
private final NormalizationProcessorWorkflow normalizationWorkflow;

private final Map<String, Runnable> combTechniqueIncrementers = Map.of(
RRFScoreCombinationTechnique.TECHNIQUE_NAME,
() -> EventStatsManager.increment(EventStatName.COMB_TECHNIQUE_RRF_EXECUTIONS)
);

/**
* Method abstracts functional aspect of score normalization and score combination. Exact methods for each processing stage
* are set as part of class constructor
@@ -70,6 +79,7 @@ <Result extends SearchPhaseResult> void hybridizeScores(
Optional<FetchSearchResult> fetchSearchResult = getFetchSearchResults(searchPhaseResult);
boolean explain = Objects.nonNull(searchPhaseContext.getRequest().source().explain())
&& searchPhaseContext.getRequest().source().explain();
recordStats(combinationTechnique);
// make data transfer object to pass in; execute will get an object with 4 or 5 fields,
// depending on whether it comes from NormalizationProcessor or RRFProcessor
NormalizationProcessorWorkflowExecuteRequest normalizationExecuteDTO = NormalizationProcessorWorkflowExecuteRequest.builder()
@@ -143,4 +153,9 @@ <Result extends SearchPhaseResult> Optional<FetchSearchResult> getFetchSearchResults(SearchPhaseResults<Result> searchPhaseResults) {
Optional<Result> optionalFirstSearchPhaseResult = searchPhaseResults.getAtomicArray().asList().stream().findFirst();
return optionalFirstSearchPhaseResult.map(SearchPhaseResult::fetchResult);
}

private void recordStats(ScoreCombinationTechnique combinationTechnique) {
EventStatsManager.increment(EventStatName.RRF_PROCESSOR_EXECUTIONS);
Optional.ofNullable(combTechniqueIncrementers.get(combinationTechnique.techniqueName())).ifPresent(Runnable::run);
Contributor Author: I added this since @martin-gaievski requested, but my opinion is: if currently the RRF processor only works with the RRF combination technique and RRF normalization technique, then functionally the stats will be identical, and I don't think there's a point to including a breakdown; it will just be duplicated. Combination is configurable with a single option, but the normalization technique isn't configurable in the processor config: https://docs.opensearch.org/docs/latest/search-plugins/search-pipelines/score-ranker-processor/
If in the future there are more normalization/score techniques added, we can add this granularity then.
Member: Having RRF as a processor and as a technique gives more dimensions for reporting. For instance, with the original version you can generate a report with a breakdown "by processor type". But a report with a breakdown on something like "by combination technique" will be harder, because for RRF your raw data will have a combination technique metric with an empty value.
Contributor Author (@q-andy, May 22, 2025): I see your point; my concern is that having duplicated information in the stats API makes it less readable and less performant. Especially as we add more features, we are concerned with possible response bloat, similar to the issues core is facing with the size of _nodes/info and _nodes/stats responses causing slowdowns on large clusters. And adding stats is a one-way door in the sense that it's difficult to justify removing them, since it's a breaking change. Ideally we save granularity for when it is needed and don't add more stats unless necessary.
From a report perspective, I'm thinking in terms of what kind of insight a breakdown would give you: if you are trying to determine which score combination techniques are seeing more usage, perhaps the proportion of RRF to zscore or minmax isn't comparable, since they're categorically different, e.g. used in different processors in different contexts. And if needed, the information is available implicitly from looking at RRF processor stats directly.

}
}
TextChunkingProcessor.java
@@ -11,6 +11,7 @@
import java.util.ArrayList;
import java.util.List;
import java.util.Objects;
import java.util.Optional;

import org.apache.commons.lang3.StringUtils;
import org.opensearch.cluster.metadata.IndexMetadata;
@@ -53,6 +54,13 @@ public final class TextChunkingProcessor extends AbstractProcessor {
public static final String IGNORE_MISSING = "ignore_missing";
public static final boolean DEFAULT_IGNORE_MISSING = false;

private static final Map<String, Runnable> chunkingAlgorithmIncrementers = Map.of(
DelimiterChunker.ALGORITHM_NAME,
() -> EventStatsManager.increment(EventStatName.TEXT_CHUNKING_DELIMITER_EXECUTIONS),
FixedTokenLengthChunker.ALGORITHM_NAME,
() -> EventStatsManager.increment(EventStatName.TEXT_CHUNKING_FIXED_LENGTH_EXECUTIONS)
);

private int maxChunkLimit;
private Chunker chunker;
private final Map<String, Object> fieldMap;
@@ -295,9 +303,6 @@ private List<String> chunkLeafType(final Object value, final Map<String, Object>

private void recordChunkingExecutionStats(String algorithmName) {
EventStatsManager.increment(EventStatName.TEXT_CHUNKING_PROCESSOR_EXECUTIONS);
switch (algorithmName) {
case DelimiterChunker.ALGORITHM_NAME -> EventStatsManager.increment(EventStatName.TEXT_CHUNKING_DELIMITER_EXECUTIONS);
case FixedTokenLengthChunker.ALGORITHM_NAME -> EventStatsManager.increment(EventStatName.TEXT_CHUNKING_FIXED_LENGTH_EXECUTIONS);
}
Optional.ofNullable(chunkingAlgorithmIncrementers.get(algorithmName)).ifPresent(Runnable::run);
}
}
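
The map above replaces the switch in recordChunkingExecutionStats with the same dispatch pattern used by the hybrid processors. A hedged test sketch of how that dispatch could be verified — the Mockito static-mocking scaffolding, the processor handle, and non-private access to recordChunkingExecutionStats are assumptions for illustration, not part of this PR:

// Hypothetical verification (assumes mockito-inline on the test classpath).
try (MockedStatic<EventStatsManager> stats = Mockito.mockStatic(EventStatsManager.class)) {
    processor.recordChunkingExecutionStats(DelimiterChunker.ALGORITHM_NAME);
    // Both the processor-level and the per-algorithm counters fire.
    stats.verify(() -> EventStatsManager.increment(EventStatName.TEXT_CHUNKING_PROCESSOR_EXECUTIONS));
    stats.verify(() -> EventStatsManager.increment(EventStatName.TEXT_CHUNKING_DELIMITER_EXECUTIONS));
    // An unrecognized algorithm name increments only the processor-level counter.
    processor.recordChunkingExecutionStats("unknown_algorithm");
    stats.verify(() -> EventStatsManager.increment(EventStatName.TEXT_CHUNKING_PROCESSOR_EXECUTIONS), Mockito.times(2));
}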
HybridQueryBuilder.java
@@ -7,6 +7,7 @@
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.List;
import java.util.ListIterator;
import java.util.Locale;
@@ -38,6 +39,8 @@
import lombok.Setter;
import lombok.experimental.Accessors;
import lombok.extern.log4j.Log4j2;
import org.opensearch.neuralsearch.stats.events.EventStatName;
import org.opensearch.neuralsearch.stats.events.EventStatsManager;

import static org.opensearch.neuralsearch.common.MinClusterVersionUtil.isClusterOnOrAfterMinReqVersionForPaginationInHybridQuery;

@@ -270,13 +273,26 @@ public static HybridQueryBuilder fromXContent(XContentParser parser) throws IOException {
if (isClusterOnOrAfterMinReqVersionForPaginationInHybridQuery()) {
compoundQueryBuilder.paginationDepth(paginationDepth);
}

boolean hasInnerHits = false;
for (QueryBuilder query : queries) {
if (filter == null) {
compoundQueryBuilder.add(query);
} else {
compoundQueryBuilder.add(query.filter(filter));
}

// Check if children have inner hits for stats
if (hasInnerHits == false) {
Contributor: is this the right comparison? if children have inner hits, shouldn't this be true?
Member: Where are we changing this parameter value?
Contributor Author: It's set inside the loop, see my other comment for explanation. Basically my thought is we want to increment the stat once if at least one child query has inner hits, which means the hybrid query as a whole is an inner hits hybrid query.

Map<String, InnerHitContextBuilder> innerHits = new HashMap<>();
Contributor: shouldn't we emit stats for the inner hits? seems like we're checking for inner hits without doing anything with them. hasInnerHits can be flipped between true/false in the for loop, but we're only emitting once after the loop finishes in line 295. Is this valid?
Contributor Author: The thought here is that we care about the stats at the level of the hybrid query. If at least one child query has inner hits, then that means the hybrid query is an inner hits query. So we check each child; if at least one has inner hits, then we set hasInnerHits = true and we don't have to keep checking the rest.

InnerHitContextBuilder.extractInnerHits(query, innerHits);
hasInnerHits = innerHits.isEmpty() == false;
}
}

boolean hasFilter = filter != null;
boolean hasPagination = paginationDepth != null;
updateQueryStats(hasFilter, hasPagination, hasInnerHits);
return compoundQueryBuilder;
}
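
For context on the hasInnerHits bookkeeping above: the counter fires when fromXContent parses a hybrid query in which at least one child declares inner hits. A hedged sketch of such a query, built programmatically purely for illustration (field names are invented; QueryBuilders and InnerHitBuilder come from org.opensearch.index.query, ScoreMode from org.apache.lucene.search.join):

// Hypothetical example: the second child carries inner hits, so the parsed
// equivalent of this query records HYBRID_QUERY_INNER_HITS_REQUESTS
// alongside HYBRID_QUERY_REQUESTS.
HybridQueryBuilder hybrid = new HybridQueryBuilder();
hybrid.add(QueryBuilders.termQuery("title", "vector"));
hybrid.add(
    QueryBuilders.nestedQuery("passages", QueryBuilders.matchQuery("passages.text", "neural search"), ScoreMode.Max)
        .innerHit(new InnerHitBuilder())
);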

@@ -409,4 +425,17 @@ protected void extractInnerHitBuilders(Map<String, InnerHitContextBuilder> innerHits) {
InnerHitContextBuilder.extractInnerHits(queryBuilder, innerHits);
}
}

private static void updateQueryStats(boolean hasFilter, boolean hasPagination, boolean hasInnerHits) {
EventStatsManager.increment(EventStatName.HYBRID_QUERY_REQUESTS);
if (hasFilter) {
EventStatsManager.increment(EventStatName.HYBRID_QUERY_FILTER_REQUESTS);
}
if (hasPagination) {
EventStatsManager.increment(EventStatName.HYBRID_QUERY_PAGINATION_REQUESTS);
}
if (hasInnerHits) {
EventStatsManager.increment(EventStatName.HYBRID_QUERY_INNER_HITS_REQUESTS);
}
}
}
EventStatName.java
@@ -35,7 +35,31 @@ public enum EventStatName implements StatName {
"semantic_highlighting_request_count",
"semantic_highlighting",
EventStatType.TIMESTAMPED_EVENT_COUNTER
);
),
// Normalization processor stats
NORMALIZATION_PROCESSOR_EXECUTIONS(
"normalization_processor_executions",
"processors.search.hybrid",
EventStatType.TIMESTAMPED_EVENT_COUNTER
),
NORM_TECHNIQUE_L2_EXECUTIONS("norm_l2_executions", "processors.search.hybrid", EventStatType.TIMESTAMPED_EVENT_COUNTER),
NORM_TECHNIQUE_MINMAX_EXECUTIONS("norm_minmax_executions", "processors.search.hybrid", EventStatType.TIMESTAMPED_EVENT_COUNTER),
NORM_TECHNIQUE_NORM_ZSCORE_EXECUTIONS("norm_zscore_executions", "processors.search.hybrid", EventStatType.TIMESTAMPED_EVENT_COUNTER),
COMB_TECHNIQUE_ARITHMETIC_EXECUTIONS("comb_arithmetic_executions", "processors.search.hybrid", EventStatType.TIMESTAMPED_EVENT_COUNTER),
COMB_TECHNIQUE_GEOMETRIC_EXECUTIONS("comb_geometric_executions", "processors.search.hybrid", EventStatType.TIMESTAMPED_EVENT_COUNTER),
COMB_TECHNIQUE_HARMONIC_EXECUTIONS("comb_harmonic_executions", "processors.search.hybrid", EventStatType.TIMESTAMPED_EVENT_COUNTER),
// RRF processor stats
RRF_PROCESSOR_EXECUTIONS(
"rank_based_normalization_processor_executions",
"processors.search.hybrid",
EventStatType.TIMESTAMPED_EVENT_COUNTER
),
COMB_TECHNIQUE_RRF_EXECUTIONS("comb_rrf_executions", "processors.search.hybrid", EventStatType.TIMESTAMPED_EVENT_COUNTER),
// Hybrid query stats
HYBRID_QUERY_REQUESTS("hybrid_query_requests", "query.hybrid", EventStatType.TIMESTAMPED_EVENT_COUNTER),
HYBRID_QUERY_INNER_HITS_REQUESTS("hybrid_query_with_inner_hits_requests", "query.hybrid", EventStatType.TIMESTAMPED_EVENT_COUNTER),
HYBRID_QUERY_FILTER_REQUESTS("hybrid_query_with_filter_requests", "query.hybrid", EventStatType.TIMESTAMPED_EVENT_COUNTER),
HYBRID_QUERY_PAGINATION_REQUESTS("hybrid_query_with_pagination_requests", "query.hybrid", EventStatType.TIMESTAMPED_EVENT_COUNTER),;

private final String nameString;
private final String path;
InfoStatName.java
@@ -24,7 +24,18 @@ public enum InfoStatName implements StatName {
TEXT_EMBEDDING_SKIP_EXISTING_PROCESSORS("text_embedding_skip_existing_processors", "processors.ingest", InfoStatType.INFO_COUNTER),
TEXT_CHUNKING_PROCESSORS("text_chunking_processors", "processors.ingest", InfoStatType.INFO_COUNTER),
TEXT_CHUNKING_DELIMITER_PROCESSORS("text_chunking_delimiter_processors", "processors.ingest", InfoStatType.INFO_COUNTER),
TEXT_CHUNKING_FIXED_LENGTH_PROCESSORS("text_chunking_fixed_length_processors", "processors.ingest", InfoStatType.INFO_COUNTER);
TEXT_CHUNKING_FIXED_LENGTH_PROCESSORS("text_chunking_fixed_length_processors", "processors.ingest", InfoStatType.INFO_COUNTER),
// Normalization processor
NORMALIZATION_PROCESSORS("normalization_processors", "processors.search.hybrid", InfoStatType.INFO_COUNTER),
NORM_TECHNIQUE_L2_PROCESSORS("norm_l2_processors", "processors.search.hybrid", InfoStatType.INFO_COUNTER),
NORM_TECHNIQUE_MINMAX_PROCESSORS("norm_minmax_processors", "processors.search.hybrid", InfoStatType.INFO_COUNTER),
NORM_TECHNIQUE_ZSCORE_PROCESSORS("norm_zscore_processors", "processors.search.hybrid", InfoStatType.INFO_COUNTER),
COMB_TECHNIQUE_ARITHMETIC_PROCESSORS("comb_arithmetic_processors", "processors.search.hybrid", InfoStatType.INFO_COUNTER),
COMB_TECHNIQUE_GEOMETRIC_PROCESSORS("comb_geometric_processors", "processors.search.hybrid", InfoStatType.INFO_COUNTER),
COMB_TECHNIQUE_HARMONIC_PROCESSORS("comb_harmonic_processors", "processors.search.hybrid", InfoStatType.INFO_COUNTER),
// RRF processor
RRF_PROCESSORS("rank_based_normalization_processors", "processors.search.hybrid", InfoStatType.INFO_COUNTER),
COMB_TECHNIQUE_RRF_PROCESSORS("comb_rrf_processors", "processors.search.hybrid", InfoStatType.INFO_COUNTER),;

private final String nameString;
private final String path;