|
| 1 | +/* |
| 2 | + * Licensed to Elasticsearch under one or more contributor |
| 3 | + * license agreements. See the NOTICE file distributed with |
| 4 | + * this work for additional information regarding copyright |
| 5 | + * ownership. Elasticsearch licenses this file to you under |
| 6 | + * the Apache License, Version 2.0 (the "License"); you may |
| 7 | + * not use this file except in compliance with the License. |
| 8 | + * You may obtain a copy of the License at |
| 9 | + * |
| 10 | + * http://www.apache.org/licenses/LICENSE-2.0 |
| 11 | + * |
| 12 | + * Unless required by applicable law or agreed to in writing, |
| 13 | + * software distributed under the License is distributed on an |
| 14 | + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| 15 | + * KIND, either express or implied. See the License for the |
| 16 | + * specific language governing permissions and limitations |
| 17 | + * under the License. |
| 18 | + */ |
| 19 | + |
| 20 | +package org.apache.lucene.search.uhighlight; |
| 21 | + |
| 22 | +import org.apache.lucene.analysis.Analyzer; |
| 23 | +import org.apache.lucene.index.Term; |
| 24 | +import org.apache.lucene.queries.CommonTermsQuery; |
| 25 | +import org.apache.lucene.search.DocIdSetIterator; |
| 26 | +import org.apache.lucene.search.IndexSearcher; |
| 27 | +import org.apache.lucene.search.PrefixQuery; |
| 28 | +import org.apache.lucene.search.Query; |
| 29 | +import org.apache.lucene.search.TermQuery; |
| 30 | +import org.apache.lucene.search.highlight.Snippet; |
| 31 | +import org.apache.lucene.search.spans.SpanMultiTermQueryWrapper; |
| 32 | +import org.apache.lucene.search.spans.SpanNearQuery; |
| 33 | +import org.apache.lucene.search.spans.SpanOrQuery; |
| 34 | +import org.apache.lucene.search.spans.SpanQuery; |
| 35 | +import org.apache.lucene.search.spans.SpanTermQuery; |
| 36 | +import org.elasticsearch.common.Nullable; |
| 37 | +import org.elasticsearch.common.lucene.all.AllTermQuery; |
| 38 | +import org.elasticsearch.common.lucene.search.MultiPhrasePrefixQuery; |
| 39 | +import org.elasticsearch.common.lucene.search.function.FiltersFunctionScoreQuery; |
| 40 | +import org.elasticsearch.common.lucene.search.function.FunctionScoreQuery; |
| 41 | + |
| 42 | +import java.io.IOException; |
| 43 | +import java.text.BreakIterator; |
| 44 | +import java.util.ArrayList; |
| 45 | +import java.util.Collection; |
| 46 | +import java.util.Collections; |
| 47 | +import java.util.List; |
| 48 | +import java.util.Locale; |
| 49 | +import java.util.Map; |
| 50 | + |
| 51 | +/** |
| 52 | + * Subclass of the {@link UnifiedHighlighter} that works for a single field in a single document. |
| 53 | + * Uses a custom {@link PassageFormatter}. Accepts field content as a constructor |
| 54 | + * argument, given that loadings field value can be done reading from _source field. |
| 55 | + * Supports using different {@link BreakIterator} to break the text into fragments. Considers every distinct field |
| 56 | + * value as a discrete passage for highlighting (unless the whole content needs to be highlighted). |
| 57 | + * Supports both returning empty snippets and non highlighted snippets when no highlighting can be performed. |
| 58 | + */ |
| 59 | +public class CustomUnifiedHighlighter extends UnifiedHighlighter { |
| 60 | + private static final Snippet[] EMPTY_SNIPPET = new Snippet[0]; |
| 61 | + |
| 62 | + private final String fieldValue; |
| 63 | + private final PassageFormatter passageFormatter; |
| 64 | + private final BreakIterator breakIterator; |
| 65 | + private final boolean returnNonHighlightedSnippets; |
| 66 | + |
| 67 | + /** |
| 68 | + * Creates a new instance of {@link CustomUnifiedHighlighter} |
| 69 | + * |
| 70 | + * @param analyzer the analyzer used for the field at index time, used for multi term queries internally |
| 71 | + * @param passageFormatter our own {@link CustomPassageFormatter} |
| 72 | + * which generates snippets in forms of {@link Snippet} objects |
| 73 | + * @param breakIterator the {@link BreakIterator} to use for dividing text into passages. |
| 74 | + * If null {@link BreakIterator#getSentenceInstance(Locale)} is used. |
| 75 | + * @param fieldValue the original field values as constructor argument, loaded from the _source field or |
| 76 | + * the relevant stored field. |
| 77 | + * @param returnNonHighlightedSnippets whether non highlighted snippets should be |
| 78 | + * returned rather than empty snippets when no highlighting can be performed |
| 79 | + */ |
| 80 | + public CustomUnifiedHighlighter(IndexSearcher searcher, |
| 81 | + Analyzer analyzer, |
| 82 | + PassageFormatter passageFormatter, |
| 83 | + @Nullable BreakIterator breakIterator, |
| 84 | + String fieldValue, |
| 85 | + boolean returnNonHighlightedSnippets) { |
| 86 | + super(searcher, analyzer); |
| 87 | + this.breakIterator = breakIterator; |
| 88 | + this.passageFormatter = passageFormatter; |
| 89 | + this.fieldValue = fieldValue; |
| 90 | + this.returnNonHighlightedSnippets = returnNonHighlightedSnippets; |
| 91 | + } |
| 92 | + |
| 93 | + /** |
| 94 | + * Highlights terms extracted from the provided query within the content of the provided field name |
| 95 | + */ |
| 96 | + public Snippet[] highlightField(String field, Query query, int docId, int maxPassages) throws IOException { |
| 97 | + Map<String, Object[]> fieldsAsObjects = super.highlightFieldsAsObjects(new String[]{field}, query, |
| 98 | + new int[]{docId}, new int[]{maxPassages}); |
| 99 | + Object[] snippetObjects = fieldsAsObjects.get(field); |
| 100 | + if (snippetObjects != null) { |
| 101 | + //one single document at a time |
| 102 | + assert snippetObjects.length == 1; |
| 103 | + Object snippetObject = snippetObjects[0]; |
| 104 | + if (snippetObject != null && snippetObject instanceof Snippet[]) { |
| 105 | + return (Snippet[]) snippetObject; |
| 106 | + } |
| 107 | + } |
| 108 | + return EMPTY_SNIPPET; |
| 109 | + } |
| 110 | + |
| 111 | + @Override |
| 112 | + protected List<CharSequence[]> loadFieldValues(String[] fields, DocIdSetIterator docIter, |
| 113 | + int cacheCharsThreshold) throws IOException { |
| 114 | + //we only highlight one field, one document at a time |
| 115 | + return Collections.singletonList(new String[]{fieldValue}); |
| 116 | + } |
| 117 | + |
| 118 | + @Override |
| 119 | + protected BreakIterator getBreakIterator(String field) { |
| 120 | + if (breakIterator != null) { |
| 121 | + return breakIterator; |
| 122 | + } |
| 123 | + return super.getBreakIterator(field); |
| 124 | + } |
| 125 | + |
| 126 | + @Override |
| 127 | + protected PassageFormatter getFormatter(String field) { |
| 128 | + return passageFormatter; |
| 129 | + } |
| 130 | + |
| 131 | + @Override |
| 132 | + protected int getMaxNoHighlightPassages(String field) { |
| 133 | + if (returnNonHighlightedSnippets) { |
| 134 | + return 1; |
| 135 | + } |
| 136 | + return 0; |
| 137 | + } |
| 138 | + |
| 139 | + @Override |
| 140 | + protected Collection<Query> preMultiTermQueryRewrite(Query query) { |
| 141 | + return rewriteCustomQuery(query); |
| 142 | + } |
| 143 | + |
| 144 | + @Override |
| 145 | + protected Collection<Query> preSpanQueryRewrite(Query query) { |
| 146 | + return rewriteCustomQuery(query); |
| 147 | + } |
| 148 | + |
| 149 | + |
| 150 | + /** |
| 151 | + * Translate custom queries in queries that are supported by the unified highlighter. |
| 152 | + */ |
| 153 | + private Collection<Query> rewriteCustomQuery(Query query) { |
| 154 | + if (query instanceof MultiPhrasePrefixQuery) { |
| 155 | + MultiPhrasePrefixQuery mpq = (MultiPhrasePrefixQuery) query; |
| 156 | + Term[][] terms = mpq.getTerms(); |
| 157 | + int[] positions = mpq.getPositions(); |
| 158 | + SpanQuery[] positionSpanQueries = new SpanQuery[positions.length]; |
| 159 | + int sizeMinus1 = terms.length - 1; |
| 160 | + for (int i = 0; i < positions.length; i++) { |
| 161 | + SpanQuery[] innerQueries = new SpanQuery[terms[i].length]; |
| 162 | + for (int j = 0; j < terms[i].length; j++) { |
| 163 | + if (i == sizeMinus1) { |
| 164 | + innerQueries[j] = new SpanMultiTermQueryWrapper(new PrefixQuery(terms[i][j])); |
| 165 | + } else { |
| 166 | + innerQueries[j] = new SpanTermQuery(terms[i][j]); |
| 167 | + } |
| 168 | + } |
| 169 | + if (innerQueries.length > 1) { |
| 170 | + positionSpanQueries[i] = new SpanOrQuery(innerQueries); |
| 171 | + } else { |
| 172 | + positionSpanQueries[i] = innerQueries[0]; |
| 173 | + } |
| 174 | + } |
| 175 | + // sum position increments beyond 1 |
| 176 | + int positionGaps = 0; |
| 177 | + if (positions.length >= 2) { |
| 178 | + // positions are in increasing order. max(0,...) is just a safeguard. |
| 179 | + positionGaps = Math.max(0, positions[positions.length - 1] - positions[0] - positions.length + 1); |
| 180 | + } |
| 181 | + |
| 182 | + //if original slop is 0 then require inOrder |
| 183 | + boolean inorder = (mpq.getSlop() == 0); |
| 184 | + return Collections.singletonList(new SpanNearQuery(positionSpanQueries, |
| 185 | + mpq.getSlop() + positionGaps, inorder)); |
| 186 | + } else if (query instanceof CommonTermsQuery) { |
| 187 | + CommonTermsQuery ctq = (CommonTermsQuery) query; |
| 188 | + List<Query> tqs = new ArrayList<> (); |
| 189 | + for (Term term : ctq.getTerms()) { |
| 190 | + tqs.add(new TermQuery(term)); |
| 191 | + } |
| 192 | + return tqs; |
| 193 | + } else if (query instanceof AllTermQuery) { |
| 194 | + AllTermQuery atq = (AllTermQuery) query; |
| 195 | + return Collections.singletonList(new TermQuery(atq.getTerm())); |
| 196 | + } else if (query instanceof FunctionScoreQuery) { |
| 197 | + return Collections.singletonList(((FunctionScoreQuery) query).getSubQuery()); |
| 198 | + } else if (query instanceof FiltersFunctionScoreQuery) { |
| 199 | + return Collections.singletonList(((FiltersFunctionScoreQuery) query).getSubQuery()); |
| 200 | + } else { |
| 201 | + return null; |
| 202 | + } |
| 203 | + } |
| 204 | +} |
0 commit comments