From 5b56ee5556ca41b708ba8e0a3048d37d12cb3b59 Mon Sep 17 00:00:00 2001 From: Jack Luo Date: Wed, 31 Jul 2024 03:08:14 +0800 Subject: [PATCH] Addressed comments from @chenboat --- .../RealtimeLuceneTextIndex.java | 8 +-- .../readers/text/LuceneTextIndexReader.java | 4 +- .../local/segment/store/TextIndexUtils.java | 60 +++++++++++-------- 3 files changed, 38 insertions(+), 34 deletions(-) diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/realtime/impl/invertedindex/RealtimeLuceneTextIndex.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/realtime/impl/invertedindex/RealtimeLuceneTextIndex.java index 4507987c824e..7c97bca68f91 100644 --- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/realtime/impl/invertedindex/RealtimeLuceneTextIndex.java +++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/realtime/impl/invertedindex/RealtimeLuceneTextIndex.java @@ -127,9 +127,7 @@ public MutableRoaringBitmap getDocIds(String searchQuery) { Callable searchCallable = () -> { IndexSearcher indexSearcher = null; try { - // Lucene Query Parser is JavaCC based. It is stateful and should - // be instantiated per query. Analyzer on the other hand is stateless - // and can be created upfront. + // Lucene query parsers are generally stateful and a new instance must be created per query. QueryParserBase parser = _queryParserClassConstructor.newInstance(_column, _analyzer); if (_enablePrefixSuffixMatchingInPhraseQueries) { // Note: Lucene's built-in QueryParser has limited wildcard functionality in phrase queries. It does not use @@ -212,8 +210,8 @@ private Constructor getQueryParserWithStringAndAnalyzerTypeCons queryParserClass.getConstructor(String.class, Analyzer.class); } catch (NoSuchMethodException ex) { throw new NoSuchMethodException("The specified lucene query parser class " + queryParserClassName - + " is not assignable from does not have the required constructor method with parameter type " - + "[String.class, Analyzer.class]" + + " is not assignable because the class does not have the required constructor method with parameter " + + "type [String.class, Analyzer.class]" ); } diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/text/LuceneTextIndexReader.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/text/LuceneTextIndexReader.java index 9ba2a2ab0af5..9e47f1cbd649 100644 --- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/text/LuceneTextIndexReader.java +++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/text/LuceneTextIndexReader.java @@ -154,9 +154,7 @@ public MutableRoaringBitmap getDocIds(String searchQuery) { MutableRoaringBitmap docIds = new MutableRoaringBitmap(); Collector docIDCollector = new LuceneDocIdCollector(docIds, _docIdTranslator); try { - // Lucene Query Parser is JavaCC based. It is stateful and should - // be instantiated per query. Analyzer on the other hand is stateless - // and can be created upfront. + // Lucene query parsers are generally stateful and a new instance must be created per query. QueryParserBase parser = _queryParserClassConstructor.newInstance(_column, _analyzer); // Phrase search with prefix/suffix matching may have leading *. E.g., `*pache pinot` which can be stripped by // the query parser. To support the feature, we need to explicitly set the config to be true. diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/store/TextIndexUtils.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/store/TextIndexUtils.java index 39d41c92b074..a5a883879037 100644 --- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/store/TextIndexUtils.java +++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/store/TextIndexUtils.java @@ -115,6 +115,14 @@ private static List parseEntryAsString(@Nullable Map col .collect(Collectors.toList()); } + /** + * Retrieves the Lucene Analyzer class instance via reflection from the fully qualified class name of the text config. + * If the class name is not specified in the config, the default StandardAnalyzer is instantiated. + * + * @param config Pinot TextIndexConfig to fetch the configuration from + * @return Lucene Analyzer class instance + * @throws ReflectiveOperationException if instantiation via reflection fails + */ public static Analyzer getAnalyzer(TextIndexConfig config) throws ReflectiveOperationException { String luceneAnalyzerClassName = config.getLuceneAnalyzerClass(); List luceneAnalyzerClassArgs = config.getLuceneAnalyzerClassArgs(); @@ -127,37 +135,37 @@ public static Analyzer getAnalyzer(TextIndexConfig config) throws ReflectiveOper // use existing logic to obtain an instance of StandardAnalyzer with customized stop words return TextIndexUtils.getStandardAnalyzerWithCustomizedStopWords( config.getStopWordsInclude(), config.getStopWordsExclude()); - } else { - // Custom analyzer + custom configs via reflection - if (luceneAnalyzerClassArgs.size() != luceneAnalyzerClassArgTypes.size()) { - throw new ReflectiveOperationException("Mismatch of the number of analyzer arguments and arguments types."); - } + } - // Generate args type list - List> argClasses = new ArrayList<>(); - for (String argType : luceneAnalyzerClassArgTypes) { - argClasses.add(parseSupportedTypes(argType)); - } + // Custom analyzer + custom configs via reflection + if (luceneAnalyzerClassArgs.size() != luceneAnalyzerClassArgTypes.size()) { + throw new ReflectiveOperationException("Mismatch of the number of analyzer arguments and arguments types."); + } - // Best effort coercion to the analyzer argument type - // Note only a subset of class types is supported, unsupported ones can be added in the future - List argValues = new ArrayList<>(); - for (int i = 0; i < luceneAnalyzerClassArgs.size(); i++) { - argValues.add(parseSupportedTypeValues(luceneAnalyzerClassArgs.get(i), argClasses.get(i))); - } + // Generate args type list + List> argClasses = new ArrayList<>(); + for (String argType : luceneAnalyzerClassArgTypes) { + argClasses.add(parseSupportedTypes(argType)); + } - // Initialize the custom analyzer class with custom analyzer args - Class luceneAnalyzerClass = Class.forName(luceneAnalyzerClassName); - if (!Analyzer.class.isAssignableFrom(luceneAnalyzerClass)) { - String exceptionMessage = "Custom analyzer must be a child of " + Analyzer.class.getCanonicalName(); - LOGGER.error(exceptionMessage); - throw new ReflectiveOperationException(exceptionMessage); - } + // Best effort coercion to the analyzer argument type + // Note only a subset of class types is supported, unsupported ones can be added in the future + List argValues = new ArrayList<>(); + for (int i = 0; i < luceneAnalyzerClassArgs.size(); i++) { + argValues.add(parseSupportedTypeValues(luceneAnalyzerClassArgs.get(i), argClasses.get(i))); + } - // Return a new instance of custom lucene analyzer class - return (Analyzer) luceneAnalyzerClass.getConstructor(argClasses.toArray(new Class[0])) - .newInstance(argValues.toArray(new Object[0])); + // Initialize the custom analyzer class with custom analyzer args + Class luceneAnalyzerClass = Class.forName(luceneAnalyzerClassName); + if (!Analyzer.class.isAssignableFrom(luceneAnalyzerClass)) { + String exceptionMessage = "Custom analyzer must be a child of " + Analyzer.class.getCanonicalName(); + LOGGER.error(exceptionMessage); + throw new ReflectiveOperationException(exceptionMessage); } + + // Return a new instance of custom lucene analyzer class + return (Analyzer) luceneAnalyzerClass.getConstructor(argClasses.toArray(new Class[0])) + .newInstance(argValues.toArray(new Object[0])); } /**