Skip to content

Commit

Permalink
Addressed code review concerns from @Bill-hbrhbr
Browse files Browse the repository at this point in the history
  • Loading branch information
jackluo923 committed Apr 30, 2024
1 parent 0b99d64 commit e4023bf
Show file tree
Hide file tree
Showing 6 changed files with 33 additions and 14 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -126,7 +126,9 @@ public MutableRoaringBitmap getDocIds(String searchQuery) {
Callable<MutableRoaringBitmap> searchCallable = () -> {
IndexSearcher indexSearcher = null;
try {
// Lucene Query Parser can be stateful, we will instantiate a new instance for each query
// Lucene Query Parser is JavaCC based. It is stateful and should
// be instantiated per query. Analyzer on the other hand is stateless
// and can be created upfront.
QueryParserBase parser = _queryParserClassConstructor.newInstance(_column, _analyzer);
if (_enablePrefixSuffixMatchingInPhraseQueries) {
// TODO: this code path is semi-broken as the default QueryParser does not always utilize the analyzer
Expand Down Expand Up @@ -204,7 +206,7 @@ private Constructor<QueryParserBase> getQueryParserWithStringAndAnalyzerTypeCons
try {
queryParserClass.getConstructor(String.class, Analyzer.class);
} catch (NoSuchMethodException ex) {
throw new ReflectiveOperationException("The specified lucene query parser class " + queryParserClassName
throw new NoSuchMethodException("The specified lucene query parser class " + queryParserClassName
+ " is not assignable from does not have the required constructor method with parameter type "
+ "[String.class, Analyzer.class]"
);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -146,7 +146,6 @@ public static Analyzer getAnalyzer(TextIndexConfig config) throws ReflectiveOper

// Initialize the custom analyzer class with custom analyzer args
Class<?> luceneAnalyzerClass = Class.forName(luceneAnalyzerClassName);
Analyzer analyzer;
if (!Analyzer.class.isAssignableFrom(luceneAnalyzerClass)) {
String exceptionMessage = "Custom analyzer must be a child of " + Analyzer.class.getCanonicalName();
LOGGER.error(exceptionMessage);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@
import org.apache.pinot.segment.local.realtime.impl.invertedindex.RealtimeLuceneTextIndexSearcherPool;
import org.apache.pinot.segment.local.segment.index.column.PhysicalColumnIndexContainer;
import org.apache.pinot.segment.local.segment.index.loader.IndexLoadingConfig;
import org.apache.pinot.segment.local.segment.index.text.TextIndexConfigBuilder;
import org.apache.pinot.segment.local.segment.store.SegmentLocalFSDirectory;
import org.apache.pinot.segment.local.segment.virtualcolumn.VirtualColumnProviderFactory;
import org.apache.pinot.segment.spi.ColumnMetadata;
Expand All @@ -52,6 +53,7 @@
import org.apache.pinot.segment.spi.index.metadata.SegmentMetadataImpl;
import org.apache.pinot.segment.spi.index.reader.TextIndexReader;
import org.apache.pinot.segment.spi.store.SegmentDirectory;
import org.apache.pinot.spi.config.table.FSTType;
import org.apache.pinot.spi.config.table.FieldConfig;
import org.apache.pinot.spi.config.table.IndexConfig;
import org.apache.pinot.spi.config.table.IndexingConfig;
Expand Down Expand Up @@ -84,6 +86,7 @@ public class RealtimeSegmentConverterTest {
private static final String LONG_COLUMN4 = "long_col4";
private static final String MV_INT_COLUMN = "mv_col";
private static final String DATE_TIME_COLUMN = "date_time_col";
private static final FSTType NULL_FST_TYPE = null;

private static final File TMP_DIR =
new File(FileUtils.getTempDirectory(), RealtimeSegmentConverterTest.class.getName());
Expand Down Expand Up @@ -472,9 +475,7 @@ public void testSegmentBuilderWithReuse(boolean columnMajorSegmentBuilder, Strin
String tableNameWithType = tableConfig.getTableName();
String segmentName = "testTable__0__0__123456";
IndexingConfig indexingConfig = tableConfig.getIndexingConfig();
TextIndexConfig textIndexConfig =
new TextIndexConfig(false, null, null, false, false, Collections.emptyList(), Collections.emptyList(), false,
500, null, null, null, null, false);
TextIndexConfig textIndexConfig = new TextIndexConfigBuilder(NULL_FST_TYPE).build();

RealtimeSegmentConfig.Builder realtimeSegmentConfigBuilder =
new RealtimeSegmentConfig.Builder().setTableNameWithType(tableNameWithType).setSegmentName(segmentName)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ public TextIndexConfig(@JsonProperty("disabled") Boolean disabled, @JsonProperty
@JsonProperty("luceneMaxBufferSizeMB") Integer luceneMaxBufferSizeMB,
@JsonProperty("luceneAnalyzerClass") String luceneAnalyzerClass,
@JsonProperty("luceneAnalyzerClassArgs") String luceneAnalyzerClassArgs,
@JsonProperty("luceneAnalyzerClassArgTypes") List<String> luceneAnalyzerClassArgTypes,
@JsonProperty("luceneAnalyzerClassArgTypes") String luceneAnalyzerClassArgTypes,
@JsonProperty("luceneQueryParserClass") String luceneQueryParserClass,
@JsonProperty("enablePrefixSuffixMatchingInPhraseQueries") Boolean enablePrefixSuffixMatchingInPhraseQueries) {
super(disabled);
Expand All @@ -87,8 +87,7 @@ public TextIndexConfig(@JsonProperty("disabled") Boolean disabled, @JsonProperty
// List<String>. This is because the args may contain comma and other special characters such as space. Therefore,
// we use our own csv parser to parse the values directly.
_luceneAnalyzerClassArgs = CsvParser.parse(luceneAnalyzerClassArgs, true, false);
_luceneAnalyzerClassArgTypes =
luceneAnalyzerClassArgTypes == null ? Collections.emptyList() : luceneAnalyzerClassArgTypes;
_luceneAnalyzerClassArgTypes = CsvParser.parse(luceneAnalyzerClassArgTypes, false, true);
_luceneQueryParserClass = luceneQueryParserClass == null
? FieldConfig.TEXT_INDEX_DEFAULT_LUCENE_QUERY_PARSER_CLASS : luceneQueryParserClass;
_enablePrefixSuffixMatchingInPhraseQueries =
Expand Down Expand Up @@ -219,8 +218,9 @@ public AbstractBuilder(TextIndexConfig other) {
public TextIndexConfig build() {
return new TextIndexConfig(false, _fstType, _rawValueForTextIndex, _enableQueryCache, _useANDForMultiTermQueries,
_stopWordsInclude, _stopWordsExclude, _luceneUseCompoundFile, _luceneMaxBufferSizeMB, _luceneAnalyzerClass,
String.join(",", _luceneAnalyzerClassArgs), _luceneAnalyzerClassArgTypes, _luceneQueryParserClass,
_enablePrefixSuffixMatchingInPhraseQueries);
CsvParser.serialize(_luceneAnalyzerClassArgs, false, false),
CsvParser.serialize(_luceneAnalyzerClassArgTypes, false, false),
_luceneQueryParserClass, _enablePrefixSuffixMatchingInPhraseQueries);
}

public abstract AbstractBuilder withProperties(@Nullable Map<String, String> textIndexProperties);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ private CsvParser() {
* and other white space characters. These characters are sometimes expected to be part of the actual argument.
*
* @param input string to split on comma
* @param escapeComma whether we should ignore "\," during splitting, replace it with "," after split
* @param escapeComma if true, we do not split on escaped commas, and we replace "\," with "," after the split
* @param trim whether we should trim each tokenized terms
* @return a list of values, empty list if input is empty or null
*/
Expand All @@ -59,4 +59,21 @@ public static List<String> parse(@Nullable String input, boolean escapeComma, bo

return tokenStream.collect(Collectors.toList());
}

/**
 * Serialize the input list of strings into a single comma-separated string, the inverse of
 * {@code parse}.
 * @param input list of strings to be serialized; may be null
 * @param escapeComma if true, escape commas by replacing "," with "\," before the join, so the
 *                    output can be round-tripped by the parse method with escapeComma=true
 * @param trim whether we should trim each term before serialization
 * @return serialized string representing the input list of strings; empty string if input is null
 */
public static String serialize(List<String> input, boolean escapeComma, boolean trim) {
  // Mirror parse(), which tolerates null input by returning an empty result.
  if (input == null) {
    return "";
  }
  Stream<String> tokenStream = input.stream();
  if (escapeComma) {
    // Use String.replace (literal) rather than String.replaceAll (regex): in a regex
    // replacement string, "\\," is interpreted by Matcher as an escaped comma and yields a
    // plain "," — i.e. replaceAll(",", "\\,") performs no escaping at all.
    tokenStream = tokenStream.map(s -> s.replace(",", "\\,"));
  }
  if (trim) {
    tokenStream = tokenStream.map(String::trim);
  }
  return tokenStream.collect(Collectors.joining(","));
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -52,11 +52,11 @@ public class FieldConfig extends BaseJsonConfig {
public static final String TEXT_INDEX_LUCENE_USE_COMPOUND_FILE = "luceneUseCompoundFile";
public static final String TEXT_INDEX_LUCENE_MAX_BUFFER_SIZE_MB = "luceneMaxBufferSizeMB";
public static final String TEXT_INDEX_LUCENE_ANALYZER_CLASS = "luceneAnalyzerClass";
public static final String TEXT_INDEX_DEFAULT_LUCENE_ANALYZER_CLASS =
"org.apache.lucene.analysis.standard.StandardAnalyzer";
public static final String TEXT_INDEX_LUCENE_ANALYZER_CLASS_ARGS = "luceneAnalyzerClassArgs";
public static final String TEXT_INDEX_LUCENE_ANALYZER_CLASS_ARG_TYPES = "luceneAnalyzerClassArgTypes";
public static final String TEXT_INDEX_LUCENE_QUERY_PARSER_CLASS = "luceneQueryParserClass";
public static final String TEXT_INDEX_DEFAULT_LUCENE_ANALYZER_CLASS =
"org.apache.lucene.analysis.standard.StandardAnalyzer";
public static final String TEXT_INDEX_DEFAULT_LUCENE_QUERY_PARSER_CLASS =
"org.apache.lucene.queryparser.classic.QueryParser";
public static final String TEXT_INDEX_STOP_WORD_SEPERATOR = ",";
Expand Down

0 comments on commit e4023bf

Please sign in to comment.