Skip to content

Commit

Permalink
Addressed code review concerns from @Bill-hbrhbr
Browse files Browse the repository at this point in the history
  • Loading branch information
jackluo923 committed Apr 30, 2024
1 parent 0b99d64 commit e4023bf
Show file tree
Hide file tree
Showing 6 changed files with 33 additions and 14 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -126,7 +126,9 @@ public MutableRoaringBitmap getDocIds(String searchQuery) {
Callable<MutableRoaringBitmap> searchCallable = () -> {
IndexSearcher indexSearcher = null;
try {
// Lucene Query Parser can be stateful, we will instantiate a new instance for each query
// Lucene Query Parser is JavaCC based. It is stateful and should
// be instantiated per query. Analyzer on the other hand is stateless
// and can be created upfront.
QueryParserBase parser = _queryParserClassConstructor.newInstance(_column, _analyzer);
if (_enablePrefixSuffixMatchingInPhraseQueries) {
// TODO: this code path is semi-broken as the default QueryParser does not always utilize the analyzer
Expand Down Expand Up @@ -204,7 +206,7 @@ private Constructor<QueryParserBase> getQueryParserWithStringAndAnalyzerTypeCons
try {
queryParserClass.getConstructor(String.class, Analyzer.class);
} catch (NoSuchMethodException ex) {
throw new ReflectiveOperationException("The specified lucene query parser class " + queryParserClassName
throw new NoSuchMethodException("The specified lucene query parser class " + queryParserClassName
+ " is not assignable from does not have the required constructor method with parameter type "
+ "[String.class, Analyzer.class]"
);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -146,7 +146,6 @@ public static Analyzer getAnalyzer(TextIndexConfig config) throws ReflectiveOper

// Initialize the custom analyzer class with custom analyzer args
Class<?> luceneAnalyzerClass = Class.forName(luceneAnalyzerClassName);
Analyzer analyzer;
if (!Analyzer.class.isAssignableFrom(luceneAnalyzerClass)) {
String exceptionMessage = "Custom analyzer must be a child of " + Analyzer.class.getCanonicalName();
LOGGER.error(exceptionMessage);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@
import org.apache.pinot.segment.local.realtime.impl.invertedindex.RealtimeLuceneTextIndexSearcherPool;
import org.apache.pinot.segment.local.segment.index.column.PhysicalColumnIndexContainer;
import org.apache.pinot.segment.local.segment.index.loader.IndexLoadingConfig;
import org.apache.pinot.segment.local.segment.index.text.TextIndexConfigBuilder;
import org.apache.pinot.segment.local.segment.store.SegmentLocalFSDirectory;
import org.apache.pinot.segment.local.segment.virtualcolumn.VirtualColumnProviderFactory;
import org.apache.pinot.segment.spi.ColumnMetadata;
Expand All @@ -52,6 +53,7 @@
import org.apache.pinot.segment.spi.index.metadata.SegmentMetadataImpl;
import org.apache.pinot.segment.spi.index.reader.TextIndexReader;
import org.apache.pinot.segment.spi.store.SegmentDirectory;
import org.apache.pinot.spi.config.table.FSTType;
import org.apache.pinot.spi.config.table.FieldConfig;
import org.apache.pinot.spi.config.table.IndexConfig;
import org.apache.pinot.spi.config.table.IndexingConfig;
Expand Down Expand Up @@ -84,6 +86,7 @@ public class RealtimeSegmentConverterTest {
private static final String LONG_COLUMN4 = "long_col4";
private static final String MV_INT_COLUMN = "mv_col";
private static final String DATE_TIME_COLUMN = "date_time_col";
private static final FSTType NULL_FST_TYPE = null;

private static final File TMP_DIR =
new File(FileUtils.getTempDirectory(), RealtimeSegmentConverterTest.class.getName());
Expand Down Expand Up @@ -472,9 +475,7 @@ public void testSegmentBuilderWithReuse(boolean columnMajorSegmentBuilder, Strin
String tableNameWithType = tableConfig.getTableName();
String segmentName = "testTable__0__0__123456";
IndexingConfig indexingConfig = tableConfig.getIndexingConfig();
TextIndexConfig textIndexConfig =
new TextIndexConfig(false, null, null, false, false, Collections.emptyList(), Collections.emptyList(), false,
500, null, null, null, null, false);
TextIndexConfig textIndexConfig = new TextIndexConfigBuilder(NULL_FST_TYPE).build();

RealtimeSegmentConfig.Builder realtimeSegmentConfigBuilder =
new RealtimeSegmentConfig.Builder().setTableNameWithType(tableNameWithType).setSegmentName(segmentName)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ public TextIndexConfig(@JsonProperty("disabled") Boolean disabled, @JsonProperty
@JsonProperty("luceneMaxBufferSizeMB") Integer luceneMaxBufferSizeMB,
@JsonProperty("luceneAnalyzerClass") String luceneAnalyzerClass,
@JsonProperty("luceneAnalyzerClassArgs") String luceneAnalyzerClassArgs,
@JsonProperty("luceneAnalyzerClassArgTypes") List<String> luceneAnalyzerClassArgTypes,
@JsonProperty("luceneAnalyzerClassArgTypes") String luceneAnalyzerClassArgTypes,
@JsonProperty("luceneQueryParserClass") String luceneQueryParserClass,
@JsonProperty("enablePrefixSuffixMatchingInPhraseQueries") Boolean enablePrefixSuffixMatchingInPhraseQueries) {
super(disabled);
Expand All @@ -87,8 +87,7 @@ public TextIndexConfig(@JsonProperty("disabled") Boolean disabled, @JsonProperty
// List<String>. This is because the args may contain comma and other special characters such as space. Therefore,
// we use our own csv parser to parse the values directly.
_luceneAnalyzerClassArgs = CsvParser.parse(luceneAnalyzerClassArgs, true, false);
_luceneAnalyzerClassArgTypes =
luceneAnalyzerClassArgTypes == null ? Collections.emptyList() : luceneAnalyzerClassArgTypes;
_luceneAnalyzerClassArgTypes = CsvParser.parse(luceneAnalyzerClassArgTypes, false, true);
_luceneQueryParserClass = luceneQueryParserClass == null
? FieldConfig.TEXT_INDEX_DEFAULT_LUCENE_QUERY_PARSER_CLASS : luceneQueryParserClass;
_enablePrefixSuffixMatchingInPhraseQueries =
Expand Down Expand Up @@ -219,8 +218,9 @@ public AbstractBuilder(TextIndexConfig other) {
public TextIndexConfig build() {
return new TextIndexConfig(false, _fstType, _rawValueForTextIndex, _enableQueryCache, _useANDForMultiTermQueries,
_stopWordsInclude, _stopWordsExclude, _luceneUseCompoundFile, _luceneMaxBufferSizeMB, _luceneAnalyzerClass,
String.join(",", _luceneAnalyzerClassArgs), _luceneAnalyzerClassArgTypes, _luceneQueryParserClass,
_enablePrefixSuffixMatchingInPhraseQueries);
CsvParser.serialize(_luceneAnalyzerClassArgs, false, false),
CsvParser.serialize(_luceneAnalyzerClassArgTypes, false, false),
_luceneQueryParserClass, _enablePrefixSuffixMatchingInPhraseQueries);
}

public abstract AbstractBuilder withProperties(@Nullable Map<String, String> textIndexProperties);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ private CsvParser() {
* and other white space characters. These characters are sometimes expected to be part of the actual argument.
*
* @param input string to split on comma
* @param escapeComma whether we should ignore "\," during splitting, replace it with "," after split
* @param escapeComma if true, we do not split on escaped commas, and we replace "\," with "," after the split
* @param trim whether we should trim each tokenized terms
* @return a list of values, empty list if input is empty or null
*/
Expand All @@ -59,4 +59,21 @@ public static List<String> parse(@Nullable String input, boolean escapeComma, bo

return tokenStream.collect(Collectors.toList());
}

/**
 * Serialize the input list of strings into a single comma-separated string, the inverse of
 * {@code parse}.
 * @param input list of strings to be serialized; may be null
 * @param escapeComma if true, escape commas by replacing "," with "\," before the join, so the
 *                    output can be round-tripped by the parse method with escapeComma=true
 * @param trim whether we should trim each term before serialization
 * @return serialized string representing the input list of strings; empty string if input is null
 */
public static String serialize(List<String> input, boolean escapeComma, boolean trim) {
  // Mirror parse(), which tolerates null input by returning an empty result.
  if (input == null) {
    return "";
  }
  Stream<String> tokenStream = input.stream();
  if (escapeComma) {
    // Use String.replace (literal) rather than String.replaceAll (regex): in a regex
    // replacement string, "\\," is interpreted by Matcher as an escaped comma and yields a
    // plain "," — i.e. replaceAll(",", "\\,") performs no escaping at all.
    tokenStream = tokenStream.map(s -> s.replace(",", "\\,"));
  }
  if (trim) {
    tokenStream = tokenStream.map(String::trim);
  }
  return tokenStream.collect(Collectors.joining(","));
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -52,11 +52,11 @@ public class FieldConfig extends BaseJsonConfig {
public static final String TEXT_INDEX_LUCENE_USE_COMPOUND_FILE = "luceneUseCompoundFile";
public static final String TEXT_INDEX_LUCENE_MAX_BUFFER_SIZE_MB = "luceneMaxBufferSizeMB";
public static final String TEXT_INDEX_LUCENE_ANALYZER_CLASS = "luceneAnalyzerClass";
public static final String TEXT_INDEX_DEFAULT_LUCENE_ANALYZER_CLASS =
"org.apache.lucene.analysis.standard.StandardAnalyzer";
public static final String TEXT_INDEX_LUCENE_ANALYZER_CLASS_ARGS = "luceneAnalyzerClassArgs";
public static final String TEXT_INDEX_LUCENE_ANALYZER_CLASS_ARG_TYPES = "luceneAnalyzerClassArgTypes";
public static final String TEXT_INDEX_LUCENE_QUERY_PARSER_CLASS = "luceneQueryParserClass";
public static final String TEXT_INDEX_DEFAULT_LUCENE_ANALYZER_CLASS =
"org.apache.lucene.analysis.standard.StandardAnalyzer";
public static final String TEXT_INDEX_DEFAULT_LUCENE_QUERY_PARSER_CLASS =
"org.apache.lucene.queryparser.classic.QueryParser";
public static final String TEXT_INDEX_STOP_WORD_SEPERATOR = ",";
Expand Down

0 comments on commit e4023bf

Please sign in to comment.