Skip to content

Commit

Permalink
- fix Lucene not being able to search for special characters
Browse files Browse the repository at this point in the history
- StandardAnalyzer does not preserve whitespace and does not allow special character search
  • Loading branch information
derreisende77 committed Nov 1, 2024
1 parent 9cd540f commit 39fd308
Show file tree
Hide file tree
Showing 4 changed files with 102 additions and 5 deletions.
14 changes: 13 additions & 1 deletion pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -170,7 +170,19 @@
<artifactId>lucene-queryparser</artifactId>
<version>${lucene.version}</version>
</dependency>

<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-analysis-common</artifactId>
<version>${lucene.version}</version>
</dependency>
<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-queries</artifactId>
<version>${lucene.version}</version>
</dependency>



<dependency>
<groupId>org.jfree</groupId>
<artifactId>jfreechart</artifactId>
Expand Down
Original file line number Diff line number Diff line change
@@ -1,3 +1,21 @@
/*
* Copyright (c) 2024 derreisende77.
* This code was developed as part of the MediathekView project https://github.com/mediathekview/MediathekView
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/

package mediathek.gui.tabs.tab_film.helpers;

import com.google.common.base.Stopwatch;
Expand All @@ -11,11 +29,12 @@
import mediathek.javafx.filterpanel.FilterActionPanel;
import mediathek.javafx.filterpanel.ZeitraumSpinner;
import mediathek.mainwindow.MediathekGui;
import mediathek.tool.LuceneDefaultAnalyzer;
import mediathek.tool.SwingErrorDialog;
import mediathek.tool.models.TModelFilm;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.DateTools;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryparser.classic.ParseException;
Expand All @@ -42,7 +61,7 @@ public class LuceneGuiFilmeModelHelper extends GuiModelHelper {
PARSER_CONFIG_MAP.put(LuceneIndexKeys.FILM_LENGTH, new PointsConfig(new DecimalFormat(), Integer.class));
}

private final StandardAnalyzer analyzer = new StandardAnalyzer();
private final Analyzer analyzer = LuceneDefaultAnalyzer.buildAnalyzer();

public LuceneGuiFilmeModelHelper(@NotNull FilterActionPanel filterActionPanel,
@NotNull SeenHistoryController historyController,
Expand Down
22 changes: 20 additions & 2 deletions src/main/java/mediathek/gui/tasks/LuceneIndexWorker.java
Original file line number Diff line number Diff line change
@@ -1,3 +1,21 @@
/*
* Copyright (c) 2024 derreisende77.
* This code was developed as part of the MediathekView project https://github.com/mediathekview/MediathekView
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/

package mediathek.gui.tasks;

import com.google.common.base.Stopwatch;
Expand All @@ -7,11 +25,11 @@
import mediathek.daten.IndexedFilmList;
import mediathek.mainwindow.MediathekGui;
import mediathek.tool.FileUtils;
import mediathek.tool.LuceneDefaultAnalyzer;
import mediathek.tool.SwingErrorDialog;
import mediathek.tool.datum.DateUtil;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.*;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
Expand Down Expand Up @@ -86,7 +104,7 @@ protected Void doInBackground() {
});

//index filmlist after blacklist only
IndexWriterConfig indexWriterConfig = new IndexWriterConfig(new StandardAnalyzer());
IndexWriterConfig indexWriterConfig = new IndexWriterConfig(LuceneDefaultAnalyzer.buildAnalyzer());
indexWriterConfig.setRAMBufferSizeMB(256d);

try (var writer = new IndexWriter(filmListe.getLuceneDirectory(), indexWriterConfig)) {
Expand Down
48 changes: 48 additions & 0 deletions src/main/java/mediathek/tool/LuceneDefaultAnalyzer.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
/*
* Copyright (c) 2024 derreisende77.
* This code was developed as part of the MediathekView project https://github.com/mediathekview/MediathekView
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/

package mediathek.tool;

import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.custom.CustomAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;

import java.io.IOException;

/**
 * Static factory for the Lucene {@link Analyzer} used by this application.
 *
 * <p>This is a utility holder: it has only static members and cannot be instantiated.
 */
public class LuceneDefaultAnalyzer {
    private static final Logger logger = LogManager.getLogger();

    /** Not instantiable; use {@link #buildAnalyzer()}. */
    private LuceneDefaultAnalyzer() {
    }

    /**
     * Creates a new analyzer configured with the {@code "whitespace"} tokenizer
     * followed by the {@code "lowercase"} token filter.
     *
     * <p>If the custom analyzer cannot be built because of an {@link IOException},
     * the failure is logged and a {@link StandardAnalyzer} is returned instead, so
     * this method never propagates the exception to the caller.
     *
     * @return a freshly built analyzer; a new instance is created on every call
     */
    public static Analyzer buildAnalyzer() {
        try {
            return CustomAnalyzer.builder()
                    .withTokenizer("whitespace")
                    .addTokenFilter("lowercase")
                    .build();
        } catch (IOException e) {
            // Building failed: report it and degrade to Lucene's stock analyzer
            // rather than leaving the caller without any analyzer.
            logger.error("Could not build custom analyzer", e);
            logger.error("Falling back to standard analyzer");
            return new StandardAnalyzer();
        }
    }
}

0 comments on commit 39fd308

Please sign in to comment.