Skip to content

Commit

Permalink
avoid IllegalArgumentException in Lucene on sentences with >255 chars (#364)
Browse files Browse the repository at this point in the history
  • Loading branch information
danielnaber committed Feb 8, 2016
1 parent 8d9050a commit b563ec7
Show file tree
Hide file tree
Showing 2 changed files with 15 additions and 2 deletions.
3 changes: 3 additions & 0 deletions languagetool-standalone/CHANGES.md
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,9 @@
#### Embedded HTTPS server
* Speed up for input with short sentences

#### Wikipedia
* Indexing: fixed an `IllegalArgumentException` for long sentences
(https://github.com/languagetool-org/languagetool/issues/364)


## 3.2 (2015-12-29)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@ public final class LanguageToolFilter extends TokenFilter {
private final OffsetAttribute offsetAtt;
private final PositionIncrementAttribute posIncrAtt;
private final TypeAttribute typeAtt;
private final StringBuilder collectedInput = new StringBuilder();

private AttributeSource.State current;
private Iterator<AnalyzedTokenReadings> tokenIter;
Expand Down Expand Up @@ -84,8 +85,17 @@ public boolean incrementToken() throws IOException {
// there are no remaining tokens from the current sentence... are there more sentences?
if (input.incrementToken()) {
// a new sentence is available: process it.
final AnalyzedSentence sentence = languageTool.getAnalyzedSentence(termAtt.toString());

String sentenceStr = termAtt.toString();
collectedInput.append(sentenceStr);
if (sentenceStr.length() >= 255) {
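// A chunk of 255 or more chars is treated as one piece of a longer sentence
// that was split before reaching this filter: buffer it and wait for the
// remaining pieces, so the complete sentence is analyzed in one go.
// See https://github.com/languagetool-org/languagetool/issues/364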
return true;
} else {
sentenceStr = collectedInput.toString();
collectedInput.setLength(0);
}
final AnalyzedSentence sentence = languageTool.getAnalyzedSentence(sentenceStr);
final List<AnalyzedTokenReadings> tokenBuffer = Arrays.asList(sentence.getTokens());
tokenIter = tokenBuffer.iterator();
/*
Expand Down

0 comments on commit b563ec7

Please sign in to comment.