Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -143,38 +143,50 @@ protected static IntervalsSource combineSources(List<IntervalsSource> sources, i
/**
 * Converts every token of a stream into a term interval source, one source per token.
 *
 * <p>A token whose position increment is greater than one has a gap in front of it (for example
 * where a stopword was removed). Its source is extended backwards by the size of that gap so the
 * missing positions are still accounted for when the sources are combined.
 *
 * @param ts the token stream to consume; it is reset, read to exhaustion and ended by this method
 * @return one interval source per token, in stream order
 * @throws IOException if reading the token stream fails
 */
protected List<IntervalsSource> analyzeTerms(TokenStream ts) throws IOException {
    List<IntervalsSource> terms = new ArrayList<>();
    TermToBytesRefAttribute bytesAtt = ts.addAttribute(TermToBytesRefAttribute.class);
    PositionIncrementAttribute posAtt = ts.addAttribute(PositionIncrementAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
        // Keep a private copy of the bytes rather than the attribute's own BytesRef
        // (presumably the attribute reuses its buffer between tokens -- confirm against Lucene docs).
        BytesRef term = BytesRef.deepCopyOf(bytesAtt.getBytesRef());
        int precedingSpaces = posAtt.getPositionIncrement() - 1;
        // Each token is added exactly once, already widened over any gap in front of it.
        terms.add(extend(Intervals.term(term), precedingSpaces));
    }
    ts.end();
    return terms;
}

/**
 * Widens an interval source so that it also covers a gap in front of it.
 *
 * @param source          the source to widen
 * @param precedingSpaces number of positions to add before the source
 * @return {@code source} itself when {@code precedingSpaces} is zero, otherwise the result of
 *         {@code Intervals.extend(source, precedingSpaces, 0)}
 */
public static IntervalsSource extend(IntervalsSource source, int precedingSpaces) {
    return precedingSpaces == 0
        ? source
        : Intervals.extend(source, precedingSpaces, 0);
}

/**
 * Builds a combined interval source from a token stream that contains single-position synonyms.
 *
 * <p>Tokens with a position increment of zero are stacked on the previous token's position and
 * collected into one group; a group of several tokens becomes a disjunction. A positive position
 * increment starts a new group, and the gap it implies ({@code posInc - 1}) is remembered so the
 * new group can be extended over it.
 *
 * @param ts      the token stream to consume; it is reset, read to exhaustion and ended by this method
 * @param maxGaps passed through unchanged to {@code combineSources}
 * @param ordered passed through unchanged to {@code combineSources}
 * @return the result of {@code combineSources} over one source per position group
 * @throws IOException if reading the token stream fails
 */
protected IntervalsSource analyzeSynonyms(TokenStream ts, int maxGaps, boolean ordered) throws IOException {
    List<IntervalsSource> terms = new ArrayList<>();
    List<IntervalsSource> synonyms = new ArrayList<>();
    TermToBytesRefAttribute bytesAtt = ts.addAttribute(TermToBytesRefAttribute.class);
    PositionIncrementAttribute posAtt = ts.addAttribute(PositionIncrementAttribute.class);
    ts.reset();
    // Size of the gap in front of the group currently held in 'synonyms'.
    int spaces = 0;
    while (ts.incrementToken()) {
        int posInc = posAtt.getPositionIncrement();
        if (posInc > 0) {
            // A new position begins: flush the pending group. 'spaces' still holds the gap that
            // PRECEDED that group (it is only overwritten below, after the flush), which is why the
            // extension is applied backwards -- extend(source, spaces, 0) -- and not forwards.
            if (synonyms.size() == 1) {
                terms.add(extend(synonyms.get(0), spaces));
            } else if (synonyms.size() > 1) {
                terms.add(extend(Intervals.or(synonyms.toArray(new IntervalsSource[0])), spaces));
            }
            synonyms.clear();
            // Remember the gap in front of the group that starts with the current token.
            spaces = posInc - 1;
        }
        synonyms.add(Intervals.term(BytesRef.deepCopyOf(bytesAtt.getBytesRef())));
    }
    // Finish consuming the stream, matching what analyzeTerms does after its read loop.
    ts.end();
    // Flush the final group, again extended over the gap that preceded it.
    if (synonyms.size() == 1) {
        terms.add(extend(synonyms.get(0), spaces));
    } else {
        terms.add(extend(Intervals.or(synonyms.toArray(new IntervalsSource[0])), spaces));
    }
    return combineSources(terms, maxGaps, ordered);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,22 @@ public void testPhrase() throws IOException {

}

public void testPhraseWithStopword() throws IOException {
    // term1 [] term3 -- the second token is created with a position increment of 2
    Token leading = new Token("term1", 1, 1, 2);
    Token afterGap = new Token("term3", 2, 5, 6);
    CachingTokenFilter stream = new CachingTokenFilter(new CannedTokenStream(leading, afterGap));

    IntervalsSource actual = BUILDER.analyzeText(stream, 0, true);

    // The token after the gap is expected to be extended one position backwards.
    IntervalsSource gapCovered = Intervals.extend(Intervals.term("term3"), 1, 0);
    IntervalsSource expected = Intervals.phrase(Intervals.term("term1"), gapCovered);

    assertEquals(expected, actual);
}

public void testSimpleSynonyms() throws IOException {

CannedTokenStream ts = new CannedTokenStream(
Expand All @@ -112,16 +128,32 @@ public void testSimpleSynonyms() throws IOException {

}

public void testGraphSynonyms() throws IOException {
public void testSimpleSynonymsWithGap() throws IOException {
    // term1 [] term2/term3/term4 term5
    CannedTokenStream tokens = new CannedTokenStream(
        new Token("term1", 1, 2),
        new Token("term2", 2, 3, 4),
        new Token("term3", 0, 3, 4),
        new Token("term4", 0, 3, 4),
        new Token("term5", 5, 6)
    );
    IntervalsSource actual = BUILDER.analyzeText(new CachingTokenFilter(tokens), -1, true);

    // The whole synonym group is expected to be extended one position backwards over the gap.
    IntervalsSource synonymGroup = Intervals.or(
        Intervals.term("term2"),
        Intervals.term("term3"),
        Intervals.term("term4"));
    IntervalsSource expected = Intervals.ordered(
        Intervals.term("term1"),
        Intervals.extend(synonymGroup, 1, 0),
        Intervals.term("term5"));

    assertEquals(expected, actual);
}

// term1 term2/term3:2 term4 term5
public void testGraphSynonyms() throws IOException {

Token graphToken = new Token("term2", 3, 4);
graphToken.setPositionLength(2);
// term1 term2:2/term3 term4 term5

CannedTokenStream ts = new CannedTokenStream(
new Token("term1", 1, 2),
graphToken,
new Token("term2", 1, 3, 4, 2),
new Token("term3", 0, 3, 4),
new Token("term4", 5, 6),
new Token("term5", 6, 7)
Expand All @@ -138,4 +170,50 @@ public void testGraphSynonyms() throws IOException {

}

public void testGraphSynonymsWithGaps() throws IOException {
    // term1 [] term2:4/term3 [] [] term4 term5
    CannedTokenStream tokens = new CannedTokenStream(
        new Token("term1", 1, 2),
        new Token("term2", 2, 3, 4, 4),
        new Token("term3", 0, 3, 4),
        new Token("term4", 3, 5, 6),
        new Token("term5", 6, 7)
    );
    IntervalsSource actual = BUILDER.analyzeText(new CachingTokenFilter(tokens), -1, true);

    // First alternative: the multi-position token term2, extended over the gap in front of it.
    IntervalsSource longPath = Intervals.extend(Intervals.term("term2"), 1, 0);
    // Second alternative: term3 then term4 as a phrase, each extended over the gap in front of it.
    IntervalsSource shortPath = Intervals.phrase(
        Intervals.extend(Intervals.term("term3"), 1, 0),
        Intervals.extend(Intervals.term("term4"), 2, 0));
    IntervalsSource expected = Intervals.ordered(
        Intervals.term("term1"),
        Intervals.or(longPath, shortPath),
        Intervals.term("term5"));

    assertEquals(expected, actual);
}

public void testGraphTerminatesOnGap() throws IOException {
    // term1 term2:2/term3 term4 [] term5
    CannedTokenStream tokens = new CannedTokenStream(
        new Token("term1", 1, 2),
        new Token("term2", 1, 2, 3, 2),
        new Token("term3", 0, 2, 3),
        new Token("term4", 2, 3),
        new Token("term5", 2, 6, 7)
    );
    IntervalsSource actual = BUILDER.analyzeText(new CachingTokenFilter(tokens), -1, true);

    // The graph section is expected to end before the gap; only term5 carries the extension.
    IntervalsSource graphSection = Intervals.or(
        Intervals.term("term2"),
        Intervals.phrase("term3", "term4"));
    IntervalsSource trailing = Intervals.extend(Intervals.term("term5"), 1, 0);
    IntervalsSource expected = Intervals.ordered(Intervals.term("term1"), graphSection, trailing);

    assertEquals(expected, actual);
}
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you also add a test with a simple word synonym that starts after a gap?


}