From 87647f06df4ed7bf80883c2f9d438e1825eeca04 Mon Sep 17 00:00:00 2001 From: Alan Woodward Date: Mon, 18 Feb 2019 14:45:35 +0000 Subject: [PATCH 1/4] Use Intervals.extend() to preserve stopword gaps when building interval queries - requires LUCENE-8697 --- .../index/query/IntervalBuilder.java | 23 ++++++-- .../index/query/IntervalBuilderTests.java | 56 +++++++++++++++++-- 2 files changed, 68 insertions(+), 11 deletions(-) diff --git a/server/src/main/java/org/elasticsearch/index/query/IntervalBuilder.java b/server/src/main/java/org/elasticsearch/index/query/IntervalBuilder.java index b39f2ab5a91e6..6e0746b2571df 100644 --- a/server/src/main/java/org/elasticsearch/index/query/IntervalBuilder.java +++ b/server/src/main/java/org/elasticsearch/index/query/IntervalBuilder.java @@ -143,15 +143,24 @@ protected static IntervalsSource combineSources(List sources, i protected List analyzeTerms(TokenStream ts) throws IOException { List terms = new ArrayList<>(); TermToBytesRefAttribute bytesAtt = ts.addAttribute(TermToBytesRefAttribute.class); + PositionIncrementAttribute posAtt = ts.addAttribute(PositionIncrementAttribute.class); ts.reset(); while (ts.incrementToken()) { BytesRef term = bytesAtt.getBytesRef(); - terms.add(Intervals.term(BytesRef.deepCopyOf(term))); + int precedingSpaces = posAtt.getPositionIncrement() - 1; + terms.add(extend(Intervals.term(BytesRef.deepCopyOf(term)), precedingSpaces)); } ts.end(); return terms; } + public static IntervalsSource extend(IntervalsSource source, int precedingSpaces) { + if (precedingSpaces == 0) { + return source; + } + return Intervals.extend(source, precedingSpaces, 0); + } + protected IntervalsSource analyzeSynonyms(TokenStream ts, int maxGaps, boolean ordered) throws IOException { List terms = new ArrayList<>(); List synonyms = new ArrayList<>(); @@ -159,22 +168,24 @@ protected IntervalsSource analyzeSynonyms(TokenStream ts, int maxGaps, boolean o PositionIncrementAttribute posAtt = ts.addAttribute(PositionIncrementAttribute.class); ts.reset(); while (ts.incrementToken()) { - if (posAtt.getPositionIncrement() == 1) { + int spaces = posAtt.getPositionIncrement() - 1; + if (spaces >= 0) { if (synonyms.size() == 1) { - terms.add(synonyms.get(0)); + terms.add(extend(synonyms.get(0), spaces)); } else if (synonyms.size() > 1) { - terms.add(Intervals.or(synonyms.toArray(new IntervalsSource[0]))); + terms.add(extend(Intervals.or(synonyms.toArray(new IntervalsSource[0])), spaces)); } synonyms.clear(); } synonyms.add(Intervals.term(BytesRef.deepCopyOf(bytesAtt.getBytesRef()))); } + int spaces = posAtt.getPositionIncrement() - 1; if (synonyms.size() == 1) { - terms.add(synonyms.get(0)); + terms.add(extend(synonyms.get(0), spaces)); } else { - terms.add(Intervals.or(synonyms.toArray(new IntervalsSource[0]))); + terms.add(extend(Intervals.or(synonyms.toArray(new IntervalsSource[0])), spaces)); } return combineSources(terms, maxGaps, ordered); } diff --git a/server/src/test/java/org/elasticsearch/index/query/IntervalBuilderTests.java b/server/src/test/java/org/elasticsearch/index/query/IntervalBuilderTests.java index a565db41516a9..a00846f892ac2 100644 --- a/server/src/test/java/org/elasticsearch/index/query/IntervalBuilderTests.java +++ b/server/src/test/java/org/elasticsearch/index/query/IntervalBuilderTests.java @@ -94,6 +94,22 @@ public void testPhrase() throws IOException { } + public void testPhraseWithStopword() throws IOException { + + CannedTokenStream ts = new CannedTokenStream( + new Token("term1", 1, 1, 2), + new Token("term3", 2, 5, 6) + ); + + IntervalsSource source = BUILDER.analyzeText(new CachingTokenFilter(ts), 0, true); + IntervalsSource expected = Intervals.phrase( + Intervals.term("term1"), Intervals.extend(Intervals.term("term3"), 1, 0) + ); + + assertEquals(expected, source); + + } + public void testSimpleSynonyms() throws IOException { CannedTokenStream ts = new CannedTokenStream( @@ -112,16 +128,19 @@ public void testSimpleSynonyms() throws IOException { } - public void testGraphSynonyms() throws IOException { + private static Token graphToken(String text, int posInc, int posLen, int start, int end) { + Token token = new Token(text, posInc, start, end); + token.setPositionLength(posLen); + return token; + } - // term1 term2/term3:2 term4 term5 + public void testGraphSynonyms() throws IOException { - Token graphToken = new Token("term2", 3, 4); - graphToken.setPositionLength(2); + // term1 term2:2/term3 term4 term5 CannedTokenStream ts = new CannedTokenStream( new Token("term1", 1, 2), - graphToken, + graphToken("term2", 1, 2, 3, 4), new Token("term3", 0, 3, 4), new Token("term4", 5, 6), new Token("term5", 6, 7) @@ -138,4 +157,31 @@ public void testGraphSynonyms() throws IOException { } + public void testGraphSynonymsWithGaps() throws IOException { + + // term1 [] term2:4/term3 [] [] term4 term5 + + CannedTokenStream ts = new CannedTokenStream( + new Token("term1", 1, 2), + graphToken("term2", 2, 4, 3, 4), + new Token("term3", 0, 3, 4), + new Token("term4", 3, 5, 6), + new Token("term5", 6, 7) + ); + + IntervalsSource source = BUILDER.analyzeText(new CachingTokenFilter(ts), -1, true); + IntervalsSource expected = Intervals.ordered( + Intervals.term("term1"), + Intervals.or( + Intervals.extend(Intervals.term("term2"), 1, 0), + Intervals.phrase( + Intervals.extend(Intervals.term("term3"), 1, 0), + Intervals.extend(Intervals.term("term4"), 2, 0))), + Intervals.term("term5") + ); + + assertEquals(expected, source); + + } + } From 3ccac7478983f04ef3ef904d171552cad00c80d3 Mon Sep 17 00:00:00 2001 From: Alan Woodward Date: Mon, 4 Mar 2019 11:45:49 +0000 Subject: [PATCH 2/4] cleanup --- .../index/query/IntervalBuilderTests.java | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/server/src/test/java/org/elasticsearch/index/query/IntervalBuilderTests.java b/server/src/test/java/org/elasticsearch/index/query/IntervalBuilderTests.java index a00846f892ac2..b73f1b82b2844 100644 --- a/server/src/test/java/org/elasticsearch/index/query/IntervalBuilderTests.java +++ b/server/src/test/java/org/elasticsearch/index/query/IntervalBuilderTests.java @@ -128,19 +128,13 @@ public void testSimpleSynonyms() throws IOException { } - private static Token graphToken(String text, int posInc, int posLen, int start, int end) { - Token token = new Token(text, posInc, start, end); - token.setPositionLength(posLen); - return token; - } - public void testGraphSynonyms() throws IOException { // term1 term2:2/term3 term4 term5 CannedTokenStream ts = new CannedTokenStream( new Token("term1", 1, 2), - graphToken("term2", 1, 2, 3, 4), + new Token("term2", 1, 3, 4, 2), new Token("term3", 0, 3, 4), new Token("term4", 5, 6), new Token("term5", 6, 7) @@ -163,7 +157,7 @@ public void testGraphSynonymsWithGaps() throws IOException { CannedTokenStream ts = new CannedTokenStream( new Token("term1", 1, 2), - graphToken("term2", 2, 4, 3, 4), + new Token("term2", 2, 3, 4, 4), new Token("term3", 0, 3, 4), new Token("term4", 3, 5, 6), new Token("term5", 6, 7) From 3b7c1d4124c07177eefc9d006d903c230bb4c46f Mon Sep 17 00:00:00 2001 From: Alan Woodward Date: Mon, 4 Mar 2019 12:05:46 +0000 Subject: [PATCH 3/4] More tests --- .../index/query/IntervalBuilderTests.java | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/server/src/test/java/org/elasticsearch/index/query/IntervalBuilderTests.java b/server/src/test/java/org/elasticsearch/index/query/IntervalBuilderTests.java index b73f1b82b2844..83f6731580e0c 100644 --- a/server/src/test/java/org/elasticsearch/index/query/IntervalBuilderTests.java +++ b/server/src/test/java/org/elasticsearch/index/query/IntervalBuilderTests.java @@ -178,4 +178,23 @@ public void testGraphSynonymsWithGaps() throws IOException { } + public void testGraphTerminatesOnGap() throws IOException { + // term1 term2:2/term3 term4 [] term5 + CannedTokenStream ts = new CannedTokenStream( + new Token("term1", 1, 2), + new Token("term2", 1, 2, 3, 2), + new Token("term3", 0, 2, 3), + new Token("term4", 2, 3), + new Token("term5", 2, 6, 7) + ); + + IntervalsSource source = BUILDER.analyzeText(new CachingTokenFilter(ts), -1, true); + IntervalsSource expected = Intervals.ordered( + Intervals.term("term1"), + Intervals.or(Intervals.term("term2"), Intervals.phrase("term3", "term4")), + Intervals.extend(Intervals.term("term5"), 1, 0) + ); + assertEquals(expected, source); + } + } From b8178d91428cc012b8e5ae45afd8619fc99e2c4b Mon Sep 17 00:00:00 2001 From: Alan Woodward Date: Mon, 4 Mar 2019 13:37:51 +0000 Subject: [PATCH 4/4] Fix simple synonynms --- .../index/query/IntervalBuilder.java | 7 ++++--- .../index/query/IntervalBuilderTests.java | 19 +++++++++++++++++++ 2 files changed, 23 insertions(+), 3 deletions(-) diff --git a/server/src/main/java/org/elasticsearch/index/query/IntervalBuilder.java b/server/src/main/java/org/elasticsearch/index/query/IntervalBuilder.java index 6e0746b2571df..e174b3fd49eee 100644 --- a/server/src/main/java/org/elasticsearch/index/query/IntervalBuilder.java +++ b/server/src/main/java/org/elasticsearch/index/query/IntervalBuilder.java @@ -167,9 +167,10 @@ protected IntervalsSource analyzeSynonyms(TokenStream ts, int maxGaps, boolean o TermToBytesRefAttribute bytesAtt = ts.addAttribute(TermToBytesRefAttribute.class); PositionIncrementAttribute posAtt = ts.addAttribute(PositionIncrementAttribute.class); ts.reset(); + int spaces = 0; while (ts.incrementToken()) { - int spaces = posAtt.getPositionIncrement() - 1; - if (spaces >= 0) { + int posInc = posAtt.getPositionIncrement(); + if (posInc > 0) { if (synonyms.size() == 1) { terms.add(extend(synonyms.get(0), spaces)); } @@ -177,10 +178,10 @@ else if (synonyms.size() > 1) { terms.add(extend(Intervals.or(synonyms.toArray(new IntervalsSource[0])), spaces)); } synonyms.clear(); + spaces = posInc - 1; } synonyms.add(Intervals.term(BytesRef.deepCopyOf(bytesAtt.getBytesRef()))); } - int spaces = posAtt.getPositionIncrement() - 1; if (synonyms.size() == 1) { terms.add(extend(synonyms.get(0), spaces)); } diff --git a/server/src/test/java/org/elasticsearch/index/query/IntervalBuilderTests.java b/server/src/test/java/org/elasticsearch/index/query/IntervalBuilderTests.java index 83f6731580e0c..15ec8af0af2c5 100644 --- a/server/src/test/java/org/elasticsearch/index/query/IntervalBuilderTests.java +++ b/server/src/test/java/org/elasticsearch/index/query/IntervalBuilderTests.java @@ -128,6 +128,25 @@ public void testSimpleSynonyms() throws IOException { } + public void testSimpleSynonymsWithGap() throws IOException { + // term1 [] term2/term3/term4 term5 + CannedTokenStream ts = new CannedTokenStream( + new Token("term1", 1, 2), + new Token("term2", 2, 3, 4), + new Token("term3", 0, 3, 4), + new Token("term4", 0, 3, 4), + new Token("term5", 5, 6) + ); + + IntervalsSource source = BUILDER.analyzeText(new CachingTokenFilter(ts), -1, true); + IntervalsSource expected = Intervals.ordered( + Intervals.term("term1"), + Intervals.extend(Intervals.or(Intervals.term("term2"), Intervals.term("term3"), Intervals.term("term4")), 1, 0), + Intervals.term("term5") + ); + assertEquals(expected, source); + } + public void testGraphSynonyms() throws IOException { // term1 term2:2/term3 term4 term5