Add stopword support to IntervalBuilder (#39637)

The match interval builder analyses input text and converts it to an IntervalsSource, and as such
may have to handle token streams in which stopwords have left position gaps. This commit deals with
these by using the Intervals.extend factory to cover the gaps produced by these stopwords so that
phrase and ordered queries work correctly.
This commit is contained in:
Alan Woodward 2019-03-05 10:31:44 +00:00 committed by Alan Woodward
parent e7dbfda5d3
commit 0b14782b23
2 changed files with 101 additions and 11 deletions

View File

@ -143,38 +143,50 @@ public class IntervalBuilder {
/**
 * Converts every token of the stream into a term source, preserving stream order.
 *
 * A token whose position increment is greater than one is wrapped through
 * {@link #extend(IntervalsSource, int)} so that the positions skipped before it
 * are included in the interval it produces.
 *
 * @param ts the token stream to consume; it is reset, read until exhausted and then ended
 * @return one source per token, in the order the tokens were produced
 * @throws IOException if the token stream fails while being consumed
 */
protected List<IntervalsSource> analyzeTerms(TokenStream ts) throws IOException {
    List<IntervalsSource> terms = new ArrayList<>();
    TermToBytesRefAttribute bytesAtt = ts.addAttribute(TermToBytesRefAttribute.class);
    PositionIncrementAttribute posAtt = ts.addAttribute(PositionIncrementAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
        BytesRef term = bytesAtt.getBytesRef();
        // An increment of 1 means the token directly follows the previous one;
        // anything above that is the number of positions skipped before it.
        int precedingSpaces = posAtt.getPositionIncrement() - 1;
        // Each token contributes exactly one source. Previously the bare term was
        // added as well, which doubled every entry in the returned list.
        terms.add(extend(Intervals.term(BytesRef.deepCopyOf(term)), precedingSpaces));
    }
    ts.end();
    return terms;
}
/**
 * Widens a source backwards by the given number of positions.
 *
 * @param source          the source to widen
 * @param precedingSpaces how many positions before the source should be included
 * @return {@code source} itself when {@code precedingSpaces} is zero, otherwise the
 *         result of {@code Intervals.extend(source, precedingSpaces, 0)}
 */
public static IntervalsSource extend(IntervalsSource source, int precedingSpaces) {
    // Skip the wrapper entirely when there is nothing to cover.
    return precedingSpaces == 0
        ? source
        : Intervals.extend(source, precedingSpaces, 0);
}
protected IntervalsSource analyzeSynonyms(TokenStream ts, int maxGaps, boolean ordered) throws IOException {
List<IntervalsSource> terms = new ArrayList<>();
List<IntervalsSource> synonyms = new ArrayList<>();
TermToBytesRefAttribute bytesAtt = ts.addAttribute(TermToBytesRefAttribute.class);
PositionIncrementAttribute posAtt = ts.addAttribute(PositionIncrementAttribute.class);
ts.reset();
int spaces = 0;
while (ts.incrementToken()) {
if (posAtt.getPositionIncrement() == 1) {
int posInc = posAtt.getPositionIncrement();
if (posInc > 0) {
if (synonyms.size() == 1) {
terms.add(synonyms.get(0));
terms.add(extend(synonyms.get(0), spaces));
}
else if (synonyms.size() > 1) {
terms.add(Intervals.or(synonyms.toArray(new IntervalsSource[0])));
terms.add(extend(Intervals.or(synonyms.toArray(new IntervalsSource[0])), spaces));
}
synonyms.clear();
spaces = posInc - 1;
}
synonyms.add(Intervals.term(BytesRef.deepCopyOf(bytesAtt.getBytesRef())));
}
if (synonyms.size() == 1) {
terms.add(synonyms.get(0));
terms.add(extend(synonyms.get(0), spaces));
}
else {
terms.add(Intervals.or(synonyms.toArray(new IntervalsSource[0])));
terms.add(extend(Intervals.or(synonyms.toArray(new IntervalsSource[0])), spaces));
}
return combineSources(terms, maxGaps, ordered);
}

View File

@ -94,6 +94,22 @@ public class IntervalBuilderTests extends ESTestCase {
}
/**
 * A phrase whose second token has a position increment of 2 must produce a phrase
 * in which that token's source is extended backwards by one position.
 */
public void testPhraseWithStopword() throws IOException {
    // term1 [] term3
    Token leading = new Token("term1", 1, 1, 2);
    Token afterGap = new Token("term3", 2, 5, 6);
    CachingTokenFilter stream = new CachingTokenFilter(new CannedTokenStream(leading, afterGap));

    IntervalsSource actual = BUILDER.analyzeText(stream, 0, true);

    IntervalsSource gapCoveringTerm = Intervals.extend(Intervals.term("term3"), 1, 0);
    IntervalsSource wanted = Intervals.phrase(Intervals.term("term1"), gapCoveringTerm);
    assertEquals(wanted, actual);
}
public void testSimpleSynonyms() throws IOException {
CannedTokenStream ts = new CannedTokenStream(
@ -112,16 +128,32 @@ public class IntervalBuilderTests extends ESTestCase {
}
/**
 * A group of stacked synonyms that follows a one-position gap must be emitted as a
 * single disjunction extended backwards by one position.
 */
public void testSimpleSynonymsWithGap() throws IOException {
    // term1 [] term2/term3/term4 term5
    Token[] tokens = {
        new Token("term1", 1, 2),
        new Token("term2", 2, 3, 4),
        new Token("term3", 0, 3, 4),
        new Token("term4", 0, 3, 4),
        new Token("term5", 5, 6)
    };
    CachingTokenFilter stream = new CachingTokenFilter(new CannedTokenStream(tokens));

    IntervalsSource actual = BUILDER.analyzeText(stream, -1, true);

    IntervalsSource synonymGroup =
        Intervals.or(Intervals.term("term2"), Intervals.term("term3"), Intervals.term("term4"));
    IntervalsSource wanted = Intervals.ordered(
        Intervals.term("term1"),
        Intervals.extend(synonymGroup, 1, 0),
        Intervals.term("term5")
    );
    assertEquals(wanted, actual);
}
public void testGraphSynonyms() throws IOException {
// term1 term2/term3:2 term4 term5
Token graphToken = new Token("term2", 3, 4);
graphToken.setPositionLength(2);
// term1 term2:2/term3 term4 term5
CannedTokenStream ts = new CannedTokenStream(
new Token("term1", 1, 2),
graphToken,
new Token("term2", 1, 3, 4, 2),
new Token("term3", 0, 3, 4),
new Token("term4", 5, 6),
new Token("term5", 6, 7)
@ -138,4 +170,50 @@ public class IntervalBuilderTests extends ESTestCase {
}
/**
 * A graph token stream with gaps both before and inside the graph must yield a
 * disjunction whose branches carry the matching backwards extensions.
 */
public void testGraphSynonymsWithGaps() throws IOException {
    // term1 [] term2:4/term3 [] [] term4 term5
    Token[] tokens = {
        new Token("term1", 1, 2),
        new Token("term2", 2, 3, 4, 4),
        new Token("term3", 0, 3, 4),
        new Token("term4", 3, 5, 6),
        new Token("term5", 6, 7)
    };
    CachingTokenFilter stream = new CachingTokenFilter(new CannedTokenStream(tokens));

    IntervalsSource actual = BUILDER.analyzeText(stream, -1, true);

    // First branch of the graph: the single long token, one position after term1.
    IntervalsSource longBranch = Intervals.extend(Intervals.term("term2"), 1, 0);
    // Second branch: term3 followed, after two empty positions, by term4.
    IntervalsSource splitBranch = Intervals.phrase(
        Intervals.extend(Intervals.term("term3"), 1, 0),
        Intervals.extend(Intervals.term("term4"), 2, 0));
    IntervalsSource wanted = Intervals.ordered(
        Intervals.term("term1"),
        Intervals.or(longBranch, splitBranch),
        Intervals.term("term5")
    );
    assertEquals(wanted, actual);
}
/**
 * When a gap directly follows a graph, the graph is closed without any extension and
 * the token after the gap is extended backwards by one position.
 */
public void testGraphTerminatesOnGap() throws IOException {
    // term1 term2:2/term3 term4 [] term5
    Token[] tokens = {
        new Token("term1", 1, 2),
        new Token("term2", 1, 2, 3, 2),
        new Token("term3", 0, 2, 3),
        new Token("term4", 2, 3),
        new Token("term5", 2, 6, 7)
    };
    CachingTokenFilter stream = new CachingTokenFilter(new CannedTokenStream(tokens));

    IntervalsSource actual = BUILDER.analyzeText(stream, -1, true);

    IntervalsSource graph = Intervals.or(Intervals.term("term2"), Intervals.phrase("term3", "term4"));
    IntervalsSource trailingTerm = Intervals.extend(Intervals.term("term5"), 1, 0);
    IntervalsSource wanted = Intervals.ordered(Intervals.term("term1"), graph, trailingTerm);
    assertEquals(wanted, actual);
}
}