Add stopword support to IntervalBuilder (#39637)
The match interval builder analyses input text and converts it to an IntervalsSource, and as such may have to handle token streams from which stopwords have been removed. This commit deals with the position gaps left by those stopwords by using the `Intervals.extend` factory to cover them, so that phrase and ordered queries work correctly.
This commit is contained in:
parent
e7dbfda5d3
commit
0b14782b23
|
@ -143,38 +143,50 @@ public class IntervalBuilder {
|
|||
protected List<IntervalsSource> analyzeTerms(TokenStream ts) throws IOException {
|
||||
List<IntervalsSource> terms = new ArrayList<>();
|
||||
TermToBytesRefAttribute bytesAtt = ts.addAttribute(TermToBytesRefAttribute.class);
|
||||
PositionIncrementAttribute posAtt = ts.addAttribute(PositionIncrementAttribute.class);
|
||||
ts.reset();
|
||||
while (ts.incrementToken()) {
|
||||
BytesRef term = bytesAtt.getBytesRef();
|
||||
terms.add(Intervals.term(BytesRef.deepCopyOf(term)));
|
||||
int precedingSpaces = posAtt.getPositionIncrement() - 1;
|
||||
terms.add(extend(Intervals.term(BytesRef.deepCopyOf(term)), precedingSpaces));
|
||||
}
|
||||
ts.end();
|
||||
return terms;
|
||||
}
|
||||
|
||||
public static IntervalsSource extend(IntervalsSource source, int precedingSpaces) {
|
||||
if (precedingSpaces == 0) {
|
||||
return source;
|
||||
}
|
||||
return Intervals.extend(source, precedingSpaces, 0);
|
||||
}
|
||||
|
||||
protected IntervalsSource analyzeSynonyms(TokenStream ts, int maxGaps, boolean ordered) throws IOException {
|
||||
List<IntervalsSource> terms = new ArrayList<>();
|
||||
List<IntervalsSource> synonyms = new ArrayList<>();
|
||||
TermToBytesRefAttribute bytesAtt = ts.addAttribute(TermToBytesRefAttribute.class);
|
||||
PositionIncrementAttribute posAtt = ts.addAttribute(PositionIncrementAttribute.class);
|
||||
ts.reset();
|
||||
int spaces = 0;
|
||||
while (ts.incrementToken()) {
|
||||
if (posAtt.getPositionIncrement() == 1) {
|
||||
int posInc = posAtt.getPositionIncrement();
|
||||
if (posInc > 0) {
|
||||
if (synonyms.size() == 1) {
|
||||
terms.add(synonyms.get(0));
|
||||
terms.add(extend(synonyms.get(0), spaces));
|
||||
}
|
||||
else if (synonyms.size() > 1) {
|
||||
terms.add(Intervals.or(synonyms.toArray(new IntervalsSource[0])));
|
||||
terms.add(extend(Intervals.or(synonyms.toArray(new IntervalsSource[0])), spaces));
|
||||
}
|
||||
synonyms.clear();
|
||||
spaces = posInc - 1;
|
||||
}
|
||||
synonyms.add(Intervals.term(BytesRef.deepCopyOf(bytesAtt.getBytesRef())));
|
||||
}
|
||||
if (synonyms.size() == 1) {
|
||||
terms.add(synonyms.get(0));
|
||||
terms.add(extend(synonyms.get(0), spaces));
|
||||
}
|
||||
else {
|
||||
terms.add(Intervals.or(synonyms.toArray(new IntervalsSource[0])));
|
||||
terms.add(extend(Intervals.or(synonyms.toArray(new IntervalsSource[0])), spaces));
|
||||
}
|
||||
return combineSources(terms, maxGaps, ordered);
|
||||
}
|
||||
|
|
|
@ -94,6 +94,22 @@ public class IntervalBuilderTests extends ESTestCase {
|
|||
|
||||
}
|
||||
|
||||
public void testPhraseWithStopword() throws IOException {
|
||||
|
||||
CannedTokenStream ts = new CannedTokenStream(
|
||||
new Token("term1", 1, 1, 2),
|
||||
new Token("term3", 2, 5, 6)
|
||||
);
|
||||
|
||||
IntervalsSource source = BUILDER.analyzeText(new CachingTokenFilter(ts), 0, true);
|
||||
IntervalsSource expected = Intervals.phrase(
|
||||
Intervals.term("term1"), Intervals.extend(Intervals.term("term3"), 1, 0)
|
||||
);
|
||||
|
||||
assertEquals(expected, source);
|
||||
|
||||
}
|
||||
|
||||
public void testSimpleSynonyms() throws IOException {
|
||||
|
||||
CannedTokenStream ts = new CannedTokenStream(
|
||||
|
@ -112,16 +128,32 @@ public class IntervalBuilderTests extends ESTestCase {
|
|||
|
||||
}
|
||||
|
||||
public void testSimpleSynonymsWithGap() throws IOException {
|
||||
// term1 [] term2/term3/term4 term5
|
||||
CannedTokenStream ts = new CannedTokenStream(
|
||||
new Token("term1", 1, 2),
|
||||
new Token("term2", 2, 3, 4),
|
||||
new Token("term3", 0, 3, 4),
|
||||
new Token("term4", 0, 3, 4),
|
||||
new Token("term5", 5, 6)
|
||||
);
|
||||
|
||||
IntervalsSource source = BUILDER.analyzeText(new CachingTokenFilter(ts), -1, true);
|
||||
IntervalsSource expected = Intervals.ordered(
|
||||
Intervals.term("term1"),
|
||||
Intervals.extend(Intervals.or(Intervals.term("term2"), Intervals.term("term3"), Intervals.term("term4")), 1, 0),
|
||||
Intervals.term("term5")
|
||||
);
|
||||
assertEquals(expected, source);
|
||||
}
|
||||
|
||||
public void testGraphSynonyms() throws IOException {
|
||||
|
||||
// term1 term2/term3:2 term4 term5
|
||||
|
||||
Token graphToken = new Token("term2", 3, 4);
|
||||
graphToken.setPositionLength(2);
|
||||
// term1 term2:2/term3 term4 term5
|
||||
|
||||
CannedTokenStream ts = new CannedTokenStream(
|
||||
new Token("term1", 1, 2),
|
||||
graphToken,
|
||||
new Token("term2", 1, 3, 4, 2),
|
||||
new Token("term3", 0, 3, 4),
|
||||
new Token("term4", 5, 6),
|
||||
new Token("term5", 6, 7)
|
||||
|
@ -138,4 +170,50 @@ public class IntervalBuilderTests extends ESTestCase {
|
|||
|
||||
}
|
||||
|
||||
public void testGraphSynonymsWithGaps() throws IOException {
|
||||
|
||||
// term1 [] term2:4/term3 [] [] term4 term5
|
||||
|
||||
CannedTokenStream ts = new CannedTokenStream(
|
||||
new Token("term1", 1, 2),
|
||||
new Token("term2", 2, 3, 4, 4),
|
||||
new Token("term3", 0, 3, 4),
|
||||
new Token("term4", 3, 5, 6),
|
||||
new Token("term5", 6, 7)
|
||||
);
|
||||
|
||||
IntervalsSource source = BUILDER.analyzeText(new CachingTokenFilter(ts), -1, true);
|
||||
IntervalsSource expected = Intervals.ordered(
|
||||
Intervals.term("term1"),
|
||||
Intervals.or(
|
||||
Intervals.extend(Intervals.term("term2"), 1, 0),
|
||||
Intervals.phrase(
|
||||
Intervals.extend(Intervals.term("term3"), 1, 0),
|
||||
Intervals.extend(Intervals.term("term4"), 2, 0))),
|
||||
Intervals.term("term5")
|
||||
);
|
||||
|
||||
assertEquals(expected, source);
|
||||
|
||||
}
|
||||
|
||||
public void testGraphTerminatesOnGap() throws IOException {
|
||||
// term1 term2:2/term3 term4 [] term5
|
||||
CannedTokenStream ts = new CannedTokenStream(
|
||||
new Token("term1", 1, 2),
|
||||
new Token("term2", 1, 2, 3, 2),
|
||||
new Token("term3", 0, 2, 3),
|
||||
new Token("term4", 2, 3),
|
||||
new Token("term5", 2, 6, 7)
|
||||
);
|
||||
|
||||
IntervalsSource source = BUILDER.analyzeText(new CachingTokenFilter(ts), -1, true);
|
||||
IntervalsSource expected = Intervals.ordered(
|
||||
Intervals.term("term1"),
|
||||
Intervals.or(Intervals.term("term2"), Intervals.phrase("term3", "term4")),
|
||||
Intervals.extend(Intervals.term("term5"), 1, 0)
|
||||
);
|
||||
assertEquals(expected, source);
|
||||
}
|
||||
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue