mirror of https://github.com/apache/lucene.git
LUCENE-7824: Fix graph query analysis for multi-word synonym rules with common terms (e.g. new york, new york city).
This commit is contained in:
parent
e11bc03098
commit
21362a3ba4
|
@ -131,6 +131,9 @@ Bug Fixes
|
|||
"lucene"/standard query parser, should require " TO " in range queries,
|
||||
and accept "TO" as endpoints in range queries. (hossman, Steve Rowe)
|
||||
|
||||
* LUCENE-7824: Fix graph query analysis for multi-word synonym rules with common terms (e.g. new york, new york city).
|
||||
(Jim Ferenczi)
|
||||
|
||||
Improvements
|
||||
|
||||
* LUCENE-7782: OfflineSorter now passes the total number of items it
|
||||
|
|
|
@ -48,7 +48,6 @@ import static org.apache.lucene.util.automaton.Operations.DEFAULT_MAX_DETERMINIZ
|
|||
* This class also provides helpers to explore the different paths of the {@link Automaton}.
|
||||
*/
|
||||
public final class GraphTokenStreamFiniteStrings {
|
||||
private final Map<BytesRef, Integer> termToID = new HashMap<>();
|
||||
private final Map<Integer, BytesRef> idToTerm = new HashMap<>();
|
||||
private final Map<Integer, Integer> idToInc = new HashMap<>();
|
||||
private final Automaton det;
|
||||
|
@ -247,35 +246,18 @@ public final class GraphTokenStreamFiniteStrings {
|
|||
}
|
||||
|
||||
/**
|
||||
* Gets an integer id for a given term.
|
||||
*
|
||||
* If there are no position gaps for this token then we can reuse the id for the same term if it appeared at another
|
||||
* position without a gap. If we have a position gap generate a new id so we can keep track of the position
|
||||
* increment.
|
||||
* Gets an integer id for a given term and saves the position increment if needed.
|
||||
*/
|
||||
private int getTermID(int incr, int prevIncr, BytesRef term) {
|
||||
assert term != null;
|
||||
boolean isStackedGap = incr == 0 && prevIncr > 1;
|
||||
boolean hasGap = incr > 1;
|
||||
Integer id;
|
||||
if (hasGap || isStackedGap) {
|
||||
id = idToTerm.size();
|
||||
idToTerm.put(id, BytesRef.deepCopyOf(term));
|
||||
|
||||
// stacked token should have the same increment as original token at this position
|
||||
if (isStackedGap) {
|
||||
idToInc.put(id, prevIncr);
|
||||
} else {
|
||||
idToInc.put(id, incr);
|
||||
}
|
||||
} else {
|
||||
id = termToID.get(term);
|
||||
if (id == null) {
|
||||
term = BytesRef.deepCopyOf(term);
|
||||
id = idToTerm.size();
|
||||
termToID.put(term, id);
|
||||
idToTerm.put(id, term);
|
||||
}
|
||||
int id = idToTerm.size();
|
||||
idToTerm.put(id, BytesRef.deepCopyOf(term));
|
||||
// stacked token should have the same increment as original token at this position
|
||||
if (isStackedGap) {
|
||||
idToInc.put(id, prevIncr);
|
||||
} else if (incr > 1) {
|
||||
idToInc.put(id, incr);
|
||||
}
|
||||
return id;
|
||||
}
|
||||
|
|
|
@ -378,6 +378,59 @@ public class TestGraphTokenStreamFiniteStrings extends LuceneTestCase {
|
|||
assertArrayEquals(terms, new Term[] {new Term("field", "network")});
|
||||
}
|
||||
|
||||
/**
 * Regression test for LUCENE-7824: a token graph in which the same term text occurs more than once
 * ("new" twice at the first position, "york" twice further on), as produced by overlapping
 * multi-word synonym rules such as "new york" and "new york city".
 *
 * Verifies the enumerated finite strings, the articulation points, and the per-state side paths
 * and terms of the resulting graph.
 */
public void testStackedGraphWithRepeat() throws Exception {
|
||||
// NOTE(review): the token(...) helper is not visible here; its arguments are presumably
// (term, position increment, position length) - confirm against the helper's definition.
TokenStream ts = new CannedTokenStream(
|
||||
token("ny", 1, 4),
|
||||
token("new", 0, 1),
|
||||
token("new", 0, 3),
|
||||
token("york", 1, 1),
|
||||
token("city", 1, 2),
|
||||
token("york", 1, 1),
|
||||
token("is", 1, 1),
|
||||
token("great", 1, 1)
|
||||
);
|
||||
|
||||
GraphTokenStreamFiniteStrings graph = new GraphTokenStreamFiniteStrings(ts);
|
||||
|
||||
// The whole graph must expand to exactly three paths, in this order, each token with increment 1.
Iterator<TokenStream> it = graph.getFiniteStrings();
|
||||
assertTrue(it.hasNext());
|
||||
assertTokenStream(it.next(), new String[]{"ny", "is", "great"}, new int[]{1, 1, 1});
|
||||
assertTrue(it.hasNext());
|
||||
assertTokenStream(it.next(), new String[]{"new", "york", "city", "is", "great"}, new int[]{1, 1, 1, 1, 1});
|
||||
assertTrue(it.hasNext());
|
||||
assertTokenStream(it.next(), new String[]{"new", "york", "is", "great"}, new int[]{1, 1, 1, 1});
|
||||
assertFalse(it.hasNext());
|
||||
|
||||
// States 4 and 5 are expected to be the only articulation points.
int[] points = graph.articulationPoints();
|
||||
assertArrayEquals(points, new int[] {4, 5});
|
||||
|
||||
// State 0 has side paths: between states 0 and 4 the three synonym variants are enumerated.
assertTrue(graph.hasSidePath(0));
|
||||
it = graph.getFiniteStrings(0, 4);
|
||||
assertTrue(it.hasNext());
|
||||
assertTokenStream(it.next(), new String[]{"ny"}, new int[]{1});
|
||||
assertTrue(it.hasNext());
|
||||
assertTokenStream(it.next(), new String[]{"new", "york", "city"}, new int[]{1, 1, 1});
|
||||
assertTrue(it.hasNext());
|
||||
assertTokenStream(it.next(), new String[]{"new", "york"}, new int[]{1, 1});
|
||||
assertFalse(it.hasNext());
|
||||
|
||||
// State 4 has no side path: a single path and a single term, "is".
assertFalse(graph.hasSidePath(4));
|
||||
it = graph.getFiniteStrings(4, 5);
|
||||
assertTrue(it.hasNext());
|
||||
assertTokenStream(it.next(), new String[]{"is"}, new int[] {1});
|
||||
assertFalse(it.hasNext());
|
||||
Term[] terms = graph.getTerms("field", 4);
|
||||
assertArrayEquals(terms, new Term[] {new Term("field", "is")});
|
||||
|
||||
// State 5 has no side path either: a single path and a single term, "great".
// NOTE(review): an end state of -1 presumably means "up to the end of the graph" - confirm.
assertFalse(graph.hasSidePath(5));
|
||||
it = graph.getFiniteStrings(5, -1);
|
||||
assertTrue(it.hasNext());
|
||||
assertTokenStream(it.next(), new String[]{"great"}, new int[] {1});
|
||||
assertFalse(it.hasNext());
|
||||
terms = graph.getTerms("field", 5);
|
||||
assertArrayEquals(terms, new Term[] {new Term("field", "great")});
|
||||
}
|
||||
|
||||
public void testGraphWithRegularSynonym() throws Exception {
|
||||
TokenStream ts = new CannedTokenStream(
|
||||
token("fast", 1, 1),
|
||||
|
|
Loading…
Reference in New Issue