mirror of https://github.com/apache/lucene.git
LUCENE-5146: AnalyzingSuggester sort order doesn't respect the actual weight
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1508382 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
82a5f35c20
commit
1a84b76403
|
@ -95,6 +95,11 @@ Bug Fixes
|
||||||
* LUCENE-5132: Spatial RecursivePrefixTree Contains predicate will throw an NPE
|
* LUCENE-5132: Spatial RecursivePrefixTree Contains predicate will throw an NPE
|
||||||
when there's no indexed data and maybe in other circumstances too. (David Smiley)
|
when there's no indexed data and maybe in other circumstances too. (David Smiley)
|
||||||
|
|
||||||
|
* LUCENE-5146: AnalyzingSuggester sort comparator read part of the input key as the
|
||||||
|
weight that caused the sorter to never sort by weight first since the weight is only
|
||||||
|
considered if the input is equal causing the malformed weight to be identical as well.
|
||||||
|
(Simon Willnauer)
|
||||||
|
|
||||||
API Changes
|
API Changes
|
||||||
|
|
||||||
* LUCENE-5094: Add ramBytesUsed() to MultiDocValues.OrdinalMap.
|
* LUCENE-5094: Add ramBytesUsed() to MultiDocValues.OrdinalMap.
|
||||||
|
|
|
@ -349,11 +349,14 @@ public class AnalyzingSuggester extends Lookup {
|
||||||
if (cmp != 0) {
|
if (cmp != 0) {
|
||||||
return cmp;
|
return cmp;
|
||||||
}
|
}
|
||||||
|
readerA.skipBytes(scratchA.length);
|
||||||
|
readerB.skipBytes(scratchB.length);
|
||||||
|
|
||||||
// Next by cost:
|
// Next by cost:
|
||||||
long aCost = readerA.readInt();
|
long aCost = readerA.readInt();
|
||||||
long bCost = readerB.readInt();
|
long bCost = readerB.readInt();
|
||||||
|
assert decodeWeight(aCost) >= 0;
|
||||||
|
assert decodeWeight(bCost) >= 0;
|
||||||
if (aCost < bCost) {
|
if (aCost < bCost) {
|
||||||
return -1;
|
return -1;
|
||||||
} else if (aCost > bCost) {
|
} else if (aCost > bCost) {
|
||||||
|
@ -362,25 +365,18 @@ public class AnalyzingSuggester extends Lookup {
|
||||||
|
|
||||||
// Finally by surface form:
|
// Finally by surface form:
|
||||||
if (hasPayloads) {
|
if (hasPayloads) {
|
||||||
readerA.setPosition(readerA.getPosition() + scratchA.length);
|
|
||||||
scratchA.length = readerA.readShort();
|
scratchA.length = readerA.readShort();
|
||||||
scratchA.offset = readerA.getPosition();
|
|
||||||
readerB.setPosition(readerB.getPosition() + scratchB.length);
|
|
||||||
scratchB.length = readerB.readShort();
|
scratchB.length = readerB.readShort();
|
||||||
|
scratchA.offset = readerA.getPosition();
|
||||||
scratchB.offset = readerB.getPosition();
|
scratchB.offset = readerB.getPosition();
|
||||||
} else {
|
} else {
|
||||||
scratchA.offset = readerA.getPosition();
|
scratchA.offset = readerA.getPosition();
|
||||||
scratchA.length = a.length - scratchA.offset;
|
|
||||||
scratchB.offset = readerB.getPosition();
|
scratchB.offset = readerB.getPosition();
|
||||||
|
scratchA.length = a.length - scratchA.offset;
|
||||||
scratchB.length = b.length - scratchB.offset;
|
scratchB.length = b.length - scratchB.offset;
|
||||||
}
|
}
|
||||||
|
|
||||||
cmp = scratchA.compareTo(scratchB);
|
return scratchA.compareTo(scratchB);
|
||||||
if (cmp != 0) {
|
|
||||||
return cmp;
|
|
||||||
}
|
|
||||||
|
|
||||||
return 0;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -28,8 +28,11 @@ import java.util.ArrayList;
|
||||||
import java.util.Arrays;
|
import java.util.Arrays;
|
||||||
import java.util.Collections;
|
import java.util.Collections;
|
||||||
import java.util.Comparator;
|
import java.util.Comparator;
|
||||||
|
import java.util.HashMap;
|
||||||
import java.util.HashSet;
|
import java.util.HashSet;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
import java.util.Map;
|
||||||
|
import java.util.Random;
|
||||||
import java.util.Set;
|
import java.util.Set;
|
||||||
import java.util.TreeSet;
|
import java.util.TreeSet;
|
||||||
|
|
||||||
|
@ -47,12 +50,14 @@ import org.apache.lucene.analysis.TokenStream;
|
||||||
import org.apache.lucene.analysis.Tokenizer;
|
import org.apache.lucene.analysis.Tokenizer;
|
||||||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||||
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
|
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
|
||||||
|
import org.apache.lucene.document.Document;
|
||||||
import org.apache.lucene.search.suggest.Lookup.LookupResult;
|
import org.apache.lucene.search.suggest.Lookup.LookupResult;
|
||||||
import org.apache.lucene.search.suggest.TermFreq;
|
import org.apache.lucene.search.suggest.TermFreq;
|
||||||
import org.apache.lucene.search.suggest.TermFreqArrayIterator;
|
import org.apache.lucene.search.suggest.TermFreqArrayIterator;
|
||||||
import org.apache.lucene.search.suggest.TermFreqPayload;
|
import org.apache.lucene.search.suggest.TermFreqPayload;
|
||||||
import org.apache.lucene.search.suggest.TermFreqPayloadArrayIterator;
|
import org.apache.lucene.search.suggest.TermFreqPayloadArrayIterator;
|
||||||
import org.apache.lucene.util.BytesRef;
|
import org.apache.lucene.util.BytesRef;
|
||||||
|
import org.apache.lucene.util.LineFileDocs;
|
||||||
import org.apache.lucene.util.LuceneTestCase;
|
import org.apache.lucene.util.LuceneTestCase;
|
||||||
import org.apache.lucene.util._TestUtil;
|
import org.apache.lucene.util._TestUtil;
|
||||||
|
|
||||||
|
@ -60,13 +65,16 @@ public class AnalyzingSuggesterTest extends LuceneTestCase {
|
||||||
|
|
||||||
/** this is basically the WFST test ported to KeywordAnalyzer. so it acts the same */
|
/** this is basically the WFST test ported to KeywordAnalyzer. so it acts the same */
|
||||||
public void testKeyword() throws Exception {
|
public void testKeyword() throws Exception {
|
||||||
TermFreq keys[] = new TermFreq[] {
|
Iterable<TermFreq> keys = shuffle(
|
||||||
new TermFreq("foo", 50),
|
new TermFreq("foo", 50),
|
||||||
new TermFreq("bar", 10),
|
new TermFreq("bar", 10),
|
||||||
|
new TermFreq("barbar", 10),
|
||||||
new TermFreq("barbar", 12),
|
new TermFreq("barbar", 12),
|
||||||
new TermFreq("barbara", 6)
|
new TermFreq("barbara", 6),
|
||||||
};
|
new TermFreq("bar", 5),
|
||||||
|
new TermFreq("barbara", 1)
|
||||||
|
);
|
||||||
|
|
||||||
AnalyzingSuggester suggester = new AnalyzingSuggester(new MockAnalyzer(random(), MockTokenizer.KEYWORD, false));
|
AnalyzingSuggester suggester = new AnalyzingSuggester(new MockAnalyzer(random(), MockTokenizer.KEYWORD, false));
|
||||||
suggester.build(new TermFreqArrayIterator(keys));
|
suggester.build(new TermFreqArrayIterator(keys));
|
||||||
|
|
||||||
|
@ -103,12 +111,13 @@ public class AnalyzingSuggesterTest extends LuceneTestCase {
|
||||||
}
|
}
|
||||||
|
|
||||||
public void testKeywordWithPayloads() throws Exception {
|
public void testKeywordWithPayloads() throws Exception {
|
||||||
TermFreqPayload keys[] = new TermFreqPayload[] {
|
Iterable<TermFreqPayload> keys = shuffle(
|
||||||
new TermFreqPayload("foo", 50, new BytesRef("hello")),
|
new TermFreqPayload("foo", 50, new BytesRef("hello")),
|
||||||
new TermFreqPayload("bar", 10, new BytesRef("goodbye")),
|
new TermFreqPayload("bar", 10, new BytesRef("goodbye")),
|
||||||
new TermFreqPayload("barbar", 12, new BytesRef("thank you")),
|
new TermFreqPayload("barbar", 12, new BytesRef("thank you")),
|
||||||
new TermFreqPayload("barbara", 6, new BytesRef("for all the fish"))
|
new TermFreqPayload("bar", 9, new BytesRef("should be deduplicated")),
|
||||||
};
|
new TermFreqPayload("bar", 8, new BytesRef("should also be deduplicated")),
|
||||||
|
new TermFreqPayload("barbara", 6, new BytesRef("for all the fish")));
|
||||||
|
|
||||||
AnalyzingSuggester suggester = new AnalyzingSuggester(new MockAnalyzer(random(), MockTokenizer.KEYWORD, false));
|
AnalyzingSuggester suggester = new AnalyzingSuggester(new MockAnalyzer(random(), MockTokenizer.KEYWORD, false));
|
||||||
suggester.build(new TermFreqPayloadArrayIterator(keys));
|
suggester.build(new TermFreqPayloadArrayIterator(keys));
|
||||||
|
@ -153,6 +162,50 @@ public class AnalyzingSuggesterTest extends LuceneTestCase {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public void testRandomRealisticKeys() throws IOException {
|
||||||
|
LineFileDocs lineFile = new LineFileDocs(random());
|
||||||
|
Map<String, Long> mapping = new HashMap<>();
|
||||||
|
List<TermFreq> keys = new ArrayList<>();
|
||||||
|
|
||||||
|
int howMany = atLeast(100); // this might bring up duplicates
|
||||||
|
for (int i = 0; i < howMany; i++) {
|
||||||
|
Document nextDoc = lineFile.nextDoc();
|
||||||
|
String title = nextDoc.getField("title").stringValue();
|
||||||
|
int randomWeight = random().nextInt(100);
|
||||||
|
keys.add(new TermFreq(title, randomWeight));
|
||||||
|
if (!mapping.containsKey(title) || mapping.get(title) < randomWeight) {
|
||||||
|
mapping.put(title, Long.valueOf(randomWeight));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
AnalyzingSuggester analyzingSuggester = new AnalyzingSuggester(new MockAnalyzer(random()));
|
||||||
|
analyzingSuggester.setPreservePositionIncrements(random().nextBoolean());
|
||||||
|
boolean doPayloads = random().nextBoolean();
|
||||||
|
if (doPayloads) {
|
||||||
|
List<TermFreqPayload> keysAndPayloads = new ArrayList<>();
|
||||||
|
for (TermFreq termFreq : keys) {
|
||||||
|
keysAndPayloads.add(new TermFreqPayload(termFreq.term, termFreq.v, new BytesRef(Long.toString(termFreq.v))));
|
||||||
|
}
|
||||||
|
analyzingSuggester.build(new TermFreqPayloadArrayIterator(keysAndPayloads));
|
||||||
|
} else {
|
||||||
|
analyzingSuggester.build(new TermFreqArrayIterator(keys));
|
||||||
|
}
|
||||||
|
|
||||||
|
for (TermFreq termFreq : keys) {
|
||||||
|
List<LookupResult> lookup = analyzingSuggester.lookup(termFreq.term.utf8ToString(), false, keys.size());
|
||||||
|
for (LookupResult lookupResult : lookup) {
|
||||||
|
assertEquals(mapping.get(lookupResult.key), Long.valueOf(lookupResult.value));
|
||||||
|
if (doPayloads) {
|
||||||
|
assertEquals(lookupResult.payload.utf8ToString(), Long.toString(lookupResult.value));
|
||||||
|
} else {
|
||||||
|
assertNull(lookupResult.payload);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
lineFile.close();
|
||||||
|
}
|
||||||
|
|
||||||
// TODO: more tests
|
// TODO: more tests
|
||||||
/**
|
/**
|
||||||
* basic "standardanalyzer" test with stopword removal
|
* basic "standardanalyzer" test with stopword removal
|
||||||
|
@ -703,9 +756,9 @@ public class AnalyzingSuggesterTest extends LuceneTestCase {
|
||||||
AnalyzingSuggester suggester = new AnalyzingSuggester(a, a,
|
AnalyzingSuggester suggester = new AnalyzingSuggester(a, a,
|
||||||
preserveSep ? AnalyzingSuggester.PRESERVE_SEP : 0, 256, -1);
|
preserveSep ? AnalyzingSuggester.PRESERVE_SEP : 0, 256, -1);
|
||||||
if (doPayloads) {
|
if (doPayloads) {
|
||||||
suggester.build(new TermFreqPayloadArrayIterator(payloadKeys));
|
suggester.build(new TermFreqPayloadArrayIterator(shuffle(payloadKeys)));
|
||||||
} else {
|
} else {
|
||||||
suggester.build(new TermFreqArrayIterator(keys));
|
suggester.build(new TermFreqArrayIterator(shuffle(keys)));
|
||||||
}
|
}
|
||||||
|
|
||||||
for (String prefix : allPrefixes) {
|
for (String prefix : allPrefixes) {
|
||||||
|
@ -823,15 +876,8 @@ public class AnalyzingSuggesterTest extends LuceneTestCase {
|
||||||
public void testMaxSurfaceFormsPerAnalyzedForm() throws Exception {
|
public void testMaxSurfaceFormsPerAnalyzedForm() throws Exception {
|
||||||
Analyzer a = new MockAnalyzer(random());
|
Analyzer a = new MockAnalyzer(random());
|
||||||
AnalyzingSuggester suggester = new AnalyzingSuggester(a, a, 0, 2, -1);
|
AnalyzingSuggester suggester = new AnalyzingSuggester(a, a, 0, 2, -1);
|
||||||
|
suggester.build(new TermFreqArrayIterator(shuffle(new TermFreq("a", 40),
|
||||||
List<TermFreq> keys = Arrays.asList(new TermFreq[] {
|
new TermFreq("a ", 50), new TermFreq(" a", 60))));
|
||||||
new TermFreq("a", 40),
|
|
||||||
new TermFreq("a ", 50),
|
|
||||||
new TermFreq(" a", 60),
|
|
||||||
});
|
|
||||||
|
|
||||||
Collections.shuffle(keys, random());
|
|
||||||
suggester.build(new TermFreqArrayIterator(keys));
|
|
||||||
|
|
||||||
List<LookupResult> results = suggester.lookup("a", false, 5);
|
List<LookupResult> results = suggester.lookup("a", false, 5);
|
||||||
assertEquals(2, results.size());
|
assertEquals(2, results.size());
|
||||||
|
@ -926,10 +972,9 @@ public class AnalyzingSuggesterTest extends LuceneTestCase {
|
||||||
|
|
||||||
AnalyzingSuggester suggester = new AnalyzingSuggester(a, a, 0, 256, -1);
|
AnalyzingSuggester suggester = new AnalyzingSuggester(a, a, 0, 256, -1);
|
||||||
|
|
||||||
suggester.build(new TermFreqArrayIterator(new TermFreq[] {
|
suggester.build(new TermFreqArrayIterator(shuffle(
|
||||||
new TermFreq("hambone", 6),
|
new TermFreq("hambone", 6),
|
||||||
new TermFreq("nellie", 5),
|
new TermFreq("nellie", 5))));
|
||||||
}));
|
|
||||||
|
|
||||||
List<LookupResult> results = suggester.lookup("nellie", false, 2);
|
List<LookupResult> results = suggester.lookup("nellie", false, 2);
|
||||||
assertEquals(2, results.size());
|
assertEquals(2, results.size());
|
||||||
|
@ -1147,4 +1192,14 @@ public class AnalyzingSuggesterTest extends LuceneTestCase {
|
||||||
// expected
|
// expected
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@SafeVarargs
|
||||||
|
public final <T> Iterable<T> shuffle(T...values) {
|
||||||
|
final List<T> asList = new ArrayList<T>(values.length);
|
||||||
|
for (T value : values) {
|
||||||
|
asList.add(value);
|
||||||
|
}
|
||||||
|
Collections.shuffle(asList, random());
|
||||||
|
return asList;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue