LUCENE-5146: AnalyzingSuggester sort order doesn't respect the actual weight

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1508382 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Simon Willnauer 2013-07-30 10:45:38 +00:00
parent 82a5f35c20
commit 1a84b76403
3 changed files with 89 additions and 33 deletions

View File

@ -95,6 +95,11 @@ Bug Fixes
* LUCENE-5132: Spatial RecursivePrefixTree Contains predicate will throw an NPE * LUCENE-5132: Spatial RecursivePrefixTree Contains predicate will throw an NPE
when there's no indexed data and maybe in other circumstances too. (David Smiley) when there's no indexed data and maybe in other circumstances too. (David Smiley)
* LUCENE-5146: AnalyzingSuggester sort comparator read part of the input key as the
weight that caused the sorter to never sort by weight first since the weight is only
considered if the input is equal causing the malformed weight to be identical as well.
(Simon Willnauer)
API Changes API Changes
* LUCENE-5094: Add ramBytesUsed() to MultiDocValues.OrdinalMap. * LUCENE-5094: Add ramBytesUsed() to MultiDocValues.OrdinalMap.

View File

@ -349,11 +349,14 @@ public class AnalyzingSuggester extends Lookup {
if (cmp != 0) { if (cmp != 0) {
return cmp; return cmp;
} }
readerA.skipBytes(scratchA.length);
readerB.skipBytes(scratchB.length);
// Next by cost: // Next by cost:
long aCost = readerA.readInt(); long aCost = readerA.readInt();
long bCost = readerB.readInt(); long bCost = readerB.readInt();
assert decodeWeight(aCost) >= 0;
assert decodeWeight(bCost) >= 0;
if (aCost < bCost) { if (aCost < bCost) {
return -1; return -1;
} else if (aCost > bCost) { } else if (aCost > bCost) {
@ -362,25 +365,18 @@ public class AnalyzingSuggester extends Lookup {
// Finally by surface form: // Finally by surface form:
if (hasPayloads) { if (hasPayloads) {
readerA.setPosition(readerA.getPosition() + scratchA.length);
scratchA.length = readerA.readShort(); scratchA.length = readerA.readShort();
scratchA.offset = readerA.getPosition();
readerB.setPosition(readerB.getPosition() + scratchB.length);
scratchB.length = readerB.readShort(); scratchB.length = readerB.readShort();
scratchA.offset = readerA.getPosition();
scratchB.offset = readerB.getPosition(); scratchB.offset = readerB.getPosition();
} else { } else {
scratchA.offset = readerA.getPosition(); scratchA.offset = readerA.getPosition();
scratchA.length = a.length - scratchA.offset;
scratchB.offset = readerB.getPosition(); scratchB.offset = readerB.getPosition();
scratchA.length = a.length - scratchA.offset;
scratchB.length = b.length - scratchB.offset; scratchB.length = b.length - scratchB.offset;
} }
cmp = scratchA.compareTo(scratchB); return scratchA.compareTo(scratchB);
if (cmp != 0) {
return cmp;
}
return 0;
} }
} }

View File

@ -28,8 +28,11 @@ import java.util.ArrayList;
import java.util.Arrays; import java.util.Arrays;
import java.util.Collections; import java.util.Collections;
import java.util.Comparator; import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet; import java.util.HashSet;
import java.util.List; import java.util.List;
import java.util.Map;
import java.util.Random;
import java.util.Set; import java.util.Set;
import java.util.TreeSet; import java.util.TreeSet;
@ -47,12 +50,14 @@ import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.document.Document;
import org.apache.lucene.search.suggest.Lookup.LookupResult; import org.apache.lucene.search.suggest.Lookup.LookupResult;
import org.apache.lucene.search.suggest.TermFreq; import org.apache.lucene.search.suggest.TermFreq;
import org.apache.lucene.search.suggest.TermFreqArrayIterator; import org.apache.lucene.search.suggest.TermFreqArrayIterator;
import org.apache.lucene.search.suggest.TermFreqPayload; import org.apache.lucene.search.suggest.TermFreqPayload;
import org.apache.lucene.search.suggest.TermFreqPayloadArrayIterator; import org.apache.lucene.search.suggest.TermFreqPayloadArrayIterator;
import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.LineFileDocs;
import org.apache.lucene.util.LuceneTestCase; import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util._TestUtil; import org.apache.lucene.util._TestUtil;
@ -60,12 +65,15 @@ public class AnalyzingSuggesterTest extends LuceneTestCase {
/** this is basically the WFST test ported to KeywordAnalyzer. so it acts the same */ /** this is basically the WFST test ported to KeywordAnalyzer. so it acts the same */
public void testKeyword() throws Exception { public void testKeyword() throws Exception {
TermFreq keys[] = new TermFreq[] { Iterable<TermFreq> keys = shuffle(
new TermFreq("foo", 50), new TermFreq("foo", 50),
new TermFreq("bar", 10), new TermFreq("bar", 10),
new TermFreq("barbar", 10),
new TermFreq("barbar", 12), new TermFreq("barbar", 12),
new TermFreq("barbara", 6) new TermFreq("barbara", 6),
}; new TermFreq("bar", 5),
new TermFreq("barbara", 1)
);
AnalyzingSuggester suggester = new AnalyzingSuggester(new MockAnalyzer(random(), MockTokenizer.KEYWORD, false)); AnalyzingSuggester suggester = new AnalyzingSuggester(new MockAnalyzer(random(), MockTokenizer.KEYWORD, false));
suggester.build(new TermFreqArrayIterator(keys)); suggester.build(new TermFreqArrayIterator(keys));
@ -103,12 +111,13 @@ public class AnalyzingSuggesterTest extends LuceneTestCase {
} }
public void testKeywordWithPayloads() throws Exception { public void testKeywordWithPayloads() throws Exception {
TermFreqPayload keys[] = new TermFreqPayload[] { Iterable<TermFreqPayload> keys = shuffle(
new TermFreqPayload("foo", 50, new BytesRef("hello")), new TermFreqPayload("foo", 50, new BytesRef("hello")),
new TermFreqPayload("bar", 10, new BytesRef("goodbye")), new TermFreqPayload("bar", 10, new BytesRef("goodbye")),
new TermFreqPayload("barbar", 12, new BytesRef("thank you")), new TermFreqPayload("barbar", 12, new BytesRef("thank you")),
new TermFreqPayload("barbara", 6, new BytesRef("for all the fish")) new TermFreqPayload("bar", 9, new BytesRef("should be deduplicated")),
}; new TermFreqPayload("bar", 8, new BytesRef("should also be deduplicated")),
new TermFreqPayload("barbara", 6, new BytesRef("for all the fish")));
AnalyzingSuggester suggester = new AnalyzingSuggester(new MockAnalyzer(random(), MockTokenizer.KEYWORD, false)); AnalyzingSuggester suggester = new AnalyzingSuggester(new MockAnalyzer(random(), MockTokenizer.KEYWORD, false));
suggester.build(new TermFreqPayloadArrayIterator(keys)); suggester.build(new TermFreqPayloadArrayIterator(keys));
@ -153,6 +162,50 @@ public class AnalyzingSuggesterTest extends LuceneTestCase {
} }
} }
public void testRandomRealisticKeys() throws IOException {
LineFileDocs lineFile = new LineFileDocs(random());
Map<String, Long> mapping = new HashMap<>();
List<TermFreq> keys = new ArrayList<>();
int howMany = atLeast(100); // this might bring up duplicates
for (int i = 0; i < howMany; i++) {
Document nextDoc = lineFile.nextDoc();
String title = nextDoc.getField("title").stringValue();
int randomWeight = random().nextInt(100);
keys.add(new TermFreq(title, randomWeight));
if (!mapping.containsKey(title) || mapping.get(title) < randomWeight) {
mapping.put(title, Long.valueOf(randomWeight));
}
}
AnalyzingSuggester analyzingSuggester = new AnalyzingSuggester(new MockAnalyzer(random()));
analyzingSuggester.setPreservePositionIncrements(random().nextBoolean());
boolean doPayloads = random().nextBoolean();
if (doPayloads) {
List<TermFreqPayload> keysAndPayloads = new ArrayList<>();
for (TermFreq termFreq : keys) {
keysAndPayloads.add(new TermFreqPayload(termFreq.term, termFreq.v, new BytesRef(Long.toString(termFreq.v))));
}
analyzingSuggester.build(new TermFreqPayloadArrayIterator(keysAndPayloads));
} else {
analyzingSuggester.build(new TermFreqArrayIterator(keys));
}
for (TermFreq termFreq : keys) {
List<LookupResult> lookup = analyzingSuggester.lookup(termFreq.term.utf8ToString(), false, keys.size());
for (LookupResult lookupResult : lookup) {
assertEquals(mapping.get(lookupResult.key), Long.valueOf(lookupResult.value));
if (doPayloads) {
assertEquals(lookupResult.payload.utf8ToString(), Long.toString(lookupResult.value));
} else {
assertNull(lookupResult.payload);
}
}
}
lineFile.close();
}
// TODO: more tests // TODO: more tests
/** /**
* basic "standardanalyzer" test with stopword removal * basic "standardanalyzer" test with stopword removal
@ -703,9 +756,9 @@ public class AnalyzingSuggesterTest extends LuceneTestCase {
AnalyzingSuggester suggester = new AnalyzingSuggester(a, a, AnalyzingSuggester suggester = new AnalyzingSuggester(a, a,
preserveSep ? AnalyzingSuggester.PRESERVE_SEP : 0, 256, -1); preserveSep ? AnalyzingSuggester.PRESERVE_SEP : 0, 256, -1);
if (doPayloads) { if (doPayloads) {
suggester.build(new TermFreqPayloadArrayIterator(payloadKeys)); suggester.build(new TermFreqPayloadArrayIterator(shuffle(payloadKeys)));
} else { } else {
suggester.build(new TermFreqArrayIterator(keys)); suggester.build(new TermFreqArrayIterator(shuffle(keys)));
} }
for (String prefix : allPrefixes) { for (String prefix : allPrefixes) {
@ -823,15 +876,8 @@ public class AnalyzingSuggesterTest extends LuceneTestCase {
public void testMaxSurfaceFormsPerAnalyzedForm() throws Exception { public void testMaxSurfaceFormsPerAnalyzedForm() throws Exception {
Analyzer a = new MockAnalyzer(random()); Analyzer a = new MockAnalyzer(random());
AnalyzingSuggester suggester = new AnalyzingSuggester(a, a, 0, 2, -1); AnalyzingSuggester suggester = new AnalyzingSuggester(a, a, 0, 2, -1);
suggester.build(new TermFreqArrayIterator(shuffle(new TermFreq("a", 40),
List<TermFreq> keys = Arrays.asList(new TermFreq[] { new TermFreq("a ", 50), new TermFreq(" a", 60))));
new TermFreq("a", 40),
new TermFreq("a ", 50),
new TermFreq(" a", 60),
});
Collections.shuffle(keys, random());
suggester.build(new TermFreqArrayIterator(keys));
List<LookupResult> results = suggester.lookup("a", false, 5); List<LookupResult> results = suggester.lookup("a", false, 5);
assertEquals(2, results.size()); assertEquals(2, results.size());
@ -926,10 +972,9 @@ public class AnalyzingSuggesterTest extends LuceneTestCase {
AnalyzingSuggester suggester = new AnalyzingSuggester(a, a, 0, 256, -1); AnalyzingSuggester suggester = new AnalyzingSuggester(a, a, 0, 256, -1);
suggester.build(new TermFreqArrayIterator(new TermFreq[] { suggester.build(new TermFreqArrayIterator(shuffle(
new TermFreq("hambone", 6), new TermFreq("hambone", 6),
new TermFreq("nellie", 5), new TermFreq("nellie", 5))));
}));
List<LookupResult> results = suggester.lookup("nellie", false, 2); List<LookupResult> results = suggester.lookup("nellie", false, 2);
assertEquals(2, results.size()); assertEquals(2, results.size());
@ -1147,4 +1192,14 @@ public class AnalyzingSuggesterTest extends LuceneTestCase {
// expected // expected
} }
} }
@SafeVarargs
public final <T> Iterable<T> shuffle(T...values) {
final List<T> asList = new ArrayList<T>(values.length);
for (T value : values) {
asList.add(value);
}
Collections.shuffle(asList, random());
return asList;
}
} }