From 1a84b76403ed68771884f2cbab078d92efa96195 Mon Sep 17 00:00:00 2001 From: Simon Willnauer Date: Tue, 30 Jul 2013 10:45:38 +0000 Subject: [PATCH] LUCENE-5146: AnalyzingSuggester sort order doesn't respect the actual weight git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1508382 13f79535-47bb-0310-9956-ffa450edef68 --- lucene/CHANGES.txt | 5 + .../suggest/analyzing/AnalyzingSuggester.java | 20 ++-- .../analyzing/AnalyzingSuggesterTest.java | 97 +++++++++++++++---- 3 files changed, 89 insertions(+), 33 deletions(-) diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 60118c53e29..242dbaa042f 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -95,6 +95,11 @@ Bug Fixes * LUCENE-5132: Spatial RecursivePrefixTree Contains predicate will throw an NPE when there's no indexed data and maybe in other circumstances too. (David Smiley) +* LUCENE-5146: AnalyzingSuggester sort comparator read part of the input key as the + weight that caused the sorter to never sort by weight first since the weight is only + considered if the input is equal causing the malformed weight to be identical as well. + (Simon Willnauer) + API Changes * LUCENE-5094: Add ramBytesUsed() to MultiDocValues.OrdinalMap. diff --git a/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggester.java b/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggester.java index 7104a9f024c..ba64403d2c3 100644 --- a/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggester.java +++ b/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggester.java @@ -349,11 +349,14 @@ public class AnalyzingSuggester extends Lookup { if (cmp != 0) { return cmp; } + readerA.skipBytes(scratchA.length); + readerB.skipBytes(scratchB.length); // Next by cost: long aCost = readerA.readInt(); long bCost = readerB.readInt(); - + assert decodeWeight(aCost) >= 0; + assert decodeWeight(bCost) >= 0; if (aCost < bCost) { return -1; } else if (aCost > bCost) { @@ -362,25 +365,18 @@ public class AnalyzingSuggester extends Lookup { // Finally by surface form: if (hasPayloads) { - readerA.setPosition(readerA.getPosition() + scratchA.length); scratchA.length = readerA.readShort(); - scratchA.offset = readerA.getPosition(); - readerB.setPosition(readerB.getPosition() + scratchB.length); scratchB.length = readerB.readShort(); + scratchA.offset = readerA.getPosition(); scratchB.offset = readerB.getPosition(); } else { scratchA.offset = readerA.getPosition(); - scratchA.length = a.length - scratchA.offset; scratchB.offset = readerB.getPosition(); + scratchA.length = a.length - scratchA.offset; scratchB.length = b.length - scratchB.offset; } - - cmp = scratchA.compareTo(scratchB); - if (cmp != 0) { - return cmp; - } - - return 0; + + return scratchA.compareTo(scratchB); } } diff --git a/lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggesterTest.java b/lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggesterTest.java index e6f94a55069..995f60daa31 100644 --- a/lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggesterTest.java +++ b/lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggesterTest.java @@ -28,8 +28,11 @@ import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; import java.util.Comparator; +import java.util.HashMap; import java.util.HashSet; import java.util.List; +import java.util.Map; +import java.util.Random; import java.util.Set; import java.util.TreeSet; @@ -47,12 +50,14 @@ import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; +import org.apache.lucene.document.Document; import org.apache.lucene.search.suggest.Lookup.LookupResult; import org.apache.lucene.search.suggest.TermFreq; import org.apache.lucene.search.suggest.TermFreqArrayIterator; import org.apache.lucene.search.suggest.TermFreqPayload; import org.apache.lucene.search.suggest.TermFreqPayloadArrayIterator; import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.LineFileDocs; import org.apache.lucene.util.LuceneTestCase; import org.apache.lucene.util._TestUtil; @@ -60,13 +65,16 @@ public class AnalyzingSuggesterTest extends LuceneTestCase { /** this is basically the WFST test ported to KeywordAnalyzer. so it acts the same */ public void testKeyword() throws Exception { - TermFreq keys[] = new TermFreq[] { + Iterable keys = shuffle( new TermFreq("foo", 50), new TermFreq("bar", 10), + new TermFreq("barbar", 10), new TermFreq("barbar", 12), - new TermFreq("barbara", 6) - }; - + new TermFreq("barbara", 6), + new TermFreq("bar", 5), + new TermFreq("barbara", 1) + ); + AnalyzingSuggester suggester = new AnalyzingSuggester(new MockAnalyzer(random(), MockTokenizer.KEYWORD, false)); suggester.build(new TermFreqArrayIterator(keys)); @@ -103,12 +111,13 @@ public class AnalyzingSuggesterTest extends LuceneTestCase { } public void testKeywordWithPayloads() throws Exception { - TermFreqPayload keys[] = new TermFreqPayload[] { + Iterable keys = shuffle( new TermFreqPayload("foo", 50, new BytesRef("hello")), new TermFreqPayload("bar", 10, new BytesRef("goodbye")), new TermFreqPayload("barbar", 12, new BytesRef("thank you")), - new TermFreqPayload("barbara", 6, new BytesRef("for all the fish")) - }; + new TermFreqPayload("bar", 9, new BytesRef("should be deduplicated")), + new TermFreqPayload("bar", 8, new BytesRef("should also be deduplicated")), + new TermFreqPayload("barbara", 6, new BytesRef("for all the fish"))); AnalyzingSuggester suggester = new AnalyzingSuggester(new MockAnalyzer(random(), MockTokenizer.KEYWORD, false)); suggester.build(new TermFreqPayloadArrayIterator(keys)); @@ -153,6 +162,50 @@ public class AnalyzingSuggesterTest extends LuceneTestCase { } } + public void testRandomRealisticKeys() throws IOException { + LineFileDocs lineFile = new LineFileDocs(random()); + Map mapping = new HashMap<>(); + List keys = new ArrayList<>(); + + int howMany = atLeast(100); // this might bring up duplicates + for (int i = 0; i < howMany; i++) { + Document nextDoc = lineFile.nextDoc(); + String title = nextDoc.getField("title").stringValue(); + int randomWeight = random().nextInt(100); + keys.add(new TermFreq(title, randomWeight)); + if (!mapping.containsKey(title) || mapping.get(title) < randomWeight) { + mapping.put(title, Long.valueOf(randomWeight)); + } + } + + AnalyzingSuggester analyzingSuggester = new AnalyzingSuggester(new MockAnalyzer(random())); + analyzingSuggester.setPreservePositionIncrements(random().nextBoolean()); + boolean doPayloads = random().nextBoolean(); + if (doPayloads) { + List keysAndPayloads = new ArrayList<>(); + for (TermFreq termFreq : keys) { + keysAndPayloads.add(new TermFreqPayload(termFreq.term, termFreq.v, new BytesRef(Long.toString(termFreq.v)))); + } + analyzingSuggester.build(new TermFreqPayloadArrayIterator(keysAndPayloads)); + } else { + analyzingSuggester.build(new TermFreqArrayIterator(keys)); + } + + for (TermFreq termFreq : keys) { + List lookup = analyzingSuggester.lookup(termFreq.term.utf8ToString(), false, keys.size()); + for (LookupResult lookupResult : lookup) { + assertEquals(mapping.get(lookupResult.key), Long.valueOf(lookupResult.value)); + if (doPayloads) { + assertEquals(lookupResult.payload.utf8ToString(), Long.toString(lookupResult.value)); + } else { + assertNull(lookupResult.payload); + } + } + } + + lineFile.close(); + } + // TODO: more tests /** * basic "standardanalyzer" test with stopword removal @@ -703,9 +756,9 @@ public class AnalyzingSuggesterTest extends LuceneTestCase { AnalyzingSuggester suggester = new AnalyzingSuggester(a, a, preserveSep ? AnalyzingSuggester.PRESERVE_SEP : 0, 256, -1); if (doPayloads) { - suggester.build(new TermFreqPayloadArrayIterator(payloadKeys)); + suggester.build(new TermFreqPayloadArrayIterator(shuffle(payloadKeys))); } else { - suggester.build(new TermFreqArrayIterator(keys)); + suggester.build(new TermFreqArrayIterator(shuffle(keys))); } for (String prefix : allPrefixes) { @@ -823,15 +876,8 @@ public class AnalyzingSuggesterTest extends LuceneTestCase { public void testMaxSurfaceFormsPerAnalyzedForm() throws Exception { Analyzer a = new MockAnalyzer(random()); AnalyzingSuggester suggester = new AnalyzingSuggester(a, a, 0, 2, -1); - - List keys = Arrays.asList(new TermFreq[] { - new TermFreq("a", 40), - new TermFreq("a ", 50), - new TermFreq(" a", 60), - }); - - Collections.shuffle(keys, random()); - suggester.build(new TermFreqArrayIterator(keys)); + suggester.build(new TermFreqArrayIterator(shuffle(new TermFreq("a", 40), + new TermFreq("a ", 50), new TermFreq(" a", 60)))); List results = suggester.lookup("a", false, 5); assertEquals(2, results.size()); @@ -926,10 +972,9 @@ public class AnalyzingSuggesterTest extends LuceneTestCase { AnalyzingSuggester suggester = new AnalyzingSuggester(a, a, 0, 256, -1); - suggester.build(new TermFreqArrayIterator(new TermFreq[] { + suggester.build(new TermFreqArrayIterator(shuffle( new TermFreq("hambone", 6), - new TermFreq("nellie", 5), - })); + new TermFreq("nellie", 5)))); List results = suggester.lookup("nellie", false, 2); assertEquals(2, results.size()); @@ -1147,4 +1192,14 @@ public class AnalyzingSuggesterTest extends LuceneTestCase { // expected } } + + @SafeVarargs + public final Iterable shuffle(T...values) { + final List asList = new ArrayList(values.length); + for (T value : values) { + asList.add(value); + } + Collections.shuffle(asList, random()); + return asList; + } }