LUCENE-4534: dedup same surface form in Analyzing/FuzzySuggester

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1405977 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Michael McCandless 2012-11-05 21:54:24 +00:00
parent 2537836d4a
commit 37cdda05f5
2 changed files with 31 additions and 0 deletions

View File

@ -390,6 +390,7 @@ public class AnalyzingSuggester extends Lookup {
try {
ByteArrayDataOutput output = new ByteArrayDataOutput(buffer);
BytesRef surfaceForm;
while ((surfaceForm = iterator.next()) != null) {
Set<IntsRef> paths = toFiniteStrings(surfaceForm, ts2a);
@ -430,6 +431,10 @@ public class AnalyzingSuggester extends Lookup {
// Sort all input/output pairs (required by FST.Builder):
new Sort(sortComparator).sort(tempInput, tempSorted);
// Free disk space:
tempInput.delete();
reader = new Sort.ByteSequencesReader(tempSorted);
PairOutputs<Long,BytesRef> outputs = new PairOutputs<Long,BytesRef>(PositiveIntOutputs.getSingleton(true), ByteSequenceOutputs.getSingleton());
@ -442,6 +447,12 @@ public class AnalyzingSuggester extends Lookup {
IntsRef scratchInts = new IntsRef();
ByteArrayDataInput input = new ByteArrayDataInput();
// Used to remove duplicate surface forms (but we
// still index the highest-weight one). We clear
// this when we see a new analyzed form, so it cannot
// grow unbounded (at most 256 entries):
Set<BytesRef> seenSurfaceForms = new HashSet<BytesRef>();
int dedup = 0;
while (reader.read(scratch)) {
input.reset(scratch.bytes, scratch.offset, scratch.length);
@ -459,6 +470,7 @@ public class AnalyzingSuggester extends Lookup {
if (previousAnalyzed == null) {
previousAnalyzed = new BytesRef();
previousAnalyzed.copyBytes(analyzed);
seenSurfaceForms.add(BytesRef.deepCopyOf(surface));
} else if (analyzed.equals(previousAnalyzed)) {
dedup++;
if (dedup >= maxSurfaceFormsPerAnalyzedForm) {
@ -466,9 +478,15 @@ public class AnalyzingSuggester extends Lookup {
// dups: skip the rest:
continue;
}
if (seenSurfaceForms.contains(surface)) {
continue;
}
seenSurfaceForms.add(BytesRef.deepCopyOf(surface));
} else {
dedup = 0;
previousAnalyzed.copyBytes(analyzed);
seenSurfaceForms.clear();
seenSurfaceForms.add(BytesRef.deepCopyOf(surface));
}
// TODO: I think we can avoid the extra 2 bytes when

View File

@ -1031,4 +1031,17 @@ public class AnalyzingSuggesterTest extends LuceneTestCase {
new TermFreq("a b", 50),
}));
}
public void testDupSurfaceFormsMissingResults3() throws Exception {
Analyzer a = new MockAnalyzer(random());
AnalyzingSuggester suggester = new AnalyzingSuggester(a, a, AnalyzingSuggester.PRESERVE_SEP, 256, -1);
suggester.build(new TermFreqArrayIterator(new TermFreq[] {
new TermFreq("a a", 7),
new TermFreq("a a", 7),
new TermFreq("a c", 6),
new TermFreq("a c", 3),
new TermFreq("a b", 5),
}));
assertEquals("[a a/7, a c/6, a b/5]", suggester.lookup("a", false, 3).toString());
}
}