mirror of
https://github.com/apache/lucene.git
synced 2025-03-03 06:49:38 +00:00
LUCENE-4534: dedup same surface form in Analyzing/FuzzySuggester
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1405977 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
2537836d4a
commit
37cdda05f5
@ -390,6 +390,7 @@ public class AnalyzingSuggester extends Lookup {
|
||||
try {
|
||||
ByteArrayDataOutput output = new ByteArrayDataOutput(buffer);
|
||||
BytesRef surfaceForm;
|
||||
|
||||
while ((surfaceForm = iterator.next()) != null) {
|
||||
Set<IntsRef> paths = toFiniteStrings(surfaceForm, ts2a);
|
||||
|
||||
@ -430,6 +431,10 @@ public class AnalyzingSuggester extends Lookup {
|
||||
|
||||
// Sort all input/output pairs (required by FST.Builder):
|
||||
new Sort(sortComparator).sort(tempInput, tempSorted);
|
||||
|
||||
// Free disk space:
|
||||
tempInput.delete();
|
||||
|
||||
reader = new Sort.ByteSequencesReader(tempSorted);
|
||||
|
||||
PairOutputs<Long,BytesRef> outputs = new PairOutputs<Long,BytesRef>(PositiveIntOutputs.getSingleton(true), ByteSequenceOutputs.getSingleton());
|
||||
@ -442,6 +447,12 @@ public class AnalyzingSuggester extends Lookup {
|
||||
IntsRef scratchInts = new IntsRef();
|
||||
ByteArrayDataInput input = new ByteArrayDataInput();
|
||||
|
||||
// Used to remove duplicate surface forms (but we
|
||||
// still index the highest-weight one). We clear
|
||||
// this when we see a new analyzed form, so it cannot
|
||||
// grow unbounded (at most 256 entries):
|
||||
Set<BytesRef> seenSurfaceForms = new HashSet<BytesRef>();
|
||||
|
||||
int dedup = 0;
|
||||
while (reader.read(scratch)) {
|
||||
input.reset(scratch.bytes, scratch.offset, scratch.length);
|
||||
@ -459,6 +470,7 @@ public class AnalyzingSuggester extends Lookup {
|
||||
if (previousAnalyzed == null) {
|
||||
previousAnalyzed = new BytesRef();
|
||||
previousAnalyzed.copyBytes(analyzed);
|
||||
seenSurfaceForms.add(BytesRef.deepCopyOf(surface));
|
||||
} else if (analyzed.equals(previousAnalyzed)) {
|
||||
dedup++;
|
||||
if (dedup >= maxSurfaceFormsPerAnalyzedForm) {
|
||||
@ -466,9 +478,15 @@ public class AnalyzingSuggester extends Lookup {
|
||||
// dups: skip the rest:
|
||||
continue;
|
||||
}
|
||||
if (seenSurfaceForms.contains(surface)) {
|
||||
continue;
|
||||
}
|
||||
seenSurfaceForms.add(BytesRef.deepCopyOf(surface));
|
||||
} else {
|
||||
dedup = 0;
|
||||
previousAnalyzed.copyBytes(analyzed);
|
||||
seenSurfaceForms.clear();
|
||||
seenSurfaceForms.add(BytesRef.deepCopyOf(surface));
|
||||
}
|
||||
|
||||
// TODO: I think we can avoid the extra 2 bytes when
|
||||
|
@ -1031,4 +1031,17 @@ public class AnalyzingSuggesterTest extends LuceneTestCase {
|
||||
new TermFreq("a b", 50),
|
||||
}));
|
||||
}
|
||||
|
||||
public void testDupSurfaceFormsMissingResults3() throws Exception {
|
||||
Analyzer a = new MockAnalyzer(random());
|
||||
AnalyzingSuggester suggester = new AnalyzingSuggester(a, a, AnalyzingSuggester.PRESERVE_SEP, 256, -1);
|
||||
suggester.build(new TermFreqArrayIterator(new TermFreq[] {
|
||||
new TermFreq("a a", 7),
|
||||
new TermFreq("a a", 7),
|
||||
new TermFreq("a c", 6),
|
||||
new TermFreq("a c", 3),
|
||||
new TermFreq("a b", 5),
|
||||
}));
|
||||
assertEquals("[a a/7, a c/6, a b/5]", suggester.lookup("a", false, 3).toString());
|
||||
}
|
||||
}
|
||||
|
Loading…
x
Reference in New Issue
Block a user