LUCENE-4534: dedup same surface form in Analyzing/FuzzySuggester

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1405977 13f79535-47bb-0310-9956-ffa450edef68
2025-03-03 06:49:38 +00:00 · 2012-11-05 21:54:24 +00:00 · 2012-11-05 21:54:24 +00:00 · 37cdda05f5
commit 37cdda05f5
parent 2537836d4a
2 changed files with 31 additions and 0 deletions
--- a/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggester.java
+++ b/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggester.java
@ -390,6 +390,7 @@ public class AnalyzingSuggester extends Lookup {
    try {
      ByteArrayDataOutput output = new ByteArrayDataOutput(buffer);
      BytesRef surfaceForm;
+
      while ((surfaceForm = iterator.next()) != null) {
        Set<IntsRef> paths = toFiniteStrings(surfaceForm, ts2a);
        
@ -430,6 +431,10 @@ public class AnalyzingSuggester extends Lookup {

      // Sort all input/output pairs (required by FST.Builder):
      new Sort(sortComparator).sort(tempInput, tempSorted);
+
+      // Free disk space:
+      tempInput.delete();
+
      reader = new Sort.ByteSequencesReader(tempSorted);
     
      PairOutputs<Long,BytesRef> outputs = new PairOutputs<Long,BytesRef>(PositiveIntOutputs.getSingleton(true), ByteSequenceOutputs.getSingleton());
@ -442,6 +447,12 @@ public class AnalyzingSuggester extends Lookup {
      IntsRef scratchInts = new IntsRef();
      ByteArrayDataInput input = new ByteArrayDataInput();

+      // Used to remove duplicate surface forms (but we
+      // still index the hightest-weight one).  We clear
+      // this when we see a new analyzed form, so it cannot
+      // grow unbounded (at most 256 entries):
+      Set<BytesRef> seenSurfaceForms = new HashSet<BytesRef>();
+
      int dedup = 0;
      while (reader.read(scratch)) {
        input.reset(scratch.bytes, scratch.offset, scratch.length);
@ -459,6 +470,7 @@ public class AnalyzingSuggester extends Lookup {
        if (previousAnalyzed == null) {
          previousAnalyzed = new BytesRef();
          previousAnalyzed.copyBytes(analyzed);
+          seenSurfaceForms.add(BytesRef.deepCopyOf(surface));
        } else if (analyzed.equals(previousAnalyzed)) {
          dedup++;
          if (dedup >= maxSurfaceFormsPerAnalyzedForm) {
@ -466,9 +478,15 @@ public class AnalyzingSuggester extends Lookup {
            // dups: skip the rest:
            continue;
          }
+          if (seenSurfaceForms.contains(surface)) {
+            continue;
+          }
+          seenSurfaceForms.add(BytesRef.deepCopyOf(surface));
        } else {
          dedup = 0;
          previousAnalyzed.copyBytes(analyzed);
+          seenSurfaceForms.clear();
+          seenSurfaceForms.add(BytesRef.deepCopyOf(surface));
        }

        // TODO: I think we can avoid the extra 2 bytes when
--- a/lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggesterTest.java
+++ b/lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggesterTest.java
@ -1031,4 +1031,17 @@ public class AnalyzingSuggesterTest extends LuceneTestCase {
          new TermFreq("a b", 50),
        }));
  }
+
+  public void testDupSurfaceFormsMissingResults3() throws Exception {
+    Analyzer a = new MockAnalyzer(random());
+    AnalyzingSuggester suggester = new AnalyzingSuggester(a, a, AnalyzingSuggester.PRESERVE_SEP, 256, -1);
+    suggester.build(new TermFreqArrayIterator(new TermFreq[] {
+          new TermFreq("a a", 7),
+          new TermFreq("a a", 7),
+          new TermFreq("a c", 6),
+          new TermFreq("a c", 3),
+          new TermFreq("a b", 5),
+        }));
+    assertEquals("[a a/7, a c/6, a b/5]", suggester.lookup("a", false, 3).toString());
+  }
 }