mirror of https://github.com/apache/lucene.git
LUCENE-2391: improve Spellchecker indexing speed
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1055285 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent 4061877e43
commit c26fd7564a
@@ -78,6 +78,14 @@ Changes in backwards compatibility policy
 
 * LUCENE-2581: Added new methods to FragmentsBuilder interface. These methods
   are used to set pre/post tags and Encoder. (Koji Sekiguchi)
 
+* LUCENE-2391: Improved spellchecker (re)build time/ram usage by omitting
+  frequencies/positions/norms for single-valued fields, modifying the default
+  ramBufferMBSize to match IndexWriterConfig (16MB), making index optimization
+  an optional boolean parameter, and modifying the incremental update logic
+  to work well with unoptimized spellcheck indexes. The indexDictionary() methods
+  were made final to ensure a hard backwards break in case you were subclassing
+  Spellchecker. In general, subclassing Spellchecker is not recommended. (Robert Muir)
+
 Changes in runtime behavior
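For reference, a minimal usage sketch of the new four-argument signature introduced in this commit (the paths, the "title" field, and the wrapper class are placeholders, not part of the commit; it assumes the contrib LuceneDictionary and a Directory-based SpellChecker of this era):

import java.io.File;
import java.io.IOException;

import org.apache.lucene.index.IndexReader;
import org.apache.lucene.search.spell.LuceneDictionary;
import org.apache.lucene.search.spell.SpellChecker;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

public class RebuildSpellIndex {
  public static void main(String[] args) throws IOException {
    Directory spellDir = FSDirectory.open(new File("/path/to/spellindex"));          // placeholder path
    IndexReader reader = IndexReader.open(FSDirectory.open(new File("/path/to/mainindex")));
    SpellChecker spell = new SpellChecker(spellDir);
    try {
      // mergeFactor=300, ramMB=16 (the new IndexWriterConfig-matching default);
      // optimize=false skips the final optimize() call, which the reworked
      // incremental-update logic no longer requires.
      spell.indexDictionary(new LuceneDictionary(reader, "title"), 300, 16, false);
    } finally {
      spell.close();
      reader.close();
    }
  }
}

Passing false for the new optimize flag is what makes repeated incremental rebuilds cheaper; passing true preserves the old behavior.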
@@ -18,8 +18,10 @@ package org.apache.lucene.search.spell;
  */
 
 import java.io.IOException;
+import java.util.ArrayList;
 import java.util.Comparator;
 import java.util.Iterator;
+import java.util.List;
 
 import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
 import org.apache.lucene.document.Document;
@@ -30,6 +32,8 @@ import org.apache.lucene.index.IndexWriterConfig;
 import org.apache.lucene.index.LogMergePolicy;
 import org.apache.lucene.index.Term;
 import org.apache.lucene.index.IndexWriterConfig.OpenMode;
+import org.apache.lucene.index.Terms;
+import org.apache.lucene.index.TermsEnum;
 import org.apache.lucene.search.BooleanClause;
 import org.apache.lucene.search.BooleanQuery;
 import org.apache.lucene.search.IndexSearcher;
@@ -38,7 +42,10 @@ import org.apache.lucene.search.ScoreDoc;
 import org.apache.lucene.search.TermQuery;
 import org.apache.lucene.store.AlreadyClosedException;
 import org.apache.lucene.store.Directory;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.ReaderUtil;
 import org.apache.lucene.util.Version;
+import org.apache.lucene.util.VirtualMethod;
 
 /**
  * <p>
@@ -492,35 +499,64 @@ public class SpellChecker implements java.io.Closeable {
    * @param dict Dictionary to index
    * @param mergeFactor mergeFactor to use when indexing
    * @param ramMB the max amount or memory in MB to use
+   * @param optimize whether or not the spellcheck index should be optimized
    * @throws AlreadyClosedException if the Spellchecker is already closed
    * @throws IOException
    */
-  public void indexDictionary(Dictionary dict, int mergeFactor, int ramMB) throws IOException {
+  public final void indexDictionary(Dictionary dict, int mergeFactor, int ramMB, boolean optimize) throws IOException {
     synchronized (modifyCurrentIndexLock) {
       ensureOpen();
       final Directory dir = this.spellIndex;
       final IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(Version.LUCENE_CURRENT, new WhitespaceAnalyzer(Version.LUCENE_CURRENT)).setRAMBufferSizeMB(ramMB));
       ((LogMergePolicy) writer.getConfig().getMergePolicy()).setMergeFactor(mergeFactor);
+      IndexSearcher indexSearcher = obtainSearcher();
+      final List<TermsEnum> termsEnums = new ArrayList<TermsEnum>();
 
-      Iterator<String> iter = dict.getWordsIterator();
-      while (iter.hasNext()) {
-        String word = iter.next();
+      if (searcher.maxDoc() > 0) {
+        new ReaderUtil.Gather(searcher.getIndexReader()) {
+          @Override
+          protected void add(int base, IndexReader r) throws IOException {
+            Terms terms = r.terms(F_WORD);
+            if (terms != null)
+              termsEnums.add(terms.iterator());
+          }
+        }.run();
+      }
 
-        int len = word.length();
-        if (len < 3) {
-          continue; // too short we bail but "too long" is fine...
-        }
+      boolean isEmpty = termsEnums.isEmpty();
 
-        if (this.exist(word)) { // if the word already exist in the gramindex
-          continue;
-        }
+      try {
+        Iterator<String> iter = dict.getWordsIterator();
+        BytesRef currentTerm = new BytesRef();
 
-        // ok index the word
-        Document doc = createDocument(word, getMin(len), getMax(len));
-        writer.addDocument(doc);
+        terms: while (iter.hasNext()) {
+          String word = iter.next();
+
+          int len = word.length();
+          if (len < 3) {
+            continue; // too short we bail but "too long" is fine...
+          }
+
+          if (!isEmpty) {
+            // we have a non-empty index, check if the term exists
+            currentTerm.copy(word);
+            for (TermsEnum te : termsEnums) {
+              if (te.seek(currentTerm, false) == TermsEnum.SeekStatus.FOUND) {
+                continue terms;
+              }
+            }
+          }
+
+          // ok index the word
+          Document doc = createDocument(word, getMin(len), getMax(len));
+          writer.addDocument(doc);
+        }
+      } finally {
+        releaseSearcher(indexSearcher);
       }
       // close writer
-      writer.optimize();
+      if (optimize)
+        writer.optimize();
       writer.close();
       // also re-open the spell index to see our own changes when the next suggestion
       // is fetched:
@@ -531,10 +567,21 @@ public class SpellChecker implements java.io.Closeable {
   /**
    * Indexes the data from the given {@link Dictionary}.
    * @param dict the dictionary to index
+   * @param mergeFactor mergeFactor to use when indexing
+   * @param ramMB the max amount or memory in MB to use
    * @throws IOException
    */
-  public void indexDictionary(Dictionary dict) throws IOException {
-    indexDictionary(dict, 300, 10);
+  public final void indexDictionary(Dictionary dict, int mergeFactor, int ramMB) throws IOException {
+    indexDictionary(dict, mergeFactor, ramMB, true);
+  }
+
+  /**
+   * Indexes the data from the given {@link Dictionary}.
+   * @param dict the dictionary to index
+   * @throws IOException
+   */
+  public final void indexDictionary(Dictionary dict) throws IOException {
+    indexDictionary(dict, 300, (int)IndexWriterConfig.DEFAULT_RAM_BUFFER_SIZE_MB);
   }
 
   private static int getMin(int l) {
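In effect, the convenience overloads now chain as sketched below (a reading aid, not code from the commit; 16 is the value of IndexWriterConfig.DEFAULT_RAM_BUFFER_SIZE_MB cited in the CHANGES entry):

// spell.indexDictionary(dict)
//   -> indexDictionary(dict, 300, 16)        // 16 = (int) IndexWriterConfig.DEFAULT_RAM_BUFFER_SIZE_MB
//   -> indexDictionary(dict, 300, 16, true)  // optimize defaults to true, preserving the old behavior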
@@ -559,7 +606,12 @@ public class SpellChecker implements java.io.Closeable {
 
   private static Document createDocument(String text, int ng1, int ng2) {
     Document doc = new Document();
-    doc.add(new Field(F_WORD, text, Field.Store.YES, Field.Index.NOT_ANALYZED)); // orig term
+    // the word field is never queried on... its indexed so it can be quickly
+    // checked for rebuild (and stored for retrieval). Doesn't need norms or TF/pos
+    Field f = new Field(F_WORD, text, Field.Store.YES, Field.Index.NOT_ANALYZED);
+    f.setOmitTermFreqAndPositions(true);
+    f.setOmitNorms(true);
+    doc.add(f); // orig term
     addGram(text, doc, ng1, ng2);
     return doc;
   }
@@ -573,12 +625,20 @@ public class SpellChecker implements java.io.Closeable {
       String gram = text.substring(i, i + ng);
       doc.add(new Field(key, gram, Field.Store.NO, Field.Index.NOT_ANALYZED));
       if (i == 0) {
-        doc.add(new Field("start" + ng, gram, Field.Store.NO, Field.Index.NOT_ANALYZED));
+        // only one term possible in the startXXField, TF/pos and norms aren't needed.
+        Field startField = new Field("start" + ng, gram, Field.Store.NO, Field.Index.NOT_ANALYZED);
+        startField.setOmitTermFreqAndPositions(true);
+        startField.setOmitNorms(true);
+        doc.add(startField);
       }
       end = gram;
     }
     if (end != null) { // may not be present if len==ng1
-      doc.add(new Field("end" + ng, end, Field.Store.NO, Field.Index.NOT_ANALYZED));
+      // only one term possible in the endXXField, TF/pos and norms aren't needed.
+      Field endField = new Field("end" + ng, end, Field.Store.NO, Field.Index.NOT_ANALYZED);
+      endField.setOmitTermFreqAndPositions(true);
+      endField.setOmitNorms(true);
+      doc.add(endField);
     }
   }
 }