diff --git a/lucene/contrib/CHANGES.txt b/lucene/contrib/CHANGES.txt
index 31a532fd621..24eed06507a 100644
--- a/lucene/contrib/CHANGES.txt
+++ b/lucene/contrib/CHANGES.txt
@@ -78,6 +78,14 @@ Changes in backwards compatibility policy
* LUCENE-2581: Added new methods to FragmentsBuilder interface. These methods
are used to set pre/post tags and Encoder. (Koji Sekiguchi)
+
+ * LUCENE-2391: Improved spellchecker (re)build time/RAM usage by omitting
+ frequencies/positions/norms for single-valued fields, modifying the default
+ RAM buffer size to match IndexWriterConfig (16MB), making index optimization
+ an optional boolean parameter, and modifying the incremental update logic
+ to work well with unoptimized spellcheck indexes. The indexDictionary() methods
+ were made final to ensure a hard backwards break in case you were subclassing
+ SpellChecker. In general, subclassing SpellChecker is not recommended. (Robert Muir)
Changes in runtime behavior
diff --git a/lucene/contrib/spellchecker/src/java/org/apache/lucene/search/spell/SpellChecker.java b/lucene/contrib/spellchecker/src/java/org/apache/lucene/search/spell/SpellChecker.java
index 485dd8a1ce5..a4ed8407f2f 100755
--- a/lucene/contrib/spellchecker/src/java/org/apache/lucene/search/spell/SpellChecker.java
+++ b/lucene/contrib/spellchecker/src/java/org/apache/lucene/search/spell/SpellChecker.java
@@ -18,8 +18,10 @@ package org.apache.lucene.search.spell;
*/
import java.io.IOException;
+import java.util.ArrayList;
import java.util.Comparator;
import java.util.Iterator;
+import java.util.List;
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.document.Document;
@@ -30,6 +32,8 @@ import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.LogMergePolicy;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
+import org.apache.lucene.index.Terms;
+import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.IndexSearcher;
@@ -38,7 +42,10 @@ import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.store.AlreadyClosedException;
import org.apache.lucene.store.Directory;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.ReaderUtil;
import org.apache.lucene.util.Version;
+import org.apache.lucene.util.VirtualMethod;
/**
*
@@ -492,35 +499,64 @@ public class SpellChecker implements java.io.Closeable {
* @param dict Dictionary to index
* @param mergeFactor mergeFactor to use when indexing
* @param ramMB the max amount or memory in MB to use
+ * @param optimize whether or not the spellcheck index should be optimized
* @throws AlreadyClosedException if the Spellchecker is already closed
* @throws IOException
*/
- public void indexDictionary(Dictionary dict, int mergeFactor, int ramMB) throws IOException {
+ public final void indexDictionary(Dictionary dict, int mergeFactor, int ramMB, boolean optimize) throws IOException {
synchronized (modifyCurrentIndexLock) {
ensureOpen();
final Directory dir = this.spellIndex;
final IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(Version.LUCENE_CURRENT, new WhitespaceAnalyzer(Version.LUCENE_CURRENT)).setRAMBufferSizeMB(ramMB));
((LogMergePolicy) writer.getConfig().getMergePolicy()).setMergeFactor(mergeFactor);
+ IndexSearcher indexSearcher = obtainSearcher();
+ final List termsEnums = new ArrayList();
+
+ if (searcher.maxDoc() > 0) {
+ new ReaderUtil.Gather(searcher.getIndexReader()) {
+ @Override
+ protected void add(int base, IndexReader r) throws IOException {
+ Terms terms = r.terms(F_WORD); // may be null, hence the guard below
+ if (terms != null)
+ termsEnums.add(terms.iterator()); // keep one TermsEnum per reader that has F_WORD terms
+ }
+ }.run();
+ }
+
+ boolean isEmpty = termsEnums.isEmpty();
+
+ try {
+ Iterator iter = dict.getWordsIterator();
+ BytesRef currentTerm = new BytesRef();
+
+ terms: while (iter.hasNext()) {
+ String word = iter.next();
- Iterator iter = dict.getWordsIterator();
- while (iter.hasNext()) {
- String word = iter.next();
+ int len = word.length();
+ if (len < 3) {
+ continue; // too short we bail but "too long" is fine...
+ }
- int len = word.length();
- if (len < 3) {
- continue; // too short we bail but "too long" is fine...
+ if (!isEmpty) {
+ // we have a non-empty index, check if the term exists
+ currentTerm.copy(word);
+ for (TermsEnum te : termsEnums) {
+ if (te.seek(currentTerm, false) == TermsEnum.SeekStatus.FOUND) {
+ continue terms;
+ }
+ }
+ }
+
+ // ok index the word
+ Document doc = createDocument(word, getMin(len), getMax(len));
+ writer.addDocument(doc);
}
-
- if (this.exist(word)) { // if the word already exist in the gramindex
- continue;
- }
-
- // ok index the word
- Document doc = createDocument(word, getMin(len), getMax(len));
- writer.addDocument(doc);
+ } finally {
+ releaseSearcher(indexSearcher);
}
// close writer
- writer.optimize();
+ if (optimize)
+ writer.optimize();
writer.close();
// also re-open the spell index to see our own changes when the next suggestion
// is fetched:
@@ -531,10 +567,21 @@ public class SpellChecker implements java.io.Closeable {
/**
* Indexes the data from the given {@link Dictionary}.
* @param dict the dictionary to index
+ * @param mergeFactor mergeFactor to use when indexing
+ * @param ramMB the max amount of memory in MB to use
* @throws IOException
*/
- public void indexDictionary(Dictionary dict) throws IOException {
- indexDictionary(dict, 300, 10);
+ public final void indexDictionary(Dictionary dict, int mergeFactor, int ramMB) throws IOException {
+ indexDictionary(dict, mergeFactor, ramMB, true); // optimize=true: this overload always optimizes the index
+ }
+
+ /**
+ * Indexes the data from the given {@link Dictionary}, using a mergeFactor of 300 and IndexWriterConfig's default RAM buffer size.
+ * @param dict the dictionary to index
+ * @throws IOException
+ */
+ public final void indexDictionary(Dictionary dict) throws IOException {
+ indexDictionary(dict, 300, (int)IndexWriterConfig.DEFAULT_RAM_BUFFER_SIZE_MB); // default buffer size is cast to int to fit the ramMB parameter
}
private static int getMin(int l) {
@@ -559,7 +606,12 @@ public class SpellChecker implements java.io.Closeable {
private static Document createDocument(String text, int ng1, int ng2) {
Document doc = new Document();
- doc.add(new Field(F_WORD, text, Field.Store.YES, Field.Index.NOT_ANALYZED)); // orig term
+ // The word field is never queried on; it is indexed only so indexDictionary can quickly check
+ // whether a word already exists (and stored for retrieval), so it needs no norms or TF/positions.
+ Field f = new Field(F_WORD, text, Field.Store.YES, Field.Index.NOT_ANALYZED);
+ f.setOmitTermFreqAndPositions(true);
+ f.setOmitNorms(true);
+ doc.add(f); // orig term
addGram(text, doc, ng1, ng2);
return doc;
}
@@ -573,12 +625,20 @@ public class SpellChecker implements java.io.Closeable {
String gram = text.substring(i, i + ng);
doc.add(new Field(key, gram, Field.Store.NO, Field.Index.NOT_ANALYZED));
if (i == 0) {
- doc.add(new Field("start" + ng, gram, Field.Store.NO, Field.Index.NOT_ANALYZED));
+ // only one term possible in the startXXField, TF/pos and norms aren't needed.
+ Field startField = new Field("start" + ng, gram, Field.Store.NO, Field.Index.NOT_ANALYZED);
+ startField.setOmitTermFreqAndPositions(true);
+ startField.setOmitNorms(true);
+ doc.add(startField);
}
end = gram;
}
if (end != null) { // may not be present if len==ng1
- doc.add(new Field("end" + ng, end, Field.Store.NO, Field.Index.NOT_ANALYZED));
+ // only one term possible in the endXXField, TF/pos and norms aren't needed.
+ Field endField = new Field("end" + ng, end, Field.Store.NO, Field.Index.NOT_ANALYZED);
+ endField.setOmitTermFreqAndPositions(true);
+ endField.setOmitNorms(true);
+ doc.add(endField);
}
}
}