LUCENE-2391: improve Spellchecker indexing speed

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1055285 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Robert Muir 2011-01-05 03:16:56 +00:00
parent 4061877e43
commit c26fd7564a
2 changed files with 89 additions and 21 deletions

View File

@ -79,6 +79,14 @@ Changes in backwards compatibility policy
* LUCENE-2581: Added new methods to FragmentsBuilder interface. These methods
are used to set pre/post tags and Encoder. (Koji Sekiguchi)
* LUCENE-2391: Improved spellchecker (re)build time/RAM usage by omitting
frequencies/positions/norms for single-valued fields, modifying the default
ramBufferMBSize to match IndexWriterConfig (16MB), making index optimization
an optional boolean parameter, and modifying the incremental update logic
to work well with unoptimized spellcheck indexes. The indexDictionary() methods
were made final to ensure a hard backwards break in case you were subclassing
SpellChecker. In general, subclassing SpellChecker is not recommended. (Robert Muir)
Changes in runtime behavior
* LUCENE-2117: SnowballAnalyzer uses TurkishLowerCaseFilter instead of

View File

@ -18,8 +18,10 @@ package org.apache.lucene.search.spell;
*/
import java.io.IOException;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.Iterator;
import java.util.List;
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.document.Document;
@ -30,6 +32,8 @@ import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.LogMergePolicy;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.IndexSearcher;
@ -38,7 +42,10 @@ import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.store.AlreadyClosedException;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.ReaderUtil;
import org.apache.lucene.util.Version;
import org.apache.lucene.util.VirtualMethod;
/**
* <p>
@ -492,18 +499,37 @@ public class SpellChecker implements java.io.Closeable {
* @param dict Dictionary to index
* @param mergeFactor mergeFactor to use when indexing
* @param ramMB the max amount of memory in MB to use
* @param optimize whether or not the spellcheck index should be optimized
* @throws AlreadyClosedException if the Spellchecker is already closed
* @throws IOException
*/
public void indexDictionary(Dictionary dict, int mergeFactor, int ramMB) throws IOException {
public final void indexDictionary(Dictionary dict, int mergeFactor, int ramMB, boolean optimize) throws IOException {
synchronized (modifyCurrentIndexLock) {
ensureOpen();
final Directory dir = this.spellIndex;
final IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(Version.LUCENE_CURRENT, new WhitespaceAnalyzer(Version.LUCENE_CURRENT)).setRAMBufferSizeMB(ramMB));
((LogMergePolicy) writer.getConfig().getMergePolicy()).setMergeFactor(mergeFactor);
IndexSearcher indexSearcher = obtainSearcher();
final List<TermsEnum> termsEnums = new ArrayList<TermsEnum>();
if (searcher.maxDoc() > 0) {
new ReaderUtil.Gather(searcher.getIndexReader()) {
@Override
protected void add(int base, IndexReader r) throws IOException {
Terms terms = r.terms(F_WORD);
if (terms != null)
termsEnums.add(terms.iterator());
}
}.run();
}
boolean isEmpty = termsEnums.isEmpty();
try {
Iterator<String> iter = dict.getWordsIterator();
while (iter.hasNext()) {
BytesRef currentTerm = new BytesRef();
terms: while (iter.hasNext()) {
String word = iter.next();
int len = word.length();
@ -511,15 +537,25 @@ public class SpellChecker implements java.io.Closeable {
continue; // too short we bail but "too long" is fine...
}
if (this.exist(word)) { // if the word already exist in the gramindex
continue;
if (!isEmpty) {
// we have a non-empty index, check if the term exists
currentTerm.copy(word);
for (TermsEnum te : termsEnums) {
if (te.seek(currentTerm, false) == TermsEnum.SeekStatus.FOUND) {
continue terms;
}
}
}
// ok index the word
Document doc = createDocument(word, getMin(len), getMax(len));
writer.addDocument(doc);
}
} finally {
releaseSearcher(indexSearcher);
}
// close writer
if (optimize)
writer.optimize();
writer.close();
// also re-open the spell index to see our own changes when the next suggestion
@ -531,10 +567,21 @@ public class SpellChecker implements java.io.Closeable {
/**
* Indexes the data from the given {@link Dictionary}.
* @param dict the dictionary to index
* @param mergeFactor mergeFactor to use when indexing
* @param ramMB the max amount of memory in MB to use
* @throws IOException
*/
public void indexDictionary(Dictionary dict) throws IOException {
indexDictionary(dict, 300, 10);
public final void indexDictionary(Dictionary dict, int mergeFactor, int ramMB) throws IOException {
indexDictionary(dict, mergeFactor, ramMB, true);
}
/**
* Indexes every word supplied by the given {@link Dictionary} using default settings.
* <p>
* Delegates to {@link #indexDictionary(Dictionary, int, int)} with a merge factor of
* 300 and a RAM buffer of {@link IndexWriterConfig#DEFAULT_RAM_BUFFER_SIZE_MB}
* (truncated to a whole number of megabytes).
* @param dict the dictionary whose words should be added to the spell index
* @throws IOException if the delegated indexing call fails
*/
public final void indexDictionary(Dictionary dict) throws IOException {
// defaults used when the caller does not tune indexing
final int defaultMergeFactor = 300;
final int defaultRamMB = (int) IndexWriterConfig.DEFAULT_RAM_BUFFER_SIZE_MB;
indexDictionary(dict, defaultMergeFactor, defaultRamMB);
}
private static int getMin(int l) {
@ -559,7 +606,12 @@ public class SpellChecker implements java.io.Closeable {
private static Document createDocument(String text, int ng1, int ng2) {
Document doc = new Document();
doc.add(new Field(F_WORD, text, Field.Store.YES, Field.Index.NOT_ANALYZED)); // orig term
// the word field is never queried on... it's indexed so it can be quickly
// checked for rebuild (and stored for retrieval). Doesn't need norms or TF/pos
Field f = new Field(F_WORD, text, Field.Store.YES, Field.Index.NOT_ANALYZED);
f.setOmitTermFreqAndPositions(true);
f.setOmitNorms(true);
doc.add(f); // orig term
addGram(text, doc, ng1, ng2);
return doc;
}
@ -573,12 +625,20 @@ public class SpellChecker implements java.io.Closeable {
String gram = text.substring(i, i + ng);
doc.add(new Field(key, gram, Field.Store.NO, Field.Index.NOT_ANALYZED));
if (i == 0) {
doc.add(new Field("start" + ng, gram, Field.Store.NO, Field.Index.NOT_ANALYZED));
// only one term possible in the startXXField, TF/pos and norms aren't needed.
Field startField = new Field("start" + ng, gram, Field.Store.NO, Field.Index.NOT_ANALYZED);
startField.setOmitTermFreqAndPositions(true);
startField.setOmitNorms(true);
doc.add(startField);
}
end = gram;
}
if (end != null) { // may not be present if len==ng1
doc.add(new Field("end" + ng, end, Field.Store.NO, Field.Index.NOT_ANALYZED));
// only one term possible in the endXXField, TF/pos and norms aren't needed.
Field endField = new Field("end" + ng, end, Field.Store.NO, Field.Index.NOT_ANALYZED);
endField.setOmitTermFreqAndPositions(true);
endField.setOmitNorms(true);
doc.add(endField);
}
}
}