mirror of https://github.com/apache/lucene.git
LUCENE-2391: improve Spellchecker indexing speed
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1055285 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
4061877e43
commit
c26fd7564a
|
@ -79,6 +79,14 @@ Changes in backwards compatibility policy
|
||||||
* LUCENE-2581: Added new methods to FragmentsBuilder interface. These methods
|
* LUCENE-2581: Added new methods to FragmentsBuilder interface. These methods
|
||||||
are used to set pre/post tags and Encoder. (Koji Sekiguchi)
|
are used to set pre/post tags and Encoder. (Koji Sekiguchi)
|
||||||
|
|
||||||
|
* LUCENE-2391: Improved spellchecker (re)build time/ram usage by omitting
|
||||||
|
frequencies/positions/norms for single-valued fields, modifying the default
|
||||||
|
ramBufferMBSize to match IndexWriterConfig (16MB), making index optimization
|
||||||
|
an optional boolean parameter, and modifying the incremental update logic
|
||||||
|
to work well with unoptimized spellcheck indexes. The indexDictionary() methods
|
||||||
|
were made final to ensure a hard backwards break in case you were subclassing
|
||||||
|
SpellChecker. In general, subclassing SpellChecker is not recommended. (Robert Muir)
|
||||||
|
|
||||||
Changes in runtime behavior
|
Changes in runtime behavior
|
||||||
|
|
||||||
* LUCENE-2117: SnowballAnalyzer uses TurkishLowerCaseFilter instead of
|
* LUCENE-2117: SnowballAnalyzer uses TurkishLowerCaseFilter instead of
|
||||||
|
|
|
@ -18,8 +18,10 @@ package org.apache.lucene.search.spell;
|
||||||
*/
|
*/
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
|
import java.util.ArrayList;
|
||||||
import java.util.Comparator;
|
import java.util.Comparator;
|
||||||
import java.util.Iterator;
|
import java.util.Iterator;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
|
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
|
||||||
import org.apache.lucene.document.Document;
|
import org.apache.lucene.document.Document;
|
||||||
|
@ -30,6 +32,8 @@ import org.apache.lucene.index.IndexWriterConfig;
|
||||||
import org.apache.lucene.index.LogMergePolicy;
|
import org.apache.lucene.index.LogMergePolicy;
|
||||||
import org.apache.lucene.index.Term;
|
import org.apache.lucene.index.Term;
|
||||||
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
|
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
|
||||||
|
import org.apache.lucene.index.Terms;
|
||||||
|
import org.apache.lucene.index.TermsEnum;
|
||||||
import org.apache.lucene.search.BooleanClause;
|
import org.apache.lucene.search.BooleanClause;
|
||||||
import org.apache.lucene.search.BooleanQuery;
|
import org.apache.lucene.search.BooleanQuery;
|
||||||
import org.apache.lucene.search.IndexSearcher;
|
import org.apache.lucene.search.IndexSearcher;
|
||||||
|
@ -38,7 +42,10 @@ import org.apache.lucene.search.ScoreDoc;
|
||||||
import org.apache.lucene.search.TermQuery;
|
import org.apache.lucene.search.TermQuery;
|
||||||
import org.apache.lucene.store.AlreadyClosedException;
|
import org.apache.lucene.store.AlreadyClosedException;
|
||||||
import org.apache.lucene.store.Directory;
|
import org.apache.lucene.store.Directory;
|
||||||
|
import org.apache.lucene.util.BytesRef;
|
||||||
|
import org.apache.lucene.util.ReaderUtil;
|
||||||
import org.apache.lucene.util.Version;
|
import org.apache.lucene.util.Version;
|
||||||
|
import org.apache.lucene.util.VirtualMethod;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* <p>
|
* <p>
|
||||||
|
@ -492,18 +499,37 @@ public class SpellChecker implements java.io.Closeable {
|
||||||
* @param dict Dictionary to index
|
* @param dict Dictionary to index
|
||||||
* @param mergeFactor mergeFactor to use when indexing
|
* @param mergeFactor mergeFactor to use when indexing
|
||||||
* @param ramMB the max amount of memory in MB to use
|
* @param ramMB the max amount of memory in MB to use
|
||||||
|
* @param optimize whether or not the spellcheck index should be optimized
|
||||||
* @throws AlreadyClosedException if the Spellchecker is already closed
|
* @throws AlreadyClosedException if the Spellchecker is already closed
|
||||||
* @throws IOException
|
* @throws IOException
|
||||||
*/
|
*/
|
||||||
public void indexDictionary(Dictionary dict, int mergeFactor, int ramMB) throws IOException {
|
public final void indexDictionary(Dictionary dict, int mergeFactor, int ramMB, boolean optimize) throws IOException {
|
||||||
synchronized (modifyCurrentIndexLock) {
|
synchronized (modifyCurrentIndexLock) {
|
||||||
ensureOpen();
|
ensureOpen();
|
||||||
final Directory dir = this.spellIndex;
|
final Directory dir = this.spellIndex;
|
||||||
final IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(Version.LUCENE_CURRENT, new WhitespaceAnalyzer(Version.LUCENE_CURRENT)).setRAMBufferSizeMB(ramMB));
|
final IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(Version.LUCENE_CURRENT, new WhitespaceAnalyzer(Version.LUCENE_CURRENT)).setRAMBufferSizeMB(ramMB));
|
||||||
((LogMergePolicy) writer.getConfig().getMergePolicy()).setMergeFactor(mergeFactor);
|
((LogMergePolicy) writer.getConfig().getMergePolicy()).setMergeFactor(mergeFactor);
|
||||||
|
IndexSearcher indexSearcher = obtainSearcher();
|
||||||
|
final List<TermsEnum> termsEnums = new ArrayList<TermsEnum>();
|
||||||
|
|
||||||
|
if (searcher.maxDoc() > 0) {
|
||||||
|
new ReaderUtil.Gather(searcher.getIndexReader()) {
|
||||||
|
@Override
|
||||||
|
protected void add(int base, IndexReader r) throws IOException {
|
||||||
|
Terms terms = r.terms(F_WORD);
|
||||||
|
if (terms != null)
|
||||||
|
termsEnums.add(terms.iterator());
|
||||||
|
}
|
||||||
|
}.run();
|
||||||
|
}
|
||||||
|
|
||||||
|
boolean isEmpty = termsEnums.isEmpty();
|
||||||
|
|
||||||
|
try {
|
||||||
Iterator<String> iter = dict.getWordsIterator();
|
Iterator<String> iter = dict.getWordsIterator();
|
||||||
while (iter.hasNext()) {
|
BytesRef currentTerm = new BytesRef();
|
||||||
|
|
||||||
|
terms: while (iter.hasNext()) {
|
||||||
String word = iter.next();
|
String word = iter.next();
|
||||||
|
|
||||||
int len = word.length();
|
int len = word.length();
|
||||||
|
@ -511,15 +537,25 @@ public class SpellChecker implements java.io.Closeable {
|
||||||
continue; // too short we bail but "too long" is fine...
|
continue; // too short we bail but "too long" is fine...
|
||||||
}
|
}
|
||||||
|
|
||||||
if (this.exist(word)) { // if the word already exist in the gramindex
|
if (!isEmpty) {
|
||||||
continue;
|
// we have a non-empty index, check if the term exists
|
||||||
|
currentTerm.copy(word);
|
||||||
|
for (TermsEnum te : termsEnums) {
|
||||||
|
if (te.seek(currentTerm, false) == TermsEnum.SeekStatus.FOUND) {
|
||||||
|
continue terms;
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// ok index the word
|
// ok index the word
|
||||||
Document doc = createDocument(word, getMin(len), getMax(len));
|
Document doc = createDocument(word, getMin(len), getMax(len));
|
||||||
writer.addDocument(doc);
|
writer.addDocument(doc);
|
||||||
}
|
}
|
||||||
|
} finally {
|
||||||
|
releaseSearcher(indexSearcher);
|
||||||
|
}
|
||||||
// close writer
|
// close writer
|
||||||
|
if (optimize)
|
||||||
writer.optimize();
|
writer.optimize();
|
||||||
writer.close();
|
writer.close();
|
||||||
// also re-open the spell index to see our own changes when the next suggestion
|
// also re-open the spell index to see our own changes when the next suggestion
|
||||||
|
@ -531,10 +567,21 @@ public class SpellChecker implements java.io.Closeable {
|
||||||
/**
|
/**
|
||||||
* Indexes the data from the given {@link Dictionary}.
|
* Indexes the data from the given {@link Dictionary}.
|
||||||
* @param dict the dictionary to index
|
* @param dict the dictionary to index
|
||||||
|
* @param mergeFactor mergeFactor to use when indexing
|
||||||
|
* @param ramMB the max amount of memory in MB to use
|
||||||
* @throws IOException
|
* @throws IOException
|
||||||
*/
|
*/
|
||||||
public void indexDictionary(Dictionary dict) throws IOException {
|
public final void indexDictionary(Dictionary dict, int mergeFactor, int ramMB) throws IOException {
|
||||||
indexDictionary(dict, 300, 10);
|
indexDictionary(dict, mergeFactor, ramMB, true);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Indexes the data from the given {@link Dictionary}.
|
||||||
|
* @param dict the dictionary to index
|
||||||
|
* @throws IOException
|
||||||
|
*/
|
||||||
|
public final void indexDictionary(Dictionary dict) throws IOException {
|
||||||
|
indexDictionary(dict, 300, (int)IndexWriterConfig.DEFAULT_RAM_BUFFER_SIZE_MB);
|
||||||
}
|
}
|
||||||
|
|
||||||
private static int getMin(int l) {
|
private static int getMin(int l) {
|
||||||
|
@ -559,7 +606,12 @@ public class SpellChecker implements java.io.Closeable {
|
||||||
|
|
||||||
private static Document createDocument(String text, int ng1, int ng2) {
|
private static Document createDocument(String text, int ng1, int ng2) {
|
||||||
Document doc = new Document();
|
Document doc = new Document();
|
||||||
doc.add(new Field(F_WORD, text, Field.Store.YES, Field.Index.NOT_ANALYZED)); // orig term
|
// the word field is never queried on... it's indexed so it can be quickly
|
||||||
|
// checked for rebuild (and stored for retrieval). Doesn't need norms or TF/pos
|
||||||
|
Field f = new Field(F_WORD, text, Field.Store.YES, Field.Index.NOT_ANALYZED);
|
||||||
|
f.setOmitTermFreqAndPositions(true);
|
||||||
|
f.setOmitNorms(true);
|
||||||
|
doc.add(f); // orig term
|
||||||
addGram(text, doc, ng1, ng2);
|
addGram(text, doc, ng1, ng2);
|
||||||
return doc;
|
return doc;
|
||||||
}
|
}
|
||||||
|
@ -573,12 +625,20 @@ public class SpellChecker implements java.io.Closeable {
|
||||||
String gram = text.substring(i, i + ng);
|
String gram = text.substring(i, i + ng);
|
||||||
doc.add(new Field(key, gram, Field.Store.NO, Field.Index.NOT_ANALYZED));
|
doc.add(new Field(key, gram, Field.Store.NO, Field.Index.NOT_ANALYZED));
|
||||||
if (i == 0) {
|
if (i == 0) {
|
||||||
doc.add(new Field("start" + ng, gram, Field.Store.NO, Field.Index.NOT_ANALYZED));
|
// only one term possible in the startXXField, TF/pos and norms aren't needed.
|
||||||
|
Field startField = new Field("start" + ng, gram, Field.Store.NO, Field.Index.NOT_ANALYZED);
|
||||||
|
startField.setOmitTermFreqAndPositions(true);
|
||||||
|
startField.setOmitNorms(true);
|
||||||
|
doc.add(startField);
|
||||||
}
|
}
|
||||||
end = gram;
|
end = gram;
|
||||||
}
|
}
|
||||||
if (end != null) { // may not be present if len==ng1
|
if (end != null) { // may not be present if len==ng1
|
||||||
doc.add(new Field("end" + ng, end, Field.Store.NO, Field.Index.NOT_ANALYZED));
|
// only one term possible in the endXXField, TF/pos and norms aren't needed.
|
||||||
|
Field endField = new Field("end" + ng, end, Field.Store.NO, Field.Index.NOT_ANALYZED);
|
||||||
|
endField.setOmitTermFreqAndPositions(true);
|
||||||
|
endField.setOmitNorms(true);
|
||||||
|
doc.add(endField);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue