diff --git a/contrib/CHANGES.txt b/contrib/CHANGES.txt index cfed2931e4e..403a66a81b6 100644 --- a/contrib/CHANGES.txt +++ b/contrib/CHANGES.txt @@ -17,6 +17,12 @@ API Changes New features + * LUCENE-2108: Spellchecker now safely supports concurrent modifications to + the spell-index. Threads can safely obtain term suggestions while the spell- + index is rebuild, cleared or reset. Internal IndexSearcher instances remain + open until the last thread accessing them releases the reference. + (Simon Willnauer) + * LUCENE-2067: Add a Czech light stemmer. CzechAnalyzer will now stem words when Version is set to 3.1 or higher. (Robert Muir) diff --git a/contrib/spellchecker/src/java/org/apache/lucene/search/spell/SpellChecker.java b/contrib/spellchecker/src/java/org/apache/lucene/search/spell/SpellChecker.java index af229a92730..8bf8b5fe495 100755 --- a/contrib/spellchecker/src/java/org/apache/lucene/search/spell/SpellChecker.java +++ b/contrib/spellchecker/src/java/org/apache/lucene/search/spell/SpellChecker.java @@ -32,6 +32,7 @@ import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.Query; import org.apache.lucene.search.ScoreDoc; import org.apache.lucene.search.TermQuery; +import org.apache.lucene.store.AlreadyClosedException; import org.apache.lucene.store.Directory; /** @@ -60,10 +61,14 @@ public class SpellChecker { * Field name for each word in the ngram index. */ public static final String F_WORD = "word"; + + private static final Term F_WORD_TERM = new Term(F_WORD); /** * the spell index */ + // don't modify the directory directly - see #swapSearcher() + // TODO: why is this package private? Directory spellIndex; /** @@ -72,7 +77,22 @@ public class SpellChecker { private float bStart = 2.0f; private float bEnd = 1.0f; + // don't use this searcher directly - see #swapSearcher() private IndexSearcher searcher; + + /* + * this locks all modifications to the current searcher. + */ + private final Object searcherLock = new Object(); + + /* + * this lock synchronizes all possible modifications to the + * current index directory. It should not be possible to try modifying + * the same index concurrently. Note: Do not acquire the searcher lock + * before acquiring this lock! + */ + private final Object modifyCurrentIndexLock = new Object(); + private volatile boolean closed = false; // minimum score for hits generated by the spell checker query private float minScore = 0.5f; @@ -82,15 +102,24 @@ public class SpellChecker { /** * Use the given directory as a spell checker index. The directory * is created if it doesn't exist yet. + * @param spellIndex the spell index directory + * @param sd the {@link StringDistance} measurement to use + * @throws IOException if Spellchecker can not open the directory + */ + public SpellChecker(Directory spellIndex, StringDistance sd) throws IOException { + setSpellIndex(spellIndex); + setStringDistance(sd); + } + /** + * Use the given directory as a spell checker index with a + * {@link LevensteinDistance} as the default {@link StringDistance}. The + * directory is created if it doesn't exist yet. * * @param spellIndex + * the spell index directory * @throws IOException + * if spellchecker can not open the directory */ - public SpellChecker(Directory spellIndex,StringDistance sd) throws IOException { - this.setSpellIndex(spellIndex); - this.setStringDistance(sd); - } - public SpellChecker(Directory spellIndex) throws IOException { this(spellIndex, new LevensteinDistance()); } @@ -99,27 +128,41 @@ public class SpellChecker { * Use a different index as the spell checker index or re-open * the existing index if spellIndex is the same value * as given in the constructor. - * - * @param spellIndex - * @throws IOException + * @param spellIndexDir the spell directory to use + * @throws AlreadyClosedException if the Spellchecker is already closed + * @throws IOException if spellchecker can not open the directory */ - public void setSpellIndex(Directory spellIndex) throws IOException { - this.spellIndex = spellIndex; - if (!IndexReader.indexExists(spellIndex)) { - IndexWriter writer = new IndexWriter(spellIndex, null, true, IndexWriter.MaxFieldLength.UNLIMITED); - writer.close(); + // TODO: we should make this final as it is called in the constructor + public void setSpellIndex(Directory spellIndexDir) throws IOException { + // this could be the same directory as the current spellIndex + // modifications to the directory should be synchronized + synchronized (modifyCurrentIndexLock) { + ensureOpen(); + if (!IndexReader.indexExists(spellIndexDir)) { + IndexWriter writer = new IndexWriter(spellIndexDir, null, true, + IndexWriter.MaxFieldLength.UNLIMITED); + writer.close(); + } + swapSearcher(spellIndexDir); } - // close the old searcher, if there was one - if (searcher != null) { - searcher.close(); - } - searcher = new IndexSearcher(this.spellIndex, true); } - + /** + * Sets the {@link StringDistance} implementation for this + * {@link SpellChecker} instance. + * + * @param sd the {@link StringDistance} implementation for this + * {@link SpellChecker} instance + */ public void setStringDistance(StringDistance sd) { this.sd = sd; } - + /** + * Returns the {@link StringDistance} instance used by this + * {@link SpellChecker} instance. + * + * @return the {@link StringDistance} instance used by this + * {@link SpellChecker} instance. + */ public StringDistance getStringDistance() { return sd; } @@ -144,7 +187,8 @@ public class SpellChecker { * * @param word the word you want a spell check done on * @param numSug the number of suggested words - * @throws IOException + * @throws IOException if the underlying index throws an {@link IOException} + * @throws AlreadyClosedException if the Spellchecker is already closed * @return String[] */ public String[] suggestSimilar(String word, int numSug) throws IOException { @@ -169,98 +213,104 @@ public class SpellChecker { * words are restricted to the words present in this field. * @param morePopular return only the suggest words that are as frequent or more frequent than the searched word * (only if restricted mode = (indexReader!=null and field!=null) - * @throws IOException + * @throws IOException if the underlying index throws an {@link IOException} + * @throws AlreadyClosedException if the Spellchecker is already closed * @return String[] the sorted list of the suggest words with these 2 criteria: * first criteria: the edit distance, second criteria (only if restricted mode): the popularity * of the suggest words in the field of the user index */ public String[] suggestSimilar(String word, int numSug, IndexReader ir, String field, boolean morePopular) throws IOException { - - float min = this.minScore; - final int lengthWord = word.length(); - - final int freq = (ir != null && field != null) ? ir.docFreq(new Term(field, word)) : 0; - final int goalFreq = (morePopular && ir != null && field != null) ? freq : 0; - // if the word exists in the real index and we don't care for word frequency, return the word itself - if (!morePopular && freq > 0) { - return new String[] { word }; - } - - BooleanQuery query = new BooleanQuery(); - String[] grams; - String key; - - for (int ng = getMin(lengthWord); ng <= getMax(lengthWord); ng++) { - - key = "gram" + ng; // form key - - grams = formGrams(word, ng); // form word into ngrams (allow dups too) - - if (grams.length == 0) { - continue; // hmm + // obtainSearcher calls ensureOpen + final IndexSearcher indexSearcher = obtainSearcher(); + try{ + float min = this.minScore; + final int lengthWord = word.length(); + + final int freq = (ir != null && field != null) ? ir.docFreq(new Term(field, word)) : 0; + final int goalFreq = (morePopular && ir != null && field != null) ? freq : 0; + // if the word exists in the real index and we don't care for word frequency, return the word itself + if (!morePopular && freq > 0) { + return new String[] { word }; } - - if (bStart > 0) { // should we boost prefixes? - add(query, "start" + ng, grams[0], bStart); // matches start of word - - } - if (bEnd > 0) { // should we boost suffixes - add(query, "end" + ng, grams[grams.length - 1], bEnd); // matches end of word - - } - for (int i = 0; i < grams.length; i++) { - add(query, key, grams[i]); - } - } - - int maxHits = 10 * numSug; - -// System.out.println("Q: " + query); - ScoreDoc[] hits = searcher.search(query, null, maxHits).scoreDocs; -// System.out.println("HITS: " + hits.length()); - SuggestWordQueue sugQueue = new SuggestWordQueue(numSug); - - // go thru more than 'maxr' matches in case the distance filter triggers - int stop = Math.min(hits.length, maxHits); - SuggestWord sugWord = new SuggestWord(); - for (int i = 0; i < stop; i++) { - - sugWord.string = searcher.doc(hits[i].doc).get(F_WORD); // get orig word - - // don't suggest a word for itself, that would be silly - if (sugWord.string.equals(word)) { - continue; - } - - // edit distance - sugWord.score = sd.getDistance(word,sugWord.string); - if (sugWord.score < min) { - continue; - } - - if (ir != null && field != null) { // use the user index - sugWord.freq = ir.docFreq(new Term(field, sugWord.string)); // freq in the index - // don't suggest a word that is not present in the field - if ((morePopular && goalFreq > sugWord.freq) || sugWord.freq < 1) { - continue; + + BooleanQuery query = new BooleanQuery(); + String[] grams; + String key; + + for (int ng = getMin(lengthWord); ng <= getMax(lengthWord); ng++) { + + key = "gram" + ng; // form key + + grams = formGrams(word, ng); // form word into ngrams (allow dups too) + + if (grams.length == 0) { + continue; // hmm + } + + if (bStart > 0) { // should we boost prefixes? + add(query, "start" + ng, grams[0], bStart); // matches start of word + + } + if (bEnd > 0) { // should we boost suffixes + add(query, "end" + ng, grams[grams.length - 1], bEnd); // matches end of word + + } + for (int i = 0; i < grams.length; i++) { + add(query, key, grams[i]); } } - sugQueue.insertWithOverflow(sugWord); - if (sugQueue.size() == numSug) { - // if queue full, maintain the minScore score - min = sugQueue.top().score; + + int maxHits = 10 * numSug; + + // System.out.println("Q: " + query); + ScoreDoc[] hits = indexSearcher.search(query, null, maxHits).scoreDocs; + // System.out.println("HITS: " + hits.length()); + SuggestWordQueue sugQueue = new SuggestWordQueue(numSug); + + // go thru more than 'maxr' matches in case the distance filter triggers + int stop = Math.min(hits.length, maxHits); + SuggestWord sugWord = new SuggestWord(); + for (int i = 0; i < stop; i++) { + + sugWord.string = indexSearcher.doc(hits[i].doc).get(F_WORD); // get orig word + + // don't suggest a word for itself, that would be silly + if (sugWord.string.equals(word)) { + continue; + } + + // edit distance + sugWord.score = sd.getDistance(word,sugWord.string); + if (sugWord.score < min) { + continue; + } + + if (ir != null && field != null) { // use the user index + sugWord.freq = ir.docFreq(new Term(field, sugWord.string)); // freq in the index + // don't suggest a word that is not present in the field + if ((morePopular && goalFreq > sugWord.freq) || sugWord.freq < 1) { + continue; + } + } + sugQueue.insertWithOverflow(sugWord); + if (sugQueue.size() == numSug) { + // if queue full, maintain the minScore score + min = sugQueue.top().score; + } + sugWord = new SuggestWord(); } - sugWord = new SuggestWord(); + + // convert to array string + String[] list = new String[sugQueue.size()]; + for (int i = sugQueue.size() - 1; i >= 0; i--) { + list[i] = sugQueue.pop().string; + } + + return list; + } finally { + releaseSearcher(indexSearcher); } - - // convert to array string - String[] list = new String[sugQueue.size()]; - for (int i = sugQueue.size() - 1; i >= 0; i--) { - list[i] = sugQueue.pop().string; - } - - return list; } /** @@ -297,24 +347,33 @@ public class SpellChecker { /** * Removes all terms from the spell check index. * @throws IOException + * @throws AlreadyClosedException if the Spellchecker is already closed */ public void clearIndex() throws IOException { - IndexWriter writer = new IndexWriter(spellIndex, null, true, IndexWriter.MaxFieldLength.UNLIMITED); - writer.close(); - - //close the old searcher - searcher.close(); - searcher = new IndexSearcher(this.spellIndex, true); + synchronized (modifyCurrentIndexLock) { + ensureOpen(); + final Directory dir = this.spellIndex; + final IndexWriter writer = new IndexWriter(dir, null, true, IndexWriter.MaxFieldLength.UNLIMITED); + writer.close(); + swapSearcher(dir); + } } /** * Check whether the word exists in the index. * @param word * @throws IOException - * @return true iff the word exists in the index + * @throws AlreadyClosedException if the Spellchecker is already closed + * @return true if the word exists in the index */ public boolean exist(String word) throws IOException { - return searcher.docFreq(new Term(F_WORD, word)) > 0; + // obtainSearcher calls ensureOpen + final IndexSearcher indexSearcher = obtainSearcher(); + try{ + return indexSearcher.docFreq(F_WORD_TERM.createTerm(word)) > 0; + } finally { + releaseSearcher(indexSearcher); + } } /** @@ -322,37 +381,42 @@ public class SpellChecker { * @param dict Dictionary to index * @param mergeFactor mergeFactor to use when indexing * @param ramMB the max amount or memory in MB to use + * @throws AlreadyClosedException if the Spellchecker is already closed * @throws IOException */ public void indexDictionary(Dictionary dict, int mergeFactor, int ramMB) throws IOException { - IndexWriter writer = new IndexWriter(spellIndex, new WhitespaceAnalyzer(), IndexWriter.MaxFieldLength.UNLIMITED); - writer.setMergeFactor(mergeFactor); - writer.setRAMBufferSizeMB(ramMB); - - Iterator iter = dict.getWordsIterator(); - while (iter.hasNext()) { - String word = iter.next(); - - int len = word.length(); - if (len < 3) { - continue; // too short we bail but "too long" is fine... + synchronized (modifyCurrentIndexLock) { + ensureOpen(); + final Directory dir = this.spellIndex; + final IndexWriter writer = new IndexWriter(dir, new WhitespaceAnalyzer(), + IndexWriter.MaxFieldLength.UNLIMITED); + writer.setMergeFactor(mergeFactor); + writer.setRAMBufferSizeMB(ramMB); + + Iterator iter = dict.getWordsIterator(); + while (iter.hasNext()) { + String word = iter.next(); + + int len = word.length(); + if (len < 3) { + continue; // too short we bail but "too long" is fine... + } + + if (this.exist(word)) { // if the word already exist in the gramindex + continue; + } + + // ok index the word + Document doc = createDocument(word, getMin(len), getMax(len)); + writer.addDocument(doc); } - - if (this.exist(word)) { // if the word already exist in the gramindex - continue; - } - - // ok index the word - Document doc = createDocument(word, getMin(len), getMax(len)); - writer.addDocument(doc); + // close writer + writer.optimize(); + writer.close(); + // also re-open the spell index to see our own changes when the next suggestion + // is fetched: + swapSearcher(dir); } - // close writer - writer.optimize(); - writer.close(); - // also re-open the spell index to see our own changes when the next suggestion - // is fetched: - searcher.close(); - searcher = new IndexSearcher(this.spellIndex, true); } /** @@ -364,7 +428,7 @@ public class SpellChecker { indexDictionary(dict, 300, 10); } - private int getMin(int l) { + private static int getMin(int l) { if (l > 5) { return 3; } @@ -374,7 +438,7 @@ public class SpellChecker { return 1; } - private int getMax(int l) { + private static int getMax(int l) { if (l > 5) { return 4; } @@ -409,12 +473,84 @@ public class SpellChecker { } } } - + + private IndexSearcher obtainSearcher() { + synchronized (searcherLock) { + ensureOpen(); + searcher.getIndexReader().incRef(); + return searcher; + } + } + + private void releaseSearcher(final IndexSearcher aSearcher) throws IOException{ + // don't check if open - always decRef + // don't decrement the private searcher - could have been swapped + aSearcher.getIndexReader().decRef(); + } + + private void ensureOpen() { + if (closed) { + throw new AlreadyClosedException("Spellchecker has been closed"); + } + } + /** - * Close the IndexSearcher used by this SpellChecker. + * Close the IndexSearcher used by this SpellChecker + * @throws IOException if the close operation causes an {@link IOException} + * @throws AlreadyClosedException if the {@link SpellChecker} is already closed */ public void close() throws IOException { - searcher.close(); - searcher = null; + synchronized (searcherLock) { + ensureOpen(); + closed = true; + if (searcher != null) { + searcher.close(); + } + searcher = null; + } } + + private void swapSearcher(final Directory dir) throws IOException { + /* + * opening a searcher is possibly very expensive. + * We rather close it again if the Spellchecker was closed during + * this operation than block access to the current searcher while opening. + */ + final IndexSearcher indexSearcher = createSearcher(dir); + synchronized (searcherLock) { + if(closed){ + indexSearcher.close(); + throw new AlreadyClosedException("Spellchecker has been closed"); + } + if (searcher != null) { + searcher.close(); + } + // set the spellindex in the sync block - ensure consistency. + searcher = indexSearcher; + this.spellIndex = dir; + } + } + + /** + * Creates a new read-only IndexSearcher + * @param dir the directory used to open the searcher + * @return a new read-only IndexSearcher + * @throws IOException f there is a low-level IO error + */ + // for testing purposes + IndexSearcher createSearcher(final Directory dir) throws IOException{ + return new IndexSearcher(dir, true); + } + + /** + * Returns true if and only if the {@link SpellChecker} is + * closed, otherwise false. + * + * @return true if and only if the {@link SpellChecker} is + * closed, otherwise false. + */ + boolean isClosed(){ + return closed; + } + } diff --git a/contrib/spellchecker/src/test/org/apache/lucene/search/spell/TestSpellChecker.java b/contrib/spellchecker/src/test/org/apache/lucene/search/spell/TestSpellChecker.java index 995ee1ae520..e766f2a29d7 100755 --- a/contrib/spellchecker/src/test/org/apache/lucene/search/spell/TestSpellChecker.java +++ b/contrib/spellchecker/src/test/org/apache/lucene/search/spell/TestSpellChecker.java @@ -18,8 +18,13 @@ package org.apache.lucene.search.spell; */ import java.io.IOException; - -import junit.framework.TestCase; +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; +import java.util.Random; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; +import java.util.concurrent.TimeUnit; import org.apache.lucene.analysis.SimpleAnalyzer; import org.apache.lucene.document.Document; @@ -27,9 +32,12 @@ import org.apache.lucene.document.Field; import org.apache.lucene.index.CorruptIndexException; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.store.AlreadyClosedException; import org.apache.lucene.store.Directory; import org.apache.lucene.store.RAMDirectory; import org.apache.lucene.util.English; +import org.apache.lucene.util.LuceneTestCase; /** @@ -37,9 +45,11 @@ import org.apache.lucene.util.English; * * */ -public class TestSpellChecker extends TestCase { - private SpellChecker spellChecker; +public class TestSpellChecker extends LuceneTestCase { + private SpellCheckerMock spellChecker; private Directory userindex, spellindex; + private final Random random = newRandom(); + private List searchers; @Override protected void setUp() throws Exception { @@ -56,10 +66,10 @@ public class TestSpellChecker extends TestCase { writer.addDocument(doc); } writer.close(); - + searchers = Collections.synchronizedList(new ArrayList()); // create the spellChecker spellindex = new RAMDirectory(); - spellChecker = new SpellChecker(spellindex); + spellChecker = new SpellCheckerMock(spellindex); } @@ -75,7 +85,9 @@ public class TestSpellChecker extends TestCase { int num_field2 = this.numdoc(); assertEquals(num_field2, num_field1 + 1); - + + assertLastSearcherOpen(4); + checkCommonSuggestions(r); checkLevenshteinSuggestions(r); @@ -201,4 +213,186 @@ public class TestSpellChecker extends TestCase { return num; } + public void testClose() throws IOException { + IndexReader r = IndexReader.open(userindex, true); + spellChecker.clearIndex(); + String field = "field1"; + addwords(r, "field1"); + int num_field1 = this.numdoc(); + addwords(r, "field2"); + int num_field2 = this.numdoc(); + assertEquals(num_field2, num_field1 + 1); + checkCommonSuggestions(r); + assertLastSearcherOpen(4); + spellChecker.close(); + assertSearchersClosed(); + try { + spellChecker.close(); + fail("spellchecker was already closed"); + } catch (AlreadyClosedException e) { + // expected + } + try { + checkCommonSuggestions(r); + fail("spellchecker was already closed"); + } catch (AlreadyClosedException e) { + // expected + } + + try { + spellChecker.clearIndex(); + fail("spellchecker was already closed"); + } catch (AlreadyClosedException e) { + // expected + } + + try { + spellChecker.indexDictionary(new LuceneDictionary(r, field)); + fail("spellchecker was already closed"); + } catch (AlreadyClosedException e) { + // expected + } + + try { + spellChecker.setSpellIndex(spellindex); + fail("spellchecker was already closed"); + } catch (AlreadyClosedException e) { + // expected + } + assertEquals(4, searchers.size()); + assertSearchersClosed(); + } + + /* + * tests if the internally shared indexsearcher is correctly closed + * when the spellchecker is concurrently accessed and closed. + */ + public void testConcurrentAccess() throws IOException, InterruptedException { + assertEquals(1, searchers.size()); + final IndexReader r = IndexReader.open(userindex, true); + spellChecker.clearIndex(); + assertEquals(2, searchers.size()); + addwords(r, "field1"); + assertEquals(3, searchers.size()); + int num_field1 = this.numdoc(); + addwords(r, "field2"); + assertEquals(4, searchers.size()); + int num_field2 = this.numdoc(); + assertEquals(num_field2, num_field1 + 1); + int numThreads = 5 + this.random.nextInt(5); + ExecutorService executor = Executors.newFixedThreadPool(numThreads); + SpellCheckWorker[] workers = new SpellCheckWorker[numThreads]; + for (int i = 0; i < numThreads; i++) { + SpellCheckWorker spellCheckWorker = new SpellCheckWorker(r); + executor.execute(spellCheckWorker); + workers[i] = spellCheckWorker; + + } + int iterations = 5 + random.nextInt(5); + for (int i = 0; i < iterations; i++) { + Thread.sleep(100); + // concurrently reset the spell index + spellChecker.setSpellIndex(this.spellindex); + // for debug - prints the internal open searchers + // showSearchersOpen(); + } + + spellChecker.close(); + executor.shutdown(); + executor.awaitTermination(5, TimeUnit.SECONDS); + + + for (int i = 0; i < workers.length; i++) { + assertFalse(workers[i].failed); + assertTrue(workers[i].terminated); + } + // 4 searchers more than iterations + // 1. at creation + // 2. clearIndex() + // 2. and 3. during addwords + assertEquals(iterations + 4, searchers.size()); + assertSearchersClosed(); + + } + + private void assertLastSearcherOpen(int numSearchers) { + assertEquals(numSearchers, searchers.size()); + IndexSearcher[] searcherArray = searchers.toArray(new IndexSearcher[0]); + for (int i = 0; i < searcherArray.length; i++) { + if (i == searcherArray.length - 1) { + assertTrue("expected last searcher open but was closed", + searcherArray[i].getIndexReader().getRefCount() > 0); + } else { + assertFalse("expected closed searcher but was open - Index: " + i, + searcherArray[i].getIndexReader().getRefCount() > 0); + } + } + } + + private void assertSearchersClosed() { + for (IndexSearcher searcher : searchers) { + assertEquals(0, searcher.getIndexReader().getRefCount()); + } + } + + private void showSearchersOpen() { + int count = 0; + for (IndexSearcher searcher : searchers) { + if(searcher.getIndexReader().getRefCount() > 0) + ++count; + } + System.out.println(count); + } + + + private class SpellCheckWorker implements Runnable { + private final IndexReader reader; + boolean terminated = false; + boolean failed = false; + + SpellCheckWorker(IndexReader reader) { + super(); + this.reader = reader; + } + + public void run() { + try { + while (true) { + try { + checkCommonSuggestions(reader); + } catch (AlreadyClosedException e) { + + return; + } catch (Throwable e) { + + e.printStackTrace(); + failed = true; + return; + } + } + } finally { + terminated = true; + } + } + + } + + class SpellCheckerMock extends SpellChecker { + public SpellCheckerMock(Directory spellIndex) throws IOException { + super(spellIndex); + } + + public SpellCheckerMock(Directory spellIndex, StringDistance sd) + throws IOException { + super(spellIndex, sd); + } + + @Override + IndexSearcher createSearcher(Directory dir) throws IOException { + IndexSearcher searcher = super.createSearcher(dir); + TestSpellChecker.this.searchers.add(searcher); + return searcher; + } + } + }