From e2a79a145fc26c63f7b1e708b4f563ef39588db1 Mon Sep 17 00:00:00 2001
From: Daniel Naber
Date: Mon, 6 Jun 2005 22:29:30 +0000
Subject: [PATCH] new IndexModifier class that simplifies access to IndexReader and IndexWriter

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@185069 13f79535-47bb-0310-9956-ffa450edef68
---
 CHANGES.txt                                   |   5 +
 .../apache/lucene/index/IndexModifier.java    | 508 ++++++++++++++++++
 .../org/apache/lucene/index/TestIndex.java    | 256 +++++++++
 3 files changed, 769 insertions(+)
 create mode 100644 src/java/org/apache/lucene/index/IndexModifier.java
 create mode 100644 src/test/org/apache/lucene/index/TestIndex.java

diff --git a/CHANGES.txt b/CHANGES.txt
index 0a7cbe6b852..005fdba2955 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -112,6 +112,11 @@ New features
 17. Add IntParser and FloatParser interfaces to FieldCache, so that fields
     in arbitrarily formats can be cached as ints and floats.  (Doug Cutting)
+
+18. Added class org.apache.lucene.index.IndexModifier which combines
+    IndexWriter and IndexReader, so you can add and delete documents without
+    worrying about synchronisation/locking issues.
+    (Daniel Naber)
 
 API Changes
 
diff --git a/src/java/org/apache/lucene/index/IndexModifier.java b/src/java/org/apache/lucene/index/IndexModifier.java
new file mode 100644
index 00000000000..6bda2cc367c
--- /dev/null
+++ b/src/java/org/apache/lucene/index/IndexModifier.java
@@ -0,0 +1,508 @@
+package org.apache.lucene.index;
+
+/**
+ * Copyright 2005 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.File;
+import java.io.IOException;
+import java.io.PrintStream;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.FSDirectory;
+
+/**
+ * A class to modify an index, i.e. to delete and add documents. This
+ * class hides {@link IndexReader} and {@link IndexWriter} so that you
+ * do not need to care about implementation details such as the fact that
+ * adding documents is done via IndexWriter and deletion is done via
+ * IndexReader.
+ *
+ * <p>Note that you cannot create more than one IndexModifier object
+ * on the same directory at the same time.
+ *
+ * <p>Example usage:
+ *
+ * <pre>
+ *    Analyzer analyzer = new StandardAnalyzer();
+ *    // create an index in /tmp/index, overwriting an existing one:
+ *    IndexModifier indexModifier = new IndexModifier("/tmp/index", analyzer, true);
+ *    Document doc = new Document();
+ *    doc.add(new Field("id", "1", Field.Store.YES, Field.Index.UN_TOKENIZED));
+ *    doc.add(new Field("body", "a simple test", Field.Store.YES, Field.Index.TOKENIZED));
+ *    indexModifier.addDocument(doc);
+ *    int deleted = indexModifier.delete(new Term("id", "1"));
+ *    System.out.println("Deleted " + deleted + " document");
+ *    indexModifier.flush();
+ *    System.out.println(indexModifier.docCount() + " docs in index");
+ *    indexModifier.close();
+ * </pre>
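+ *
+ * <p>A second, purely illustrative sketch: to update existing documents,
+ * it is fastest to batch the work, i.e. first delete all affected
+ * documents and then re-add their new versions (see the note on batching
+ * below). The "id" values and the newDoc() helper are hypothetical:
+ *
+ * <pre>
+ *    // delete the old versions first ...
+ *    indexModifier.delete(new Term("id", "1"));
+ *    indexModifier.delete(new Term("id", "2"));
+ *    // ... then add the new versions, so that the switch between the
+ *    // internal IndexReader and IndexWriter happens only once:
+ *    indexModifier.addDocument(newDoc("1"));
+ *    indexModifier.addDocument(newDoc("2"));
+ * </pre>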

+ * <p>Not all methods of IndexReader and IndexWriter are offered by this
+ * class. If you need access to additional methods, either use those classes
+ * directly or implement your own class that extends IndexModifier.
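+ *
+ * <p>For example, a subclass can expose additional IndexReader methods; a
+ * minimal sketch, mirroring the PowerIndex helper from the test suite:
+ *
+ * <pre>
+ *    class PowerIndex extends IndexModifier {
+ *      PowerIndex(Directory dir, Analyzer analyzer, boolean create) throws IOException {
+ *        super(dir, analyzer, create);
+ *      }
+ *      public int docFreq(Term term) throws IOException {
+ *        synchronized(directory) {
+ *          assureOpen();
+ *          createIndexReader();
+ *          return indexReader.docFreq(term);
+ *        }
+ *      }
+ *    }
+ * </pre>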

+ * <p>Although an instance of this class can be used from more than one
+ * thread, you will not get the best performance. You might want to use
+ * IndexReader and IndexWriter directly for that (but you will then need
+ * to handle synchronization yourself).

+ * <p>While you can freely mix calls to add() and delete() using this
+ * class, you should batch your calls for best performance (see the second
+ * example above). For example, if you want to update 20 documents, you
+ * should first delete all those documents, then add all the new documents.
+ *
+ * @author Daniel Naber
+ */
+public class IndexModifier {
+
+  protected IndexWriter indexWriter = null;
+  protected IndexReader indexReader = null;
+
+  protected Directory directory = null;
+  protected Analyzer analyzer = null;
+  protected boolean open = false;
+
+  // Lucene defaults:
+  protected PrintStream infoStream = null;
+  protected boolean useCompoundFile = true;
+  protected int maxBufferedDocs = IndexWriter.DEFAULT_MIN_MERGE_DOCS;
+  protected int maxFieldLength = IndexWriter.DEFAULT_MAX_FIELD_LENGTH;
+  protected int mergeFactor = IndexWriter.DEFAULT_MERGE_FACTOR;
+
+  /**
+   * Open an index with write access.
+   *
+   * @param directory the index directory
+   * @param analyzer the analyzer to use for adding new documents
+   * @param create true to create the index or overwrite the existing one;
+   *   false to append to the existing index
+   */
+  public IndexModifier(Directory directory, Analyzer analyzer, boolean create) throws IOException {
+    init(directory, analyzer, create);
+  }
+
+  /**
+   * Open an index with write access.
+   *
+   * @param dirName the index directory
+   * @param analyzer the analyzer to use for adding new documents
+   * @param create true to create the index or overwrite the existing one;
+   *   false to append to the existing index
+   */
+  public IndexModifier(String dirName, Analyzer analyzer, boolean create) throws IOException {
+    Directory dir = FSDirectory.getDirectory(dirName, create);
+    init(dir, analyzer, create);
+  }
+
+  /**
+   * Open an index with write access.
+   *
+   * @param file the index directory
+   * @param analyzer the analyzer to use for adding new documents
+   * @param create true to create the index or overwrite the existing one;
+   *   false to append to the existing index
+   */
+  public IndexModifier(File file, Analyzer analyzer, boolean create) throws IOException {
+    Directory dir = FSDirectory.getDirectory(file, create);
+    init(dir, analyzer, create);
+  }
+
+  /**
+   * Initialize an IndexWriter.
+   * @throws IOException
+   */
+  protected void init(Directory directory, Analyzer analyzer, boolean create) throws IOException {
+    this.directory = directory;
+    synchronized(this.directory) {
+      this.analyzer = analyzer;
+      indexWriter = new IndexWriter(directory, analyzer, create);
+      open = true;
+    }
+  }
+
+  /**
+   * Throw an IllegalStateException if the index is closed.
+   * @throws IllegalStateException
+   */
+  protected void assureOpen() {
+    if (!open) {
+      throw new IllegalStateException("Index is closed");
+    }
+  }
+
+  /**
+   * Close the IndexReader and open an IndexWriter.
+   * @throws IOException
+   */
+  protected void createIndexWriter() throws IOException {
+    if (indexWriter == null) {
+      if (indexReader != null) {
+        indexReader.close();
+        indexReader = null;
+      }
+      indexWriter = new IndexWriter(directory, analyzer, false);
+      indexWriter.setInfoStream(infoStream);
+      indexWriter.setUseCompoundFile(useCompoundFile);
+      indexWriter.setMaxBufferedDocs(maxBufferedDocs);
+      indexWriter.setMaxFieldLength(maxFieldLength);
+      indexWriter.setMergeFactor(mergeFactor);
+    }
+  }
+
+  /**
+   * Close the IndexWriter and open an IndexReader.
+   * @throws IOException
+   */
+  protected void createIndexReader() throws IOException {
+    if (indexReader == null) {
+      if (indexWriter != null) {
+        indexWriter.close();
+        indexWriter = null;
+      }
+      indexReader = IndexReader.open(directory);
+    }
+  }
+
+  /**
+   * Make sure all changes are written to disk.
+   * @throws IOException
+   */
+  public void flush() throws IOException {
+    synchronized(directory) {
+      assureOpen();
+      if (indexWriter != null) {
+        indexWriter.close();
+        indexWriter = null;
+        createIndexWriter();
+      } else {
+        indexReader.close();
+        indexReader = null;
+        createIndexReader();
+      }
+    }
+  }
+
+  /**
+   * Adds a document to this index, using the provided analyzer instead of the
+   * one specified in the constructor.  If the document contains more than
+   * {@link #setMaxFieldLength(int)} terms for a given field, the remainder are
+   * discarded.
+   * @see IndexWriter#addDocument(Document, Analyzer)
+   * @throws IllegalStateException if the index is closed
+   */
+  public void addDocument(Document doc, Analyzer docAnalyzer) throws IOException {
+    synchronized(directory) {
+      assureOpen();
+      createIndexWriter();
+      if (docAnalyzer != null)
+        indexWriter.addDocument(doc, docAnalyzer);
+      else
+        indexWriter.addDocument(doc);
+    }
+  }
+
+  /**
+   * Adds a document to this index.  If the document contains more than
+   * {@link #setMaxFieldLength(int)} terms for a given field, the remainder are
+   * discarded.
+   * @see IndexWriter#addDocument(Document)
+   * @throws IllegalStateException if the index is closed
+   */
+  public void addDocument(Document doc) throws IOException {
+    addDocument(doc, null);
+  }
+
+  /**
+   * Deletes all documents containing <code>term</code>.
+   * This is useful if one uses a document field to hold a unique ID string for
+   * the document.  Then to delete such a document, one merely constructs a
+   * term with the appropriate field and the unique ID string as its text and
+   * passes it to this method.  Returns the number of documents deleted.
+   * @return the number of documents deleted
+   * @see IndexReader#delete(Term)
+   * @throws IllegalStateException if the index is closed
+   */
+  public int delete(Term term) throws IOException {
+    synchronized(directory) {
+      assureOpen();
+      createIndexReader();
+      return indexReader.delete(term);
+    }
+  }
+
+  /**
+   * Deletes the document numbered <code>docNum</code>.
+   * @see IndexReader#delete(int)
+   * @throws IllegalStateException if the index is closed
+   */
+  public void delete(int docNum) throws IOException {
+    synchronized(directory) {
+      assureOpen();
+      createIndexReader();
+      indexReader.delete(docNum);
+    }
+  }
+
+  /**
+   * Returns the number of documents currently in this index.
+   * @see IndexWriter#docCount()
+   * @see IndexReader#numDocs()
+   * @throws IllegalStateException if the index is closed
+   */
+  public int docCount() {
+    synchronized(directory) {
+      assureOpen();
+      if (indexWriter != null) {
+        return indexWriter.docCount();
+      } else {
+        return indexReader.numDocs();
+      }
+    }
+  }
+
+  /**
+   * Merges all segments together into a single segment, optimizing an index
+   * for search.
+   * @see IndexWriter#optimize()
+   * @throws IllegalStateException if the index is closed
+   */
+  public void optimize() throws IOException {
+    synchronized(directory) {
+      assureOpen();
+      createIndexWriter();
+      indexWriter.optimize();
+    }
+  }
+
+  /**
+   * If non-null, information about merges and a message when
+   * {@link #getMaxFieldLength()} is reached will be printed to this.

+   * <p>Example: index.setInfoStream(System.err);
+   * @see IndexWriter#setInfoStream(PrintStream)
+   * @throws IllegalStateException if the index is closed
+   */
+  public void setInfoStream(PrintStream infoStream) throws IOException {
+    synchronized(directory) {
+      assureOpen();
+      createIndexWriter();
+      indexWriter.setInfoStream(infoStream);
+      this.infoStream = infoStream;
+    }
+  }
+
+  /**
+   * @throws IOException
+   * @see IndexModifier#setInfoStream(PrintStream)
+   */
+  public PrintStream getInfoStream() throws IOException {
+    synchronized(directory) {
+      assureOpen();
+      createIndexWriter();
+      return indexWriter.getInfoStream();
+    }
+  }
+
+  /**
+   * Setting to turn on usage of a compound file. When on, multiple files
+   * for each segment are merged into a single file once the segment creation
+   * is finished. This is done regardless of what directory is in use.
+   * @see IndexWriter#setUseCompoundFile(boolean)
+   * @throws IllegalStateException if the index is closed
+   */
+  public void setUseCompoundFile(boolean useCompoundFile) throws IOException {
+    synchronized(directory) {
+      assureOpen();
+      createIndexWriter();
+      indexWriter.setUseCompoundFile(useCompoundFile);
+      this.useCompoundFile = useCompoundFile;
+    }
+  }
+
+  /**
+   * @throws IOException
+   * @see IndexModifier#setUseCompoundFile(boolean)
+   */
+  public boolean getUseCompoundFile() throws IOException {
+    synchronized(directory) {
+      assureOpen();
+      createIndexWriter();
+      return indexWriter.getUseCompoundFile();
+    }
+  }
+
+  /**
+   * The maximum number of terms that will be indexed for a single field in a
+   * document.  This limits the amount of memory required for indexing, so that
+   * collections with very large files will not crash the indexing process by
+   * running out of memory.
+   * <p>Note that this effectively truncates large documents, excluding from the
+   * index terms that occur further in the document. If you know your source
+   * documents are large, be sure to set this value high enough to accommodate
+   * the expected size. If you set it to Integer.MAX_VALUE, then the only limit
+   * is your memory, but you should anticipate an OutOfMemoryError.
+   * <p>By default, no more than 10,000 terms will be indexed for a field.
+   * @see IndexWriter#setMaxFieldLength(int)
+   * @throws IllegalStateException if the index is closed
+   */
+  public void setMaxFieldLength(int maxFieldLength) throws IOException {
+    synchronized(directory) {
+      assureOpen();
+      createIndexWriter();
+      indexWriter.setMaxFieldLength(maxFieldLength);
+      this.maxFieldLength = maxFieldLength;
+    }
+  }
+
+  /**
+   * @throws IOException
+   * @see IndexModifier#setMaxFieldLength(int)
+   */
+  public int getMaxFieldLength() throws IOException {
+    synchronized(directory) {
+      assureOpen();
+      createIndexWriter();
+      return indexWriter.getMaxFieldLength();
+    }
+  }
+
+  /**
+   * Determines the minimal number of documents required before the buffered
+   * in-memory documents are merged and a new segment is created. Since
+   * documents are buffered in memory, larger values give faster indexing; at
+   * the same time, mergeFactor limits the number of files open in a
+   * FSDirectory. The default value is 10.
+   * @see IndexWriter#setMaxBufferedDocs(int)
+   * @throws IllegalStateException if the index is closed
+   */
+  public void setMaxBufferedDocs(int maxBufferedDocs) throws IOException {
+    synchronized(directory) {
+      assureOpen();
+      createIndexWriter();
+      indexWriter.setMaxBufferedDocs(maxBufferedDocs);
+      this.maxBufferedDocs = maxBufferedDocs;
+    }
+  }
+
+  /**
+   * @throws IOException
+   * @see IndexModifier#setMaxBufferedDocs(int)
+   */
+  public int getMaxBufferedDocs() throws IOException {
+    synchronized(directory) {
+      assureOpen();
+      createIndexWriter();
+      return indexWriter.getMaxBufferedDocs();
+    }
+  }
+
+  /**
+   * Determines how often segment indices are merged by addDocument().  With
+   * smaller values, less RAM is used while indexing, and searches on
+   * unoptimized indices are faster, but indexing speed is slower.  With larger
+   * values, more RAM is used during indexing, and while searches on unoptimized
+   * indices are slower, indexing is faster.  Thus larger values (> 10) are best
+   * for batch index creation, and smaller values (< 10) for indices that are
+   * interactively maintained.

+   * <p>This must never be less than 2. The default value is 10.
+   *
+   * @see IndexWriter#setMergeFactor(int)
+   * @throws IllegalStateException if the index is closed
+   */
+  public void setMergeFactor(int mergeFactor) throws IOException {
+    synchronized(directory) {
+      assureOpen();
+      createIndexWriter();
+      indexWriter.setMergeFactor(mergeFactor);
+      this.mergeFactor = mergeFactor;
+    }
+  }
+
+  /**
+   * @throws IOException
+   * @see IndexModifier#setMergeFactor(int)
+   */
+  public int getMergeFactor() throws IOException {
+    synchronized(directory) {
+      assureOpen();
+      createIndexWriter();
+      return indexWriter.getMergeFactor();
+    }
+  }
+
+  /**
+   * Close this index, writing all pending changes to disk.
+   *
+   * @throws IllegalStateException if the index has already been closed
+   */
+  public void close() throws IOException {
+    synchronized(directory) {
+      if (!open)
+        throw new IllegalStateException("Index is closed already");
+      if (indexWriter != null) {
+        indexWriter.close();
+        indexWriter = null;
+      } else {
+        indexReader.close();
+        indexReader = null;
+      }
+      open = false;
+    }
+  }
+
+  public String toString() {
+    return "Index@" + directory;
+  }
+
+  /*
+  // used as an example in the javadoc:
+  public static void main(String[] args) throws IOException {
+    Analyzer analyzer = new StandardAnalyzer();
+    // create an index in /tmp/index, overwriting an existing one:
+    IndexModifier indexModifier = new IndexModifier("/tmp/index", analyzer, true);
+    Document doc = new Document();
+    doc.add(new Field("id", "1", Field.Store.YES, Field.Index.UN_TOKENIZED));
+    doc.add(new Field("body", "a simple test", Field.Store.YES, Field.Index.TOKENIZED));
+    indexModifier.addDocument(doc);
+    int deleted = indexModifier.delete(new Term("id", "1"));
+    System.out.println("Deleted " + deleted + " document");
+    indexModifier.flush();
+    System.out.println(indexModifier.docCount() + " docs in index");
+    indexModifier.close();
+  }*/
+
+}
diff --git a/src/test/org/apache/lucene/index/TestIndex.java b/src/test/org/apache/lucene/index/TestIndex.java
new file mode 100644
index 00000000000..53224d7c74c
--- /dev/null
+++ b/src/test/org/apache/lucene/index/TestIndex.java
@@ -0,0 +1,256 @@
+package org.apache.lucene.index;
+
+/**
+ * Copyright 2005 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.File;
+import java.io.IOException;
+import java.util.Random;
+import java.util.Stack;
+
+import junit.framework.TestCase;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.SimpleAnalyzer;
+import org.apache.lucene.analysis.standard.StandardAnalyzer;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.FSDirectory;
+import org.apache.lucene.store.RAMDirectory;
+
+/**
+ * Tests for the IndexModifier class, including accesses from two threads at
+ * the same time.
+ *
+ * @author Daniel Naber
+ */
+public class TestIndex extends TestCase {
+
+  private final int ITERATIONS = 500;   // iterations of thread test
+
+  private int docCount = 0;
+
+  private final Term allDocTerm = new Term("all", "x");
+
+  public void testIndex() throws IOException {
+    Directory ramDir = new RAMDirectory();
+    IndexModifier i = new IndexModifier(ramDir, new StandardAnalyzer(), true);
+    i.addDocument(getDoc());
+    assertEquals(1, i.docCount());
+    i.flush();
+    i.addDocument(getDoc(), new SimpleAnalyzer());
+    assertEquals(2, i.docCount());
+    i.optimize();
+    assertEquals(2, i.docCount());
+    i.flush();
+    i.delete(0);
+    assertEquals(1, i.docCount());
+    i.flush();
+    assertEquals(1, i.docCount());
+    i.addDocument(getDoc());
+    i.addDocument(getDoc());
+    i.flush();
+    assertEquals(3, i.docCount());
+    i.delete(allDocTerm);
+    assertEquals(0, i.docCount());
+    i.optimize();
+    assertEquals(0, i.docCount());
+
+    // Lucene defaults:
+    assertNull(i.getInfoStream());
+    assertTrue(i.getUseCompoundFile());
+    assertEquals(10, i.getMaxBufferedDocs());
+    assertEquals(10000, i.getMaxFieldLength());
+    assertEquals(10, i.getMergeFactor());
+    i.setMaxBufferedDocs(100);
+    i.setMergeFactor(25);
+    i.setMaxFieldLength(250000);
+    i.addDocument(getDoc());
+    i.setUseCompoundFile(false);
+    i.flush();
+    assertEquals(100, i.getMaxBufferedDocs());
+    assertEquals(25, i.getMergeFactor());
+    assertEquals(250000, i.getMaxFieldLength());
+    assertFalse(i.getUseCompoundFile());
+
+    i.close();
+    try {
+      i.docCount();
+      fail();
+    } catch (IllegalStateException e) {
+      // expected exception
+    }
+  }
+
+  public void testExtendedIndex() throws IOException {
+    Directory ramDir = new RAMDirectory();
+    PowerIndex powerIndex = new PowerIndex(ramDir, new StandardAnalyzer(), true);
+    powerIndex.addDocument(getDoc());
+    powerIndex.addDocument(getDoc());
+    powerIndex.addDocument(getDoc());
+    powerIndex.addDocument(getDoc());
+    powerIndex.addDocument(getDoc());
+    powerIndex.flush();
+    assertEquals(5, powerIndex.docFreq(allDocTerm));
+    powerIndex.close();
+  }
+
+  private Document getDoc() {
+    Document doc = new Document();
+    doc.add(new Field("body", new Integer(docCount).toString(), Field.Store.YES, Field.Index.UN_TOKENIZED));
+    doc.add(new Field("all", "x", Field.Store.YES, Field.Index.UN_TOKENIZED));
+    docCount++;
+    return doc;
+  }
+
+  public void testIndexWithThreads() throws IOException {
+    testIndexInternal(0);
+    testIndexInternal(10);
+    testIndexInternal(50);
+  }
+
+  private void testIndexInternal(int maxWait) throws IOException {
+    boolean create = true;
+    //Directory rd = new RAMDirectory();
+    // work on disk to make sure potential lock problems are tested:
+    String tempDir = System.getProperty("java.io.tmpdir");
+    if (tempDir == null)
+      throw new IOException("java.io.tmpdir undefined, cannot run test");
+    File indexDir = new File(tempDir, "lucenetestindex");
+    Directory rd = FSDirectory.getDirectory(indexDir, create);
+    IndexModifier index = new IndexModifier(rd, new StandardAnalyzer(), create);
+    IndexThread thread1 = new IndexThread(index, maxWait);
+    thread1.start();
+    IndexThread thread2 = new IndexThread(index, maxWait);
+    thread2.start();
+    while(thread1.isAlive() || thread2.isAlive()) {
+      try {
+        Thread.sleep(100);
+      } catch (InterruptedException e) {
+        e.printStackTrace();
+      }
+    }
+    index.optimize();
+    int added = thread1.added + thread2.added;
+    int deleted = thread1.deleted + thread2.deleted;
+    assertEquals(added-deleted, index.docCount());
+    index.close();
+
+    try {
+      index.close();
+      fail();
+    }
+    catch(IllegalStateException e) {
+      // expected exception
+    }
+    rmDir(indexDir);
+  }
+
+  private void rmDir(File dir) {
+    File[] files = dir.listFiles();
+    for (int i = 0; i < files.length; i++) {
+      files[i].delete();
+    }
+    dir.delete();
+  }
+
+  private int id = 0;
+  private Stack idStack = new Stack();
+  // TODO: test case is not reproducible despite pseudo-random numbers
+  // being used for all random decisions:
+  private Random random = new Random(101);   // constant seed for reproducibility
+
+  private class PowerIndex extends IndexModifier {
+    public PowerIndex(Directory dir, Analyzer analyzer, boolean create) throws IOException {
+      super(dir, analyzer, create);
+    }
+    public int docFreq(Term term) throws IOException {
+      synchronized(directory) {
+        assureOpen();
+        createIndexReader();
+        return indexReader.docFreq(term);
+      }
+    }
+  }
+
+  private class IndexThread extends Thread {
+
+    private int maxWait = 10;
+    private IndexModifier index;
+    private int added = 0;
+    private int deleted = 0;
+
+    IndexThread(IndexModifier index, int maxWait) {
+      this.index = index;
+      this.maxWait = maxWait;
+      id = 0;
+      idStack.clear();
+    }
+
+    public void run() {
+      try {
+        for(int i = 0; i < ITERATIONS; i++) {
+          int rand = random.nextInt(101);
+          if (rand < 5) {
+            index.optimize();
+          } else if (rand < 60) {
+            Document doc = getDocument();
+            //System.out.println("add doc id=" + doc.get("id"));
+            index.addDocument(doc);
+            idStack.push(doc.get("id"));
+            added++;
+          } else {
+            if (idStack.size() == 0) {
+              // not enough docs in index, let's wait for next chance
+            } else {
+              // we just delete the last document added and remove it
+              // from the id stack so that it won't be removed twice:
+              String delId = (String)idStack.pop();
+              //System.out.println("delete doc id = " + delId);
+              index.delete(new Term("id", new Integer(delId).toString()));
+              deleted++;
+            }
+          }
+          if (maxWait > 0) {
+            try {
+              rand = random.nextInt(maxWait);
+              //System.out.println("waiting " + rand + "ms");
+              Thread.sleep(rand);
+            } catch (InterruptedException e) {
+              e.printStackTrace();
+            }
+          }
+        }
+      } catch (IOException e) {
+        throw new RuntimeException(e);
+      }
+    }
+
+    private Document getDocument() {
+      Document doc = new Document();
+      doc.add(new Field("id", new Integer(id++).toString(), Field.Store.YES,
+          Field.Index.UN_TOKENIZED));
+      // add random stuff:
+      doc.add(new Field("content", new Integer(random.nextInt(1000)).toString(), Field.Store.YES,
+          Field.Index.TOKENIZED));
+      doc.add(new Field("content", new Integer(random.nextInt(1000)).toString(), Field.Store.YES,
+          Field.Index.TOKENIZED));
+      doc.add(new Field("all", "x", Field.Store.YES, Field.Index.TOKENIZED));
+      return doc;
+    }
+  }
+
+}