From e2559e40038c08047cf7b4f992ca25a94ce185d1 Mon Sep 17 00:00:00 2001 From: Dmitry Serebrennikov Date: Thu, 25 Sep 2003 22:01:51 +0000 Subject: [PATCH] Implementation of compound files. This reduces the number of files used by Lucene to 1 per index segment (2 when deleted documents exist). Test cases modified and added to go with this code. git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@150067 13f79535-47bb-0310-9956-ffa450edef68 --- .../lucene/index/CompoundFileReader.java | 247 ++++++ .../lucene/index/CompoundFileWriter.java | 210 ++++++ .../org/apache/lucene/index/IndexWriter.java | 113 +-- .../apache/lucene/index/SegmentMerger.java | 180 +++-- .../apache/lucene/index/SegmentReader.java | 83 ++- src/test/org/apache/lucene/TestSearch.java | 178 +++++ .../lucene/TestSearchForDuplicates.java | 190 +++++ .../org/apache/lucene/ThreadSafetyTest.java | 95 +-- src/test/org/apache/lucene/index/DocTest.java | 25 +- .../apache/lucene/index/TestCompoundFile.java | 701 ++++++++++++++++++ src/test/org/apache/lucene/index/TestDoc.java | 265 +++++++ .../org/apache/lucene/store/_TestHelper.java | 47 ++ 12 files changed, 2142 insertions(+), 192 deletions(-) create mode 100644 src/java/org/apache/lucene/index/CompoundFileReader.java create mode 100644 src/java/org/apache/lucene/index/CompoundFileWriter.java create mode 100644 src/test/org/apache/lucene/TestSearch.java create mode 100644 src/test/org/apache/lucene/TestSearchForDuplicates.java create mode 100644 src/test/org/apache/lucene/index/TestCompoundFile.java create mode 100644 src/test/org/apache/lucene/index/TestDoc.java create mode 100644 src/test/org/apache/lucene/store/_TestHelper.java diff --git a/src/java/org/apache/lucene/index/CompoundFileReader.java b/src/java/org/apache/lucene/index/CompoundFileReader.java new file mode 100644 index 00000000000..4558676f8e6 --- /dev/null +++ b/src/java/org/apache/lucene/index/CompoundFileReader.java @@ -0,0 +1,247 @@ +package org.apache.lucene.index; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.InputStream; +import org.apache.lucene.store.OutputStream; +import org.apache.lucene.store.FSDirectory; +import org.apache.lucene.store.Lock; +import java.util.HashMap; +import java.util.Iterator; +import java.io.IOException; + + +/** Class for accessing a compound stream. + * This class implements a directory, but is limited to only read operations. + * Directory methods that would normally modify data throw an exception. + */ +public class CompoundFileReader extends Directory { + + private static final class FileEntry { + long offset; + long length; + }; + + + // Base info + private Directory directory; + private String fileName; + + // Reference count + private boolean open; + + private InputStream stream; + private HashMap entries = new HashMap(); + + + public CompoundFileReader(Directory dir, String name) + throws IOException + { + directory = dir; + fileName = name; + + boolean success = false; + + try { + stream = dir.openFile(name); + + // read the directory and init files + int count = stream.readVInt(); + FileEntry entry = null; + for (int i=0; i 0 && pos >= length) + throw new IOException("Seek past the end of file"); + + if (pos < 0) + throw new IOException("Seek to a negative offset"); + + base.seek(fileOffset + pos); + } + + /** Closes the stream to futher operations. */ + public void close() throws IOException + { + base.close(); + } + + + /** Returns a clone of this stream. + * + *
+ * <p>Clones of a stream access the same data, and are positioned at the same
+ * point as the stream they were cloned from.
+ *
+ * <p>
Expert: Subclasses must ensure that clones may be positioned at + * different points in the input from each other and from the stream they + * were cloned from. + */ + public Object clone() { + CSInputStream other = (CSInputStream) super.clone(); + other.base = (InputStream) base.clone(); + return other; + } + } + + +} \ No newline at end of file diff --git a/src/java/org/apache/lucene/index/CompoundFileWriter.java b/src/java/org/apache/lucene/index/CompoundFileWriter.java new file mode 100644 index 00000000000..a48e90f81c6 --- /dev/null +++ b/src/java/org/apache/lucene/index/CompoundFileWriter.java @@ -0,0 +1,210 @@ +package org.apache.lucene.index; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.OutputStream; +import org.apache.lucene.store.InputStream; +import java.util.LinkedList; +import java.util.HashSet; +import java.util.Iterator; +import java.io.IOException; + + +/** Combines multiple files into a single compound file. + * The file format:
+ * <ul>
+ *   <li>VInt fileCount</li>
+ *   <li>{Directory}: fileCount entries, each holding a long dataOffset
+ *       and a String fileName</li>
+ *   <li>{File Data}: fileCount entries holding the raw bytes of the
+ *       corresponding files</li>
+ * </ul>
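+ *
+ * <p>Illustratively (offsets shown symbolically, not computed): a compound
+ * file holding "_1.fnm" and "_1.frq" would contain the VInt count 2, the
+ * directory entries {dataOffsetA, "_1.fnm"} and {dataOffsetB, "_1.frq"},
+ * and then the raw bytes of both files, starting at dataOffsetA and
+ * dataOffsetB respectively.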
+ * + * The fileCount integer indicates how many files are contained in this compound + * file. The {directory} that follows has that many entries. Each directory entry + * contains an encoding identifier, an long pointer to the start of this file's + * data section, and a UTF String with that file's extension. + */ +final class CompoundFileWriter { + + private static final class FileEntry { + /** source file */ + String file; + + /** temporary holder for the start of directory entry for this file */ + long directoryOffset; + + /** temporary holder for the start of this file's data section */ + long dataOffset; + } + + + private Directory directory; + private String fileName; + private HashSet ids; + private LinkedList entries; + private boolean merged = false; + + + /** Create the compound stream in the specified file. The file name is the + * entire name (no extensions are added). + */ + public CompoundFileWriter(Directory dir, String name) { + if (dir == null) + throw new IllegalArgumentException("Missing directory"); + if (name == null) + throw new IllegalArgumentException("Missing name"); + + directory = dir; + fileName = name; + ids = new HashSet(); + entries = new LinkedList(); + } + + /** Returns the directory of the compound file. */ + public Directory getDirectory() { + return directory; + } + + /** Returns the name of the compound file. */ + public String getName() { + return fileName; + } + + + /** Add a source stream. If sourceDir is null, it is set to the + * same value as the directory where this compound stream exists. + * The id is the string by which the sub-stream will be know in the + * compound stream. The caller must ensure that the ID is unique. If the + * id is null, it is set to the name of the source file. + */ + public void addFile(String file) { + if (merged) + throw new IllegalStateException( + "Can't add extensions after merge has been called"); + + if (file == null) + throw new IllegalArgumentException( + "Missing source file"); + + if (! ids.add(file)) + throw new IllegalArgumentException( + "File " + file + " already added"); + + FileEntry entry = new FileEntry(); + entry.file = file; + entries.add(entry); + } + + /** Merge files with the extensions added up to now. + * All files with these extensions are combined sequentially into the + * compound stream. After successful merge, the source files + * are deleted. + */ + public void close() throws IOException { + if (merged) + throw new IllegalStateException( + "Merge already performed"); + + if (entries.isEmpty()) + throw new IllegalStateException( + "No entries to merge have been defined"); + + merged = true; + + // open the compound stream + OutputStream os = null; + try { + os = directory.createFile(fileName); + + // Write the number of entries + os.writeVInt(entries.size()); + + // Write the directory with all offsets at 0. + // Remember the positions of directory entries so that we can + // adjust the offsets later + Iterator it = entries.iterator(); + while(it.hasNext()) { + FileEntry fe = (FileEntry) it.next(); + fe.directoryOffset = os.getFilePointer(); + os.writeLong(0); // for now + os.writeString(fe.file); + } + + // Open the files and copy their data into the stream. + // Remeber the locations of each file's data section. 
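+ // (The zero offsets written above get patched with these real
+ // dataOffset values by the seek/writeLong pass that follows the copy.)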
+ byte buffer[] = new byte[1024]; + it = entries.iterator(); + while(it.hasNext()) { + FileEntry fe = (FileEntry) it.next(); + fe.dataOffset = os.getFilePointer(); + copyFile(fe, os, buffer); + } + + // Write the data offsets into the directory of the compound stream + it = entries.iterator(); + while(it.hasNext()) { + FileEntry fe = (FileEntry) it.next(); + os.seek(fe.directoryOffset); + os.writeLong(fe.dataOffset); + } + + // Close the output stream. Set the os to null before trying to + // close so that if an exception occurs during the close, the + // finally clause below will not attempt to close the stream + // the second time. + OutputStream tmp = os; + os = null; + tmp.close(); + + } finally { + if (os != null) try { os.close(); } catch (IOException e) { } + } + } + + + /** Copy the contents of the file with specified extension into the + * provided output stream. Use the provided buffer for moving data + * to reduce memory allocation. + */ + private void copyFile(FileEntry source, OutputStream os, byte buffer[]) + throws IOException + { + InputStream is = null; + try { + long startPtr = os.getFilePointer(); + + is = directory.openFile(source.file); + long length = is.length(); + long remainder = length; + int chunk = buffer.length; + + while(remainder > 0) { + int len = (int) Math.min(chunk, remainder); + is.readBytes(buffer, 0, len); + os.writeBytes(buffer, len); + remainder -= len; + } + + // Verify that remainder is 0 + if (remainder != 0) + throw new IOException( + "Non-zero remainder length after copying: " + remainder + + " (id: " + source.file + ", length: " + length + + ", buffer size: " + chunk + ")"); + + // Verify that the output length diff is equal to original file + long endPtr = os.getFilePointer(); + long diff = endPtr - startPtr; + if (diff != length) + throw new IOException( + "Difference in the output file offsets " + diff + + " does not match the original file length " + length); + + } finally { + if (is != null) is.close(); + } + } +} \ No newline at end of file diff --git a/src/java/org/apache/lucene/index/IndexWriter.java b/src/java/org/apache/lucene/index/IndexWriter.java index e09ac3b2203..327bb65abbb 100644 --- a/src/java/org/apache/lucene/index/IndexWriter.java +++ b/src/java/org/apache/lucene/index/IndexWriter.java @@ -100,6 +100,29 @@ public class IndexWriter { private Lock writeLock; + /** Use compound file setting. Defaults to false to maintain multiple files + * per segment behavior. + */ + private boolean useCompoundFile = false; + + + /** Setting to turn on usage of a compound file. When on, multiple files + * for each segment are merged into a single file once the segment creation + * is finished. This is done regardless of what directory is in use. + */ + public boolean getUseCompoundFile() { + return useCompoundFile; + } + + /** Setting to turn on usage of a compound file. When on, multiple files + * for each segment are merged into a single file once the segment creation + * is finished. This is done regardless of what directory is in use. + */ + public void setUseCompoundFile(boolean value) { + useCompoundFile = value; + } + + /** Expert: Set the Similarity implementation used by this IndexWriter. 
* * @see Similarity#setDefault(Similarity) @@ -150,14 +173,14 @@ public class IndexWriter { synchronized (directory) { // in- & inter-process sync new Lock.With(directory.makeLock("commit.lock"), COMMIT_LOCK_TIMEOUT) { - public Object doBody() throws IOException { - if (create) - segmentInfos.write(directory); - else - segmentInfos.read(directory); - return null; - } - }.run(); + public Object doBody() throws IOException { + if (create) + segmentInfos.write(directory); + else + segmentInfos.read(directory); + return null; + } + }.run(); } } @@ -266,12 +289,14 @@ public class IndexWriter { public synchronized void optimize() throws IOException { flushRamSegments(); while (segmentInfos.size() > 1 || - (segmentInfos.size() == 1 && - (SegmentReader.hasDeletions(segmentInfos.info(0)) || - segmentInfos.info(0).dir != directory))) { + (segmentInfos.size() == 1 && + (SegmentReader.hasDeletions(segmentInfos.info(0)) || + (useCompoundFile && + !SegmentReader.usesCompoundFile(segmentInfos.info(0))) || + segmentInfos.info(0).dir != directory))) { int minSegment = segmentInfos.size() - mergeFactor; mergeSegments(minSegment < 0 ? 0 : minSegment); - } + } } /** Merges all segments from an array of indexes into this index. @@ -290,7 +315,7 @@ public class IndexWriter { SegmentInfos sis = new SegmentInfos(); // read infos from dir sis.read(dirs[i]); for (int j = 0; j < sis.size(); j++) { - segmentInfos.addElement(sis.info(j)); // add each info + segmentInfos.addElement(sis.info(j)); // add each info } } optimize(); // final cleanup @@ -301,13 +326,13 @@ public class IndexWriter { int minSegment = segmentInfos.size()-1; int docCount = 0; while (minSegment >= 0 && - (segmentInfos.info(minSegment)).dir == ramDirectory) { + (segmentInfos.info(minSegment)).dir == ramDirectory) { docCount += segmentInfos.info(minSegment).docCount; minSegment--; } if (minSegment < 0 || // add one FS segment? 
- (docCount + segmentInfos.info(minSegment).docCount) > mergeFactor || - !(segmentInfos.info(segmentInfos.size()-1).dir == ramDirectory)) + (docCount + segmentInfos.info(minSegment).docCount) > mergeFactor || + !(segmentInfos.info(segmentInfos.size()-1).dir == ramDirectory)) minSegment++; if (minSegment >= segmentInfos.size()) return; // none to merge @@ -322,16 +347,16 @@ public class IndexWriter { int minSegment = segmentInfos.size(); int mergeDocs = 0; while (--minSegment >= 0) { - SegmentInfo si = segmentInfos.info(minSegment); - if (si.docCount >= targetMergeDocs) - break; - mergeDocs += si.docCount; + SegmentInfo si = segmentInfos.info(minSegment); + if (si.docCount >= targetMergeDocs) + break; + mergeDocs += si.docCount; } if (mergeDocs >= targetMergeDocs) // found a merge to do - mergeSegments(minSegment+1); + mergeSegments(minSegment+1); else - break; + break; targetMergeDocs *= mergeFactor; // increase target size } @@ -344,17 +369,19 @@ public class IndexWriter { String mergedName = newSegmentName(); int mergedDocCount = 0; if (infoStream != null) infoStream.print("merging segments"); - SegmentMerger merger = new SegmentMerger(directory, mergedName); + SegmentMerger merger = + new SegmentMerger(directory, mergedName, useCompoundFile); + final Vector segmentsToDelete = new Vector(); for (int i = minSegment; i < segmentInfos.size(); i++) { SegmentInfo si = segmentInfos.info(i); if (infoStream != null) - infoStream.print(" " + si.name + " (" + si.docCount + " docs)"); + infoStream.print(" " + si.name + " (" + si.docCount + " docs)"); SegmentReader reader = new SegmentReader(si); merger.add(reader); if ((reader.directory == this.directory) || // if we own the directory (reader.directory == this.ramDirectory)) - segmentsToDelete.addElement(reader); // queue segment for deletion + segmentsToDelete.addElement(reader); // queue segment for deletion mergedDocCount += reader.numDocs(); } if (infoStream != null) { @@ -362,19 +389,19 @@ public class IndexWriter { infoStream.println(" into "+mergedName+" ("+mergedDocCount+" docs)"); } merger.merge(); - + segmentInfos.setSize(minSegment); // pop old infos & add new segmentInfos.addElement(new SegmentInfo(mergedName, mergedDocCount, - directory)); + directory)); synchronized (directory) { // in- & inter-process sync new Lock.With(directory.makeLock("commit.lock"), COMMIT_LOCK_TIMEOUT) { - public Object doBody() throws IOException { - segmentInfos.write(directory); // commit before deleting - deleteSegments(segmentsToDelete); // delete now-unused segments - return null; - } - }.run(); + public Object doBody() throws IOException { + segmentInfos.write(directory); // commit before deleting + deleteSegments(segmentsToDelete); // delete now-unused segments + return null; + } + }.run(); } } @@ -391,9 +418,9 @@ public class IndexWriter { for (int i = 0; i < segments.size(); i++) { SegmentReader reader = (SegmentReader)segments.elementAt(i); if (reader.directory == this.directory) - deleteFiles(reader.files(), deletable); // try to delete our files + deleteFiles(reader.files(), deletable); // try to delete our files else - deleteFiles(reader.files(), reader.directory); // delete, eg, RAM files + deleteFiles(reader.files(), reader.directory); // delete, eg, RAM files } writeDeleteableFiles(deletable); // note files we can't delete @@ -410,13 +437,13 @@ public class IndexWriter { for (int i = 0; i < files.size(); i++) { String file = (String)files.elementAt(i); try { - directory.deleteFile(file); // try to delete each file + directory.deleteFile(file); 
// try to delete each file } catch (IOException e) { // if delete fails - if (directory.fileExists(file)) { - if (infoStream != null) - infoStream.println(e.getMessage() + "; Will re-try later."); - deletable.addElement(file); // add to deletable - } + if (directory.fileExists(file)) { + if (infoStream != null) + infoStream.println(e.getMessage() + "; Will re-try later."); + deletable.addElement(file); // add to deletable + } } } } @@ -429,7 +456,7 @@ public class IndexWriter { InputStream input = directory.openFile("deletable"); try { for (int i = input.readInt(); i > 0; i--) // read file names - result.addElement(input.readString()); + result.addElement(input.readString()); } finally { input.close(); } @@ -441,7 +468,7 @@ public class IndexWriter { try { output.writeInt(files.size()); for (int i = 0; i < files.size(); i++) - output.writeString((String)files.elementAt(i)); + output.writeString((String)files.elementAt(i)); } finally { output.close(); } diff --git a/src/java/org/apache/lucene/index/SegmentMerger.java b/src/java/org/apache/lucene/index/SegmentMerger.java index 81a48bbad68..d7b5517633e 100644 --- a/src/java/org/apache/lucene/index/SegmentMerger.java +++ b/src/java/org/apache/lucene/index/SegmentMerger.java @@ -55,6 +55,8 @@ package org.apache.lucene.index; */ import java.util.Vector; +import java.util.ArrayList; +import java.util.Iterator; import java.io.IOException; import org.apache.lucene.store.Directory; @@ -63,15 +65,17 @@ import org.apache.lucene.store.InputStream; import org.apache.lucene.util.BitVector; final class SegmentMerger { + private boolean useCompoundFile; private Directory directory; private String segment; private Vector readers = new Vector(); private FieldInfos fieldInfos; - SegmentMerger(Directory dir, String name) { + SegmentMerger(Directory dir, String name, boolean compoundFile) { directory = dir; segment = name; + useCompoundFile = compoundFile; } final void add(SegmentReader reader) { @@ -90,12 +94,62 @@ final class SegmentMerger { } finally { for (int i = 0; i < readers.size(); i++) { // close readers - SegmentReader reader = (SegmentReader)readers.elementAt(i); - reader.close(); + SegmentReader reader = (SegmentReader)readers.elementAt(i); + reader.close(); } } + + if (useCompoundFile) + createCompoundFile(); } + + // Add the fixed files + private final String COMPOUND_EXTENSIONS[] = new String[] { + "fnm", "frq", "prx", "fdx", "fdt", "tii", "tis" + }; + + + private final void createCompoundFile() + throws IOException + { + CompoundFileWriter oneWriter = + new CompoundFileWriter(directory, segment + ".cfs"); + + ArrayList files = + new ArrayList(COMPOUND_EXTENSIONS.length + fieldInfos.size()); + + // Basic files + for (int i=0; i 0) { - SegmentMergeInfo smi = match[--matchSize]; - if (smi.next()) - queue.put(smi); // restore queue - else - smi.close(); // done with a segment + SegmentMergeInfo smi = match[--matchSize]; + if (smi.next()) + queue.put(smi); // restore queue + else + smi.close(); // done with a segment } } } @@ -209,34 +263,34 @@ final class SegmentMerger { smi.termEnum.termInfo(termInfo); postings.seek(termInfo); while (postings.next()) { - int doc; - if (docMap == null) - doc = base + postings.doc; // no deletions - else - doc = base + docMap[postings.doc]; // re-map around deletions + int doc; + if (docMap == null) + doc = base + postings.doc; // no deletions + else + doc = base + docMap[postings.doc]; // re-map around deletions - if (doc < lastDoc) - throw new IllegalStateException("docs out of order"); + if (doc < lastDoc) + 
throw new IllegalStateException("docs out of order"); - int docCode = (doc - lastDoc) << 1; // use low bit to flag freq=1 - lastDoc = doc; + int docCode = (doc - lastDoc) << 1; // use low bit to flag freq=1 + lastDoc = doc; - int freq = postings.freq; - if (freq == 1) { - freqOutput.writeVInt(docCode | 1); // write doc & freq=1 - } else { - freqOutput.writeVInt(docCode); // write doc - freqOutput.writeVInt(freq); // write frequency in doc - } - - int lastPosition = 0; // write position deltas - for (int j = 0; j < freq; j++) { - int position = postings.nextPosition(); - proxOutput.writeVInt(position - lastPosition); - lastPosition = position; - } + int freq = postings.freq; + if (freq == 1) { + freqOutput.writeVInt(docCode | 1); // write doc & freq=1 + } else { + freqOutput.writeVInt(docCode); // write doc + freqOutput.writeVInt(freq); // write frequency in doc + } + + int lastPosition = 0; // write position deltas + for (int j = 0; j < freq; j++) { + int position = postings.nextPosition(); + proxOutput.writeVInt(position - lastPosition); + lastPosition = position; + } - df++; + df++; } } return df; @@ -246,27 +300,27 @@ final class SegmentMerger { for (int i = 0; i < fieldInfos.size(); i++) { FieldInfo fi = fieldInfos.fieldInfo(i); if (fi.isIndexed) { - OutputStream output = directory.createFile(segment + ".f" + i); - try { - for (int j = 0; j < readers.size(); j++) { - SegmentReader reader = (SegmentReader)readers.elementAt(j); - BitVector deletedDocs = reader.deletedDocs; - InputStream input = reader.normStream(fi.name); + OutputStream output = directory.createFile(segment + ".f" + i); + try { + for (int j = 0; j < readers.size(); j++) { + SegmentReader reader = (SegmentReader)readers.elementAt(j); + BitVector deletedDocs = reader.deletedDocs; + InputStream input = reader.normStream(fi.name); int maxDoc = reader.maxDoc(); - try { - for (int k = 0; k < maxDoc; k++) { - byte norm = input != null ? input.readByte() : (byte)0; - if (deletedDocs == null || !deletedDocs.get(k)) - output.writeByte(norm); - } - } finally { - if (input != null) - input.close(); - } - } - } finally { - output.close(); - } + try { + for (int k = 0; k < maxDoc; k++) { + byte norm = input != null ? 
input.readByte() : (byte)0; + if (deletedDocs == null || !deletedDocs.get(k)) + output.writeByte(norm); + } + } finally { + if (input != null) + input.close(); + } + } + } finally { + output.close(); + } } } } diff --git a/src/java/org/apache/lucene/index/SegmentReader.java b/src/java/org/apache/lucene/index/SegmentReader.java index d8513543d6b..83d0924afdc 100644 --- a/src/java/org/apache/lucene/index/SegmentReader.java +++ b/src/java/org/apache/lucene/index/SegmentReader.java @@ -65,6 +65,7 @@ import java.util.Vector; import org.apache.lucene.document.Document; import org.apache.lucene.store.InputStream; import org.apache.lucene.store.Lock; +import org.apache.lucene.store.Directory; import org.apache.lucene.util.BitVector; final class SegmentReader extends IndexReader { @@ -81,7 +82,9 @@ final class SegmentReader extends IndexReader { InputStream freqStream; InputStream proxStream; - + + // Compound File Reader when based on a compound file segment + CompoundFileReader cfsReader; private static class Norm { public Norm(InputStream in) { this.in = in; } @@ -101,32 +104,42 @@ final class SegmentReader extends IndexReader { super(si.dir); segment = si.name; - fieldInfos = new FieldInfos(directory, segment + ".fnm"); - fieldsReader = new FieldsReader(directory, segment, fieldInfos); + // Use compound file directory for some files, if it exists + Directory cfsDir = directory; + if (directory.fileExists(segment + ".cfs")) { + cfsReader = new CompoundFileReader(directory, segment + ".cfs"); + cfsDir = cfsReader; + } - tis = new TermInfosReader(directory, segment, fieldInfos); + // No compound file exists - use the multi-file format + fieldInfos = new FieldInfos(cfsDir, segment + ".fnm"); + fieldsReader = new FieldsReader(cfsDir, segment, fieldInfos); + tis = new TermInfosReader(cfsDir, segment, fieldInfos); + + // NOTE: the bitvector is stored using the regular directory, not cfs if (hasDeletions(si)) deletedDocs = new BitVector(directory, segment + ".del"); // make sure that all index files have been read or are kept open // so that if an index update removes them we'll still have them - freqStream = directory.openFile(segment + ".frq"); - proxStream = directory.openFile(segment + ".prx"); - openNorms(); + freqStream = cfsDir.openFile(segment + ".frq"); + proxStream = cfsDir.openFile(segment + ".prx"); + openNorms(cfsDir); } + final synchronized void doClose() throws IOException { if (deletedDocsDirty) { synchronized (directory) { // in- & inter-process sync - new Lock.With(directory.makeLock("commit.lock"), IndexWriter.COMMIT_LOCK_TIMEOUT) { - public Object doBody() throws IOException { - deletedDocs.write(directory, segment + ".tmp"); - directory.renameFile(segment + ".tmp", segment + ".del"); + new Lock.With(directory.makeLock("commit.lock"), IndexWriter.COMMIT_LOCK_TIMEOUT) { + public Object doBody() throws IOException { + deletedDocs.write(directory, segment + ".tmp"); + directory.renameFile(segment + ".tmp", segment + ".del"); directory.touchFile("segments"); - return null; - } - }.run(); + return null; + } + }.run(); } deletedDocsDirty = false; } @@ -140,6 +153,9 @@ final class SegmentReader extends IndexReader { proxStream.close(); closeNorms(); + + if (cfsReader != null) + cfsReader.close(); if (closeDirectory) directory.close(); @@ -149,6 +165,10 @@ final class SegmentReader extends IndexReader { return si.dir.fileExists(si.name + ".del"); } + static final boolean usesCompoundFile(SegmentInfo si) throws IOException { + return si.dir.fileExists(si.name + ".cfs"); + } + final 
synchronized void doDelete(int docNum) throws IOException { if (deletedDocs == null) deletedDocs = new BitVector(maxDoc()); @@ -158,21 +178,20 @@ final class SegmentReader extends IndexReader { final Vector files() throws IOException { Vector files = new Vector(16); - files.addElement(segment + ".fnm"); - files.addElement(segment + ".fdx"); - files.addElement(segment + ".fdt"); - files.addElement(segment + ".tii"); - files.addElement(segment + ".tis"); - files.addElement(segment + ".frq"); - files.addElement(segment + ".prx"); - - if (directory.fileExists(segment + ".del")) - files.addElement(segment + ".del"); - + final String ext[] = new String[] { + "cfs", "fnm", "fdx", "fdt", "tii", "tis", "frq", "prx", "del" + }; + + for (int i=0; i. + */ + +import java.util.GregorianCalendar; +import java.io.PrintWriter; +import java.io.StringWriter; + +import junit.framework.TestCase; +import junit.framework.TestSuite; +import junit.textui.TestRunner; + +import org.apache.lucene.store.*; +import org.apache.lucene.document.*; +import org.apache.lucene.analysis.*; +import org.apache.lucene.index.*; +import org.apache.lucene.search.*; +import org.apache.lucene.queryParser.*; + +/** JUnit adaptation of an older test case SearchTest. + * @author dmitrys@earthlink.net + * @version $Id$ + */ +public class TestSearch extends TestCase { + + /** Main for running test case by itself. */ + public static void main(String args[]) { + TestRunner.run (new TestSuite(TestSearch.class)); + } + + /** This test performs a number of searches. It also compares output + * of searches using multi-file index segments with single-file + * index segments. + * + * TODO: someone should check that the results of the searches are + * still correct by adding assert statements. Right now, the test + * passes if the results are the same between multi-file and + * single-file formats, even if the results are wrong. 
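+ * A possible shape for such an assertion (the expected counts are
+ * hypothetical and would need to be verified by hand first):
+ * <pre>
+ *   // expectedCounts[j]: hand-verified hit count for queries[j]
+ *   assertEquals(expectedCounts[j], hits.length());
+ * </pre>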
+ */ + public void testSearch() throws Exception { + StringWriter sw = new StringWriter(); + PrintWriter pw = new PrintWriter(sw, true); + doTestSearch(pw, false); + pw.close(); + sw.close(); + String multiFileOutput = sw.getBuffer().toString(); + System.out.println(multiFileOutput); + + sw = new StringWriter(); + pw = new PrintWriter(sw, true); + doTestSearch(pw, true); + pw.close(); + sw.close(); + String singleFileOutput = sw.getBuffer().toString(); + + assertEquals(multiFileOutput, singleFileOutput); + } + + + private void doTestSearch(PrintWriter out, boolean useCompoundFile) + throws Exception + { + Directory directory = new RAMDirectory(); + Analyzer analyzer = new SimpleAnalyzer(); + IndexWriter writer = new IndexWriter(directory, analyzer, true); + + writer.setUseCompoundFile(useCompoundFile); + + String[] docs = { + "a b c d e", + "a b c d e a b c d e", + "a b c d e f g h i j", + "a c e", + "e c a", + "a c e a c e", + "a c e a b c" + }; + for (int j = 0; j < docs.length; j++) { + Document d = new Document(); + d.add(Field.Text("contents", docs[j])); + writer.addDocument(d); + } + writer.close(); + + Searcher searcher = new IndexSearcher(directory); + + String[] queries = { + "a b", + "\"a b\"", + "\"a b c\"", + "a c", + "\"a c\"", + "\"a c e\"", + }; + Hits hits = null; + + QueryParser parser = new QueryParser("contents", analyzer); + parser.setPhraseSlop(4); + for (int j = 0; j < queries.length; j++) { + Query query = parser.parse(queries[j]); + out.println("Query: " + query.toString("contents")); + + //DateFilter filter = + // new DateFilter("modified", Time(1997,0,1), Time(1998,0,1)); + //DateFilter filter = DateFilter.Before("modified", Time(1997,00,01)); + //System.out.println(filter); + + hits = searcher.search(query); + + out.println(hits.length() + " total results"); + for (int i = 0 ; i < hits.length() && i < 10; i++) { + Document d = hits.doc(i); + out.println(i + " " + hits.score(i) +// + " " + DateField.stringToDate(d.get("modified")) + + " " + d.get("contents")); + } + } + searcher.close(); + } + + static long Time(int year, int month, int day) { + GregorianCalendar calendar = new GregorianCalendar(); + calendar.set(year, month, day); + return calendar.getTime().getTime(); + } +} diff --git a/src/test/org/apache/lucene/TestSearchForDuplicates.java b/src/test/org/apache/lucene/TestSearchForDuplicates.java new file mode 100644 index 00000000000..bc6d3b61d68 --- /dev/null +++ b/src/test/org/apache/lucene/TestSearchForDuplicates.java @@ -0,0 +1,190 @@ +package org.apache.lucene; + +/* ==================================================================== + * The Apache Software License, Version 1.1 + * + * Copyright (c) 2001 The Apache Software Foundation. All rights + * reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * 3. The end-user documentation included with the redistribution, + * if any, must include the following acknowledgment: + * "This product includes software developed by the + * Apache Software Foundation (http://www.apache.org/)." 
+ * Alternately, this acknowledgment may appear in the software itself, + * if and wherever such third-party acknowledgments normally appear. + * + * 4. The names "Apache" and "Apache Software Foundation" and + * "Apache Lucene" must not be used to endorse or promote products + * derived from this software without prior written permission. For + * written permission, please contact apache@apache.org. + * + * 5. Products derived from this software may not be called "Apache", + * "Apache Lucene", nor may "Apache" appear in their name, without + * prior written permission of the Apache Software Foundation. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR + * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF + * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT + * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * ==================================================================== + * + * This software consists of voluntary contributions made by many + * individuals on behalf of the Apache Software Foundation. For more + * information on the Apache Software Foundation, please see + * . + */ + +import java.io.IOException; +import java.io.PrintWriter; +import java.io.StringWriter; + +import org.apache.lucene.store.*; +import org.apache.lucene.document.*; +import org.apache.lucene.analysis.*; +import org.apache.lucene.index.*; +import org.apache.lucene.search.*; +import org.apache.lucene.queryParser.*; + +import junit.framework.TestCase; +import junit.framework.TestSuite; +import junit.textui.TestRunner; + + +/** JUnit adaptation of an older test case DocTest. + * @author dmitrys@earthlink.net + * @version $Id$ + */ +public class TestSearchForDuplicates extends TestCase { + + /** Main for running test case by itself. */ + public static void main(String args[]) { + TestRunner.run (new TestSuite(TestSearchForDuplicates.class)); + } + + + + static final String PRIORITY_FIELD ="priority"; + static final String ID_FIELD ="id"; + static final String HIGH_PRIORITY ="high"; + static final String MED_PRIORITY ="medium"; + static final String LOW_PRIORITY ="low"; + + + /** This test compares search results when using and not using compound + * files. + * + * TODO: There is rudimentary search result validation as well, but it is + * simply based on asserting the output observed in the old test case, + * without really knowing if the output is correct. Someone needs to + * validate this output and make any changes to the checkHits method. 
+ */ + public void testRun() throws Exception { + StringWriter sw = new StringWriter(); + PrintWriter pw = new PrintWriter(sw, true); + doTest(pw, false); + pw.close(); + sw.close(); + String multiFileOutput = sw.getBuffer().toString(); + System.out.println(multiFileOutput); + + sw = new StringWriter(); + pw = new PrintWriter(sw, true); + doTest(pw, true); + pw.close(); + sw.close(); + String singleFileOutput = sw.getBuffer().toString(); + + assertEquals(multiFileOutput, singleFileOutput); + } + + + private void doTest(PrintWriter out, boolean useCompoundFiles) throws Exception { + Directory directory = new RAMDirectory(); + Analyzer analyzer = new SimpleAnalyzer(); + IndexWriter writer = new IndexWriter(directory, analyzer, true); + + writer.setUseCompoundFile(useCompoundFiles); + + final int MAX_DOCS = 225; + + for (int j = 0; j < MAX_DOCS; j++) { + Document d = new Document(); + d.add(Field.Text(PRIORITY_FIELD, HIGH_PRIORITY)); + d.add(Field.Text(ID_FIELD, Integer.toString(j))); + writer.addDocument(d); + } + writer.close(); + + // try a search without OR + Searcher searcher = new IndexSearcher(directory); + Hits hits = null; + + QueryParser parser = new QueryParser(PRIORITY_FIELD, analyzer); + + Query query = parser.parse(HIGH_PRIORITY); + out.println("Query: " + query.toString(PRIORITY_FIELD)); + + hits = searcher.search(query); + printHits(out, hits); + checkHits(hits, MAX_DOCS); + + searcher.close(); + + // try a new search with OR + searcher = new IndexSearcher(directory); + hits = null; + + parser = new QueryParser(PRIORITY_FIELD, analyzer); + + query = parser.parse(HIGH_PRIORITY + " OR " + MED_PRIORITY); + out.println("Query: " + query.toString(PRIORITY_FIELD)); + + hits = searcher.search(query); + printHits(out, hits); + checkHits(hits, MAX_DOCS); + + searcher.close(); + } + + + private void printHits(PrintWriter out, Hits hits ) throws IOException { + out.println(hits.length() + " total results\n"); + for (int i = 0 ; i < hits.length(); i++) { + if ( i < 10 || (i > 94 && i < 105) ) { + Document d = hits.doc(i); + out.println(i + " " + d.get(ID_FIELD)); + } + } + } + + private void checkHits(Hits hits, int expectedCount) throws IOException { + assertEquals("total results", expectedCount, hits.length()); + for (int i = 0 ; i < hits.length(); i++) { + if ( i < 10 || (i > 94 && i < 105) ) { + Document d = hits.doc(i); + assertEquals("check " + i, String.valueOf(i), d.get(ID_FIELD)); + } + } + } + +} diff --git a/src/test/org/apache/lucene/ThreadSafetyTest.java b/src/test/org/apache/lucene/ThreadSafetyTest.java index 50f06a1e45b..017c92e72a5 100644 --- a/src/test/org/apache/lucene/ThreadSafetyTest.java +++ b/src/test/org/apache/lucene/ThreadSafetyTest.java @@ -62,6 +62,7 @@ import org.apache.lucene.search.*; import org.apache.lucene.queryParser.*; import java.util.Random; +import java.io.File; class ThreadSafetyTest { private static final Analyzer ANALYZER = new SimpleAnalyzer(); @@ -86,26 +87,33 @@ class ThreadSafetyTest { public void run() { try { - for (int i = 0; i < 1024*ITERATIONS; i++) { - Document d = new Document(); - int n = RANDOM.nextInt(); - d.add(Field.Keyword("id", Integer.toString(n))); - d.add(Field.UnStored("contents", intToEnglish(n))); - System.out.println("Adding " + n); - writer.addDocument(d); + boolean useCompoundFiles = false; + + for (int i = 0; i < 1024*ITERATIONS; i++) { + Document d = new Document(); + int n = RANDOM.nextInt(); + d.add(Field.Keyword("id", Integer.toString(n))); + d.add(Field.UnStored("contents", intToEnglish(n))); + 
System.out.println("Adding " + n); + + // Switch between single and multiple file segments + useCompoundFiles = Math.random() < 0.5; + writer.setUseCompoundFile(useCompoundFiles); + + writer.addDocument(d); - if (i%reopenInterval == 0) { - writer.close(); - writer = new IndexWriter("index", ANALYZER, false); - } - } - - writer.close(); + if (i%reopenInterval == 0) { + writer.close(); + writer = new IndexWriter("index", ANALYZER, false); + } + } + + writer.close(); } catch (Exception e) { - System.out.println(e.toString()); - e.printStackTrace(); - System.exit(0); + System.out.println(e.toString()); + e.printStackTrace(); + System.exit(0); } } } @@ -116,26 +124,26 @@ class ThreadSafetyTest { public SearcherThread(boolean useGlobal) throws java.io.IOException { if (!useGlobal) - this.searcher = new IndexSearcher("index"); + this.searcher = new IndexSearcher("index"); } public void run() { try { - for (int i = 0; i < 512*ITERATIONS; i++) { - searchFor(RANDOM.nextInt(), (searcher==null)?SEARCHER:searcher); - if (i%reopenInterval == 0) { - if (searcher == null) { - SEARCHER = new IndexSearcher("index"); - } else { - searcher.close(); - searcher = new IndexSearcher("index"); - } - } - } + for (int i = 0; i < 512*ITERATIONS; i++) { + searchFor(RANDOM.nextInt(), (searcher==null)?SEARCHER:searcher); + if (i%reopenInterval == 0) { + if (searcher == null) { + SEARCHER = new IndexSearcher("index"); + } else { + searcher.close(); + searcher = new IndexSearcher("index"); + } + } + } } catch (Exception e) { - System.out.println(e.toString()); - e.printStackTrace(); - System.exit(0); + System.out.println(e.toString()); + e.printStackTrace(); + System.exit(0); } } @@ -143,11 +151,11 @@ class ThreadSafetyTest { throws Exception { System.out.println("Searching for " + n); Hits hits = - searcher.search(QueryParser.parse(intToEnglish(n), "contents", - ANALYZER)); + searcher.search(QueryParser.parse(intToEnglish(n), "contents", + ANALYZER)); System.out.println("Search for " + n + ": total=" + hits.length()); for (int j = 0; j < Math.min(3, hits.length()); j++) { - System.out.println("Hit for " + n + ": " + hits.doc(j).get("id")); + System.out.println("Hit for " + n + ": " + hits.doc(j).get("id")); } } } @@ -159,15 +167,18 @@ class ThreadSafetyTest { for (int i = 0; i < args.length; i++) { if ("-ro".equals(args[i])) - readOnly = true; + readOnly = true; if ("-add".equals(args[i])) - add = true; + add = true; } - IndexReader.unlock(FSDirectory.getDirectory("index", false)); + File indexDir = new File("index"); + if (! 
indexDir.exists()) indexDir.mkdirs(); + + IndexReader.unlock(FSDirectory.getDirectory(indexDir, false)); if (!readOnly) { - IndexWriter writer = new IndexWriter("index", ANALYZER, !add); + IndexWriter writer = new IndexWriter(indexDir, ANALYZER, !add); Thread indexerThread = new IndexerThread(writer); indexerThread.start(); @@ -178,7 +189,7 @@ class ThreadSafetyTest { SearcherThread searcherThread1 = new SearcherThread(false); searcherThread1.start(); - SEARCHER = new IndexSearcher("index"); + SEARCHER = new IndexSearcher(indexDir.toString()); SearcherThread searcherThread2 = new SearcherThread(true); searcherThread2.start(); @@ -231,9 +242,9 @@ class ThreadSafetyTest { } i = i%10; if (i == 0) - result.append(" "); + result.append(" "); else - result.append("-"); + result.append("-"); } switch (i) { case 19 : result.append("nineteen "); break; diff --git a/src/test/org/apache/lucene/index/DocTest.java b/src/test/org/apache/lucene/index/DocTest.java index 599a267edec..5aee4eaabe6 100644 --- a/src/test/org/apache/lucene/index/DocTest.java +++ b/src/test/org/apache/lucene/index/DocTest.java @@ -87,7 +87,8 @@ class DocTest { } catch (Exception e) { System.out.println(" caught a " + e.getClass() + - "\n with message: " + e.getMessage()); + "\n with message: " + e.getMessage()); + e.printStackTrace(); } } @@ -113,7 +114,7 @@ class DocTest { SegmentReader r1 = new SegmentReader(new SegmentInfo(seg1, 1, directory)); SegmentReader r2 = new SegmentReader(new SegmentInfo(seg2, 1, directory)); - SegmentMerger merger = new SegmentMerger(directory, merged); + SegmentMerger merger = new SegmentMerger(directory, merged, false); merger.add(r1); merger.add(r2); merger.merge(); @@ -137,17 +138,17 @@ class DocTest { TermPositions positions = reader.termPositions(tis.term()); try { - while (positions.next()) { - System.out.print(" doc=" + positions.doc()); - System.out.print(" TF=" + positions.freq()); - System.out.print(" pos="); - System.out.print(positions.nextPosition()); - for (int j = 1; j < positions.freq(); j++) - System.out.print("," + positions.nextPosition()); - System.out.println(""); - } + while (positions.next()) { + System.out.print(" doc=" + positions.doc()); + System.out.print(" TF=" + positions.freq()); + System.out.print(" pos="); + System.out.print(positions.nextPosition()); + for (int j = 1; j < positions.freq(); j++) + System.out.print("," + positions.nextPosition()); + System.out.println(""); + } } finally { - positions.close(); + positions.close(); } } tis.close(); diff --git a/src/test/org/apache/lucene/index/TestCompoundFile.java b/src/test/org/apache/lucene/index/TestCompoundFile.java new file mode 100644 index 00000000000..8a7cf958049 --- /dev/null +++ b/src/test/org/apache/lucene/index/TestCompoundFile.java @@ -0,0 +1,701 @@ +package org.apache.lucene.index; + +import java.io.IOException; + +import junit.framework.TestCase; +import junit.framework.TestSuite; +import junit.textui.TestRunner; + +import org.apache.lucene.store.*; +import org.apache.lucene.analysis.WhitespaceAnalyzer; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.IndexWriter; + + +/** + * @author dmitrys@earthlink.net + * @version $Id$ + */ +public class TestCompoundFile extends TestCase +{ + /** Main for running test case by itself. 
*/ + public static void main(String args[]) { + TestRunner.run (new TestSuite(TestCompoundFile.class)); +// TestRunner.run (new TestCompoundFile("testSingleFile")); +// TestRunner.run (new TestCompoundFile("testTwoFiles")); +// TestRunner.run (new TestCompoundFile("testRandomFiles")); +// TestRunner.run (new TestCompoundFile("testClonedStreamsClosing")); +// TestRunner.run (new TestCompoundFile("testReadAfterClose")); +// TestRunner.run (new TestCompoundFile("testRandomAccess")); +// TestRunner.run (new TestCompoundFile("testRandomAccessClones")); +// TestRunner.run (new TestCompoundFile("testFileNotFound")); +// TestRunner.run (new TestCompoundFile("testReadPastEOF")); + +// TestRunner.run (new TestCompoundFile("testIWCreate")); + + } + + + public TestCompoundFile() { + super(); + } + + public TestCompoundFile(String name) { + super(name); + } + + private Directory dir; + + + public void setUp() throws IOException { + //dir = new RAMDirectory(); + dir = FSDirectory.getDirectory("testIndex", true); + } + + + /** Creates a file of the specified size with random data. */ + private void createRandomFile(Directory dir, String name, int size) + throws IOException + { + OutputStream os = dir.createFile(name); + for (int i=0; i 0) { + int readLen = (int) Math.min(remainder, expectedBuffer.length); + expected.readBytes(expectedBuffer, 0, readLen); + test.readBytes(testBuffer, 0, readLen); + assertEqualArrays(msg + ", remainder " + remainder, expectedBuffer, + testBuffer, 0, readLen); + remainder -= readLen; + } + } + + + private void assertSameStreams(String msg, + InputStream expected, + InputStream actual, + long seekTo) + throws IOException + { + if (seekTo < 0) { + try { + actual.seek(seekTo); + fail(msg + ", " + seekTo + ", negative seek"); + } catch (IOException e) { + /* success */ + //System.out.println("SUCCESS: Negative seek: " + e); + } + + } else if (seekTo > 0 && seekTo >= expected.length()) { + try { + actual.seek(seekTo); + fail(msg + ", " + seekTo + ", seek past EOF"); + } catch (IOException e) { + /* success */ + //System.out.println("SUCCESS: Seek past EOF: " + e); + } + + } else { + expected.seek(seekTo); + actual.seek(seekTo); + assertSameStreams(msg + ", seek(mid)", expected, actual); + } + } + + + + private void assertSameSeekBehavior(String msg, + InputStream expected, + InputStream actual) + throws IOException + { + // seek to 0 + long point = 0; + assertSameStreams(msg + ", seek(0)", expected, actual, point); + + // seek to middle + point = expected.length() / 2l; + assertSameStreams(msg + ", seek(mid)", expected, actual, point); + + // seek to end - 2 + point = expected.length() - 2; + assertSameStreams(msg + ", seek(end-2)", expected, actual, point); + + // seek to end - 1 + point = expected.length() - 1; + assertSameStreams(msg + ", seek(end-1)", expected, actual, point); + + // seek to the end + point = expected.length(); + assertSameStreams(msg + ", seek(end)", expected, actual, point); + + // seek past end + point = expected.length() + 1; + assertSameStreams(msg + ", seek(end+1)", expected, actual, point); + } + + + private void assertEqualArrays(String msg, + byte[] expected, + byte[] test, + int start, + int len) + { + assertNotNull(msg + " null expected", expected); + assertNotNull(msg + " null test", test); + + for (int i=start; i. 
+ */ +import junit.framework.TestCase; +import junit.framework.TestSuite; +import junit.textui.TestRunner; + + +import org.apache.lucene.analysis.SimpleAnalyzer; +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.store.FSDirectory; +import org.apache.lucene.store.Directory; +import org.apache.lucene.document.Document; +import org.apache.lucene.search.Similarity; +import org.apache.lucene.demo.FileDocument; + +import java.io.*; +import java.util.*; + + +/** JUnit adaptation of an older test case DocTest. + * @author dmitrys@earthlink.net + * @version $Id$ + */ +public class TestDoc extends TestCase { + + /** Main for running test case by itself. */ + public static void main(String args[]) { + TestRunner.run (new TestSuite(TestDoc.class)); + } + + + private File workDir; + private File indexDir; + private LinkedList files; + + + /** Set the test case. This test case needs + * a few text files created in the current working directory. + */ + public void setUp() throws IOException { + workDir = new File("TestDoc"); + workDir.mkdirs(); + + indexDir = new File(workDir, "testIndex"); + indexDir.mkdirs(); + + Directory directory = FSDirectory.getDirectory(indexDir, true); + directory.close(); + + files = new LinkedList(); + files.add(createFile("test.txt", + "This is the first test file" + )); + + files.add(createFile("test2.txt", + "This is the second test file" + )); + } + + private File createFile(String name, String text) throws IOException { + FileWriter fw = null; + PrintWriter pw = null; + + try { + File f = new File(workDir, name); + if (f.exists()) f.delete(); + + fw = new FileWriter(f); + pw = new PrintWriter(fw); + pw.println(text); + return f; + + } finally { + if (pw != null) pw.close(); + if (fw != null) fw.close(); + } + } + + + /** This test executes a number of merges and compares the contents of + * the segments created when using compound file or not using one. + * + * TODO: the original test used to print the segment contents to System.out + * for visual validation. To have the same effect, a new method + * checkSegment(String name, ...) should be created that would + * assert various things about the segment. 
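+ * A sketch of that method (signature and assertions hypothetical):
+ * <pre>
+ *   private void checkSegment(String name, int docCount) throws Exception {
+ *       Directory dir = FSDirectory.getDirectory(indexDir, false);
+ *       SegmentReader reader =
+ *           new SegmentReader(new SegmentInfo(name, docCount, dir));
+ *       assertEquals(docCount, reader.numDocs());
+ *       reader.close();
+ *       dir.close();
+ *   }
+ * </pre>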
+ */ + public void testIndexAndMerge() throws Exception { + StringWriter sw = new StringWriter(); + PrintWriter out = new PrintWriter(sw, true); + + Directory directory = FSDirectory.getDirectory(indexDir, true); + directory.close(); + + indexDoc("one", "test.txt"); + printSegment(out, "one"); + + indexDoc("two", "test2.txt"); + printSegment(out, "two"); + + merge("one", "two", "merge", false); + printSegment(out, "merge"); + + merge("one", "two", "merge2", false); + printSegment(out, "merge2"); + + merge("merge", "merge2", "merge3", false); + printSegment(out, "merge3"); + + out.close(); + sw.close(); + String multiFileOutput = sw.getBuffer().toString(); + System.out.println(multiFileOutput); + + sw = new StringWriter(); + out = new PrintWriter(sw, true); + + directory = FSDirectory.getDirectory(indexDir, true); + directory.close(); + + indexDoc("one", "test.txt"); + printSegment(out, "one"); + + indexDoc("two", "test2.txt"); + printSegment(out, "two"); + + merge("one", "two", "merge", true); + printSegment(out, "merge"); + + merge("one", "two", "merge2", true); + printSegment(out, "merge2"); + + merge("merge", "merge2", "merge3", true); + printSegment(out, "merge3"); + + out.close(); + sw.close(); + String singleFileOutput = sw.getBuffer().toString(); + + assertEquals(multiFileOutput, singleFileOutput); + } + + + private void indexDoc(String segment, String fileName) + throws Exception + { + Directory directory = FSDirectory.getDirectory(indexDir, false); + Analyzer analyzer = new SimpleAnalyzer(); + DocumentWriter writer = + new DocumentWriter(directory, analyzer, Similarity.getDefault(), 1000); + + File file = new File(workDir, fileName); + Document doc = FileDocument.Document(file); + + writer.addDocument(segment, doc); + + directory.close(); + } + + + private void merge(String seg1, String seg2, String merged, boolean useCompoundFile) + throws Exception { + Directory directory = FSDirectory.getDirectory(indexDir, false); + + SegmentReader r1 = new SegmentReader(new SegmentInfo(seg1, 1, directory)); + SegmentReader r2 = new SegmentReader(new SegmentInfo(seg2, 1, directory)); + + SegmentMerger merger = + new SegmentMerger(directory, merged, useCompoundFile); + + merger.add(r1); + merger.add(r2); + merger.merge(); + + directory.close(); + } + + + private void printSegment(PrintWriter out, String segment) + throws Exception { + Directory directory = FSDirectory.getDirectory(indexDir, false); + SegmentReader reader = + new SegmentReader(new SegmentInfo(segment, 1, directory)); + + for (int i = 0; i < reader.numDocs(); i++) + out.println(reader.document(i)); + + TermEnum tis = reader.terms(); + while (tis.next()) { + out.print(tis.term()); + out.println(" DF=" + tis.docFreq()); + + TermPositions positions = reader.termPositions(tis.term()); + try { + while (positions.next()) { + out.print(" doc=" + positions.doc()); + out.print(" TF=" + positions.freq()); + out.print(" pos="); + out.print(positions.nextPosition()); + for (int j = 1; j < positions.freq(); j++) + out.print("," + positions.nextPosition()); + out.println(""); + } + } finally { + positions.close(); + } + } + tis.close(); + reader.close(); + directory.close(); + } +} diff --git a/src/test/org/apache/lucene/store/_TestHelper.java b/src/test/org/apache/lucene/store/_TestHelper.java new file mode 100644 index 00000000000..0cd0d9cbae3 --- /dev/null +++ b/src/test/org/apache/lucene/store/_TestHelper.java @@ -0,0 +1,47 @@ +package org.apache.lucene.store; +import java.io.RandomAccessFile; +import java.io.IOException; + +/** This class 
provides access to package-level features defined in the + * store package. It is used for testing only. + */ + +public class _TestHelper { + + /** Returns true if the instance of the provided input stream is actually + * an FSInputStream. + */ + public static boolean isFSInputStream(InputStream is) { + return is instanceof FSInputStream; + } + + /** Returns true if the provided input stream is an FSInputStream and + * is a clone, that is it does not own its underlying file descriptor. + */ + public static boolean isFSInputStreamClone(InputStream is) { + if (isFSInputStream(is)) { + return ((FSInputStream) is).isClone; + } else { + return false; + } + } + + /** Given an instance of FSDirectory.FSInputStream, this method returns + * true if the underlying file descriptor is valid, and false otherwise. + * This can be used to determine if the OS file has been closed. + * The descriptor becomes invalid when the non-clone instance of the + * FSInputStream that owns this descriptor is closed. However, the + * descriptor may possibly become invalid in other ways as well. + */ + public static boolean isFSInputStreamOpen(InputStream is) + throws IOException + { + if (isFSInputStream(is)) { + FSInputStream fis = (FSInputStream) is; + return fis.isFDValid(); + } else { + return false; + } + } + +} \ No newline at end of file
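
Usage sketch (illustrative, not part of the patch; it uses only APIs that
appear above): turning the new setting on makes each merged segment come out
as a single ".cfs" file, the way the new tests exercise it:

    import org.apache.lucene.analysis.SimpleAnalyzer;
    import org.apache.lucene.document.Document;
    import org.apache.lucene.document.Field;
    import org.apache.lucene.index.IndexWriter;
    import org.apache.lucene.store.Directory;
    import org.apache.lucene.store.RAMDirectory;

    public class CompoundFileExample {
        public static void main(String[] args) throws Exception {
            Directory dir = new RAMDirectory();
            IndexWriter writer = new IndexWriter(dir, new SimpleAnalyzer(), true);
            writer.setUseCompoundFile(true);   // new setting from this patch

            Document d = new Document();
            d.add(Field.Text("contents", "a b c d e"));
            writer.addDocument(d);

            // optimize() merges segments; with useCompoundFile on, a segment
            // not yet in compound format is rewritten as one .cfs file
            writer.optimize();
            writer.close();
        }
    }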