mirror of https://github.com/apache/lucene.git

LUCENE-843: speed up IndexWriter performance

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@553236 13f79535-47bb-0310-9956-ffa450edef68

This commit is contained in:
parent 4fe90e4086
commit cff5767e44

CHANGES.txt (+13)
@@ -105,6 +105,13 @@ API Changes
    to be public because it implements the public interface TermPositionVector.
    (Michael Busch)

14. LUCENE-843: Added IndexWriter.setRAMBufferSizeMB(...) to have
    IndexWriter flush whenever the buffered documents are using more
    than the specified amount of RAM. Also added new APIs to Token
    that allow one to set a char[] plus offset and length to specify a
    token (to avoid creating a new String() for each Token). (Mike
    McCandless)

Bug fixes

 1. LUCENE-804: Fixed build.xml to pack a fully compilable src dist. (Doron Cohen)

@@ -268,6 +275,12 @@ Optimizations
    contain very frequent and very unique terms the speedup can be over 80%.
    (Michael Busch)

 8. LUCENE-843: Substantial optimizations to improve how IndexWriter
    uses RAM for buffering documents and to speed up indexing (2X-8X
    faster). A single shared hash table now records the in-memory
    postings per unique term and is directly flushed into a single
    segment. (Mike McCandless)

Documentation

 1. LUCENE 791 && INFRA-1173: Infrastructure moved the Wiki to
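A minimal sketch of the two additions named in this entry, using only signatures that appear later in this commit (the directory, analyzer, and sample text are placeholders, not part of the change):

    IndexWriter writer = new IndexWriter(dir, analyzer, true);
    writer.setRAMBufferSizeMB(32.0);            // flush when buffered docs exceed 32 MB of RAM

    char[] buf = "quick brown fox".toCharArray();
    Token tok = new Token(buf, 6, 5, 6, 11);    // term "brown" as a char[] slice, offsets 6-11
    tok.setTermBuffer(buf, 12, 3);              // re-point the same Token at "fox"; no new String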
@@ -77,6 +77,7 @@ public class TestParser extends TestCase {
                line=d.readLine();
            }
            d.close();
            writer.close();
        }
        reader=IndexReader.open(dir);
        searcher=new IndexSearcher(reader);
@@ -380,10 +380,18 @@ document.write("Last Published: " + document.lastModified);
            But note that once a commit has occurred, pre-2.1
            Lucene will not be able to read the index.
        </p>
        <p>
            In version 2.3, the file format was changed to allow
            segments to share a single set of doc store (vectors &
            stored fields) files. This allows for faster indexing
            in certain cases. The change is fully backwards
            compatible (in the same way as the lock-less commits
            change in 2.1).
        </p>
</div>

<a name="N10032"></a><a name="Definitions"></a>
<a name="N10035"></a><a name="Definitions"></a>
<h2 class="boxed">Definitions</h2>
<div class="section">
<p>
@ -424,7 +432,7 @@ document.write("Last Published: " + document.lastModified);
|
|||
strings, the first naming the field, and the second naming text
|
||||
within the field.
|
||||
</p>
|
||||
<a name="N10052"></a><a name="Inverted Indexing"></a>
|
||||
<a name="N10055"></a><a name="Inverted Indexing"></a>
|
||||
<h3 class="boxed">Inverted Indexing</h3>
|
||||
<p>
|
||||
The index stores statistics about terms in order
|
||||
|
@ -434,7 +442,7 @@ document.write("Last Published: " + document.lastModified);
|
|||
it. This is the inverse of the natural relationship, in which
|
||||
documents list terms.
|
||||
</p>
|
||||
<a name="N1005E"></a><a name="Types of Fields"></a>
|
||||
<a name="N10061"></a><a name="Types of Fields"></a>
|
||||
<h3 class="boxed">Types of Fields</h3>
|
||||
<p>
|
||||
In Lucene, fields may be <i>stored</i>, in which
|
||||
|
@ -448,7 +456,7 @@ document.write("Last Published: " + document.lastModified);
|
|||
to be indexed literally.
|
||||
</p>
|
||||
<p>See the <a href="http://lucene.apache.org/java/docs/api/org/apache/lucene/document/Field.html">Field</a> java docs for more information on Fields.</p>
|
||||
<a name="N1007B"></a><a name="Segments"></a>
|
||||
<a name="N1007E"></a><a name="Segments"></a>
|
||||
<h3 class="boxed">Segments</h3>
|
||||
<p>
|
||||
Lucene indexes may be composed of multiple sub-indexes, or
|
||||
|
@ -474,7 +482,7 @@ document.write("Last Published: " + document.lastModified);
|
|||
Searches may involve multiple segments and/or multiple indexes, each
|
||||
index potentially composed of a set of segments.
|
||||
</p>
|
||||
<a name="N10099"></a><a name="Document Numbers"></a>
|
||||
<a name="N1009C"></a><a name="Document Numbers"></a>
|
||||
<h3 class="boxed">Document Numbers</h3>
|
||||
<p>
|
||||
Internally, Lucene refers to documents by an integer <i>document
|
||||
|
@ -529,7 +537,7 @@ document.write("Last Published: " + document.lastModified);
|
|||
</div>
|
||||
|
||||
|
||||
<a name="N100C0"></a><a name="Overview"></a>
|
||||
<a name="N100C3"></a><a name="Overview"></a>
|
||||
<h2 class="boxed">Overview</h2>
|
||||
<div class="section">
|
||||
<p>
|
||||
|
@ -626,7 +634,7 @@ document.write("Last Published: " + document.lastModified);
|
|||
</div>
|
||||
|
||||
|
||||
<a name="N10103"></a><a name="File Naming"></a>
|
||||
<a name="N10106"></a><a name="File Naming"></a>
|
||||
<h2 class="boxed">File Naming</h2>
|
||||
<div class="section">
|
||||
<p>
|
||||
|
@ -654,10 +662,10 @@ document.write("Last Published: " + document.lastModified);
|
|||
</div>
|
||||
|
||||
|
||||
<a name="N10112"></a><a name="Primitive Types"></a>
|
||||
<a name="N10115"></a><a name="Primitive Types"></a>
|
||||
<h2 class="boxed">Primitive Types</h2>
|
||||
<div class="section">
|
||||
<a name="N10117"></a><a name="Byte"></a>
|
||||
<a name="N1011A"></a><a name="Byte"></a>
|
||||
<h3 class="boxed">Byte</h3>
|
||||
<p>
|
||||
The most primitive type
|
||||
|
@ -665,7 +673,7 @@ document.write("Last Published: " + document.lastModified);
|
|||
other data types are defined as sequences
|
||||
of bytes, so file formats are byte-order independent.
|
||||
</p>
|
||||
<a name="N10120"></a><a name="UInt32"></a>
|
||||
<a name="N10123"></a><a name="UInt32"></a>
|
||||
<h3 class="boxed">UInt32</h3>
|
||||
<p>
|
||||
32-bit unsigned integers are written as four
|
||||
|
@ -675,7 +683,7 @@ document.write("Last Published: " + document.lastModified);
|
|||
UInt32 --> <Byte><sup>4</sup>
|
||||
|
||||
</p>
|
||||
<a name="N1012F"></a><a name="Uint64"></a>
|
||||
<a name="N10132"></a><a name="Uint64"></a>
|
||||
<h3 class="boxed">Uint64</h3>
|
||||
<p>
|
||||
64-bit unsigned integers are written as eight
|
||||
|
@ -684,7 +692,7 @@ document.write("Last Published: " + document.lastModified);
|
|||
<p>UInt64 --> <Byte><sup>8</sup>
|
||||
|
||||
</p>
|
||||
<a name="N1013E"></a><a name="VInt"></a>
|
||||
<a name="N10141"></a><a name="VInt"></a>
|
||||
<h3 class="boxed">VInt</h3>
|
||||
<p>
|
||||
A variable-length format for positive integers is
|
||||
|
@ -1234,7 +1242,7 @@ document.write("Last Published: " + document.lastModified);
|
|||
This provides compression while still being
|
||||
efficient to decode.
|
||||
</p>
|
||||
<a name="N10423"></a><a name="Chars"></a>
|
||||
<a name="N10426"></a><a name="Chars"></a>
|
||||
<h3 class="boxed">Chars</h3>
|
||||
<p>
|
||||
Lucene writes unicode
|
||||
|
@ -1243,7 +1251,7 @@ document.write("Last Published: " + document.lastModified);
|
|||
UTF-8 encoding"</a>
|
||||
.
|
||||
</p>
|
||||
<a name="N10430"></a><a name="String"></a>
|
||||
<a name="N10433"></a><a name="String"></a>
|
||||
<h3 class="boxed">String</h3>
|
||||
<p>
|
||||
Lucene writes strings as a VInt representing the length, followed by
|
||||
|
@ -1255,13 +1263,13 @@ document.write("Last Published: " + document.lastModified);
|
|||
</div>
|
||||
|
||||
|
||||
<a name="N1043D"></a><a name="Per-Index Files"></a>
|
||||
<a name="N10440"></a><a name="Per-Index Files"></a>
|
||||
<h2 class="boxed">Per-Index Files</h2>
|
||||
<div class="section">
|
||||
<p>
|
||||
The files in this section exist one-per-index.
|
||||
</p>
|
||||
<a name="N10445"></a><a name="Segments File"></a>
|
||||
<a name="N10448"></a><a name="Segments File"></a>
|
||||
<h3 class="boxed">Segments File</h3>
|
||||
<p>
|
||||
The active segments in the index are stored in the
|
||||
|
@@ -1316,16 +1324,24 @@ document.write("Last Published: " + document.lastModified);

        </p>
        <p>
            Format, NameCounter, SegCount, SegSize, NumField --> Int32

            <b>2.3 and above:</b>
            Segments --> Format, Version, NameCounter, SegCount, <SegName, SegSize, DelGen, DocStoreOffset, [DocStoreSegment, DocStoreIsCompoundFile], HasSingleNormFile, NumField,
            NormGen<sup>NumField</sup>,
            IsCompoundFile><sup>SegCount</sup>

        </p>
        <p>
            Format, NameCounter, SegCount, SegSize, NumField, DocStoreOffset --> Int32
        </p>
        <p>
            Version, DelGen, NormGen --> Int64
        </p>
        <p>
            SegName --> String
            SegName, DocStoreSegment --> String
        </p>
        <p>
            IsCompoundFile, HasSingleNormFile --> Int8
            IsCompoundFile, HasSingleNormFile, DocStoreIsCompoundFile --> Int8
        </p>
        <p>
            Format is -1 as of Lucene 1.4 and -3 (SegmentInfos.FORMAT_SINGLE_NORM_FILE) as of Lucene 2.1.
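Read mechanically, the 2.3 layout above pairs each name with one of the primitive types defined earlier (Int32, Int64, String, Int8). The following is a rough, hypothetical sketch of a reader for just this layout, written against IndexInput's readInt/readLong/readString/readByte primitives; it is not the actual SegmentInfos code and ignores the pre-2.3 formats:

    import java.io.IOException;
    import org.apache.lucene.store.IndexInput;

    class SegmentsFormatSketch {
      // Hypothetical reader for the "2.3 and above" segments_N layout shown above.
      static void read(IndexInput in) throws IOException {
        int format = in.readInt();                       // Format
        long version = in.readLong();                    // Version
        int nameCounter = in.readInt();                  // NameCounter
        int segCount = in.readInt();                     // SegCount
        for (int i = 0; i < segCount; i++) {
          String segName = in.readString();              // SegName
          int segSize = in.readInt();                    // SegSize
          long delGen = in.readLong();                   // DelGen
          int docStoreOffset = in.readInt();             // DocStoreOffset
          if (docStoreOffset != -1) {                    // optional shared doc store fields
            String docStoreSegment = in.readString();    // DocStoreSegment
            byte docStoreIsCompoundFile = in.readByte(); // DocStoreIsCompoundFile
          }
          byte hasSingleNormFile = in.readByte();        // HasSingleNormFile
          int numField = in.readInt();                   // NumField
          for (int f = 0; f < numField; f++)
            in.readLong();                               // NormGen[f]
          byte isCompoundFile = in.readByte();           // IsCompoundFile
        }
      }
    }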
@@ -1380,7 +1396,28 @@ document.write("Last Published: " + document.lastModified);
            are stored as separate <tt>.fN</tt> files. See
            "Normalization Factors" below for details.
        </p>
<a name="N104A9"></a><a name="Lock File"></a>
        <p>
            DocStoreOffset, DocStoreSegment,
            DocStoreIsCompoundFile: If DocStoreOffset is -1,
            this segment has its own doc store (stored fields
            values and term vectors) files and DocStoreSegment
            and DocStoreIsCompoundFile are not stored. In
            this case all files for stored field values
            (<tt>*.fdt</tt> and <tt>*.fdx</tt>) and term
            vectors (<tt>*.tvf</tt>, <tt>*.tvd</tt> and
            <tt>*.tvx</tt>) will be stored with this segment.
            Otherwise, DocStoreSegment is the name of the
            segment that has the shared doc store files;
            DocStoreIsCompoundFile is 1 if that segment is
            stored in compound file format (as a <tt>.cfx</tt>
            file); and DocStoreOffset is the starting document
            in the shared doc store files where this segment's
            documents begin. In this case, this segment does
            not store its own doc store files but instead
            shares a single set of these files with other
            segments.
        </p>
<a name="N104CD"></a><a name="Lock File"></a>
        <h3 class="boxed">Lock File</h3>
        <p>
            The write lock, which is stored in the index
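As a concrete reading of this rule, a hypothetical helper (not a Lucene API; the suffixes come from the text above) could resolve which file actually holds a segment's stored fields:

    // Hypothetical helper illustrating the DocStoreOffset rule described above.
    static String storedFieldsFileFor(String segmentName, int docStoreOffset,
                                      String docStoreSegment, boolean docStoreIsCompoundFile) {
      if (docStoreOffset == -1)
        return segmentName + ".fdt";       // segment has its own private doc store files
      if (docStoreIsCompoundFile)
        return docStoreSegment + ".cfx";   // shared doc store packed into a compound file
      return docStoreSegment + ".fdt";     // shared doc store kept as plain files
    }

Inside the shared files, the segment's documents then begin at DocStoreOffset, which is why FieldsReader (later in this commit) seeks to (n + docStoreOffset) * 8 in the <tt>.fdx</tt> index.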
@ -1398,7 +1435,7 @@ document.write("Last Published: " + document.lastModified);
|
|||
Note that prior to version 2.1, Lucene also used a
|
||||
commit lock. This was removed in 2.1.
|
||||
</p>
|
||||
<a name="N104B5"></a><a name="Deletable File"></a>
|
||||
<a name="N104D9"></a><a name="Deletable File"></a>
|
||||
<h3 class="boxed">Deletable File</h3>
|
||||
<p>
|
||||
Prior to Lucene 2.1 there was a file "deletable"
|
||||
|
@ -1407,7 +1444,7 @@ document.write("Last Published: " + document.lastModified);
|
|||
the files that are deletable, instead, so no file
|
||||
is written.
|
||||
</p>
|
||||
<a name="N104BE"></a><a name="Compound Files"></a>
|
||||
<a name="N104E2"></a><a name="Compound Files"></a>
|
||||
<h3 class="boxed">Compound Files</h3>
|
||||
<p>Starting with Lucene 1.4 the compound file format became default. This
|
||||
is simply a container for all files described in the next section
|
||||
|
@@ -1424,17 +1461,24 @@ document.write("Last Published: " + document.lastModified);
        <p>FileName --> String</p>
        <p>FileData --> raw file data</p>
        <p>The raw file data is the data from the individual files named above.</p>
        <p>Starting with Lucene 2.3, doc store files (stored
            field values and term vectors) can be shared in a
            single set of files for more than one segment. When
            compound file is enabled, these shared files will be
            added into a single compound file (same format as
            above) but with the extension <tt>.cfx</tt>.
        </p>
</div>


<a name="N104E0"></a><a name="Per-Segment Files"></a>
<a name="N1050A"></a><a name="Per-Segment Files"></a>
<h2 class="boxed">Per-Segment Files</h2>
<div class="section">
        <p>
            The remaining files are all per-segment, and are
            thus defined by suffix.
        </p>
<a name="N104E8"></a><a name="Fields"></a>
<a name="N10512"></a><a name="Fields"></a>
<h3 class="boxed">Fields</h3>
        <p>

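For orientation, the files that can be bundled into such a <tt>.cfx</tt> container are exactly the doc store files named above; the IndexFileNames changes later in this commit group them as follows (reproduced here as an illustrative constant, not a new API):

    // Doc store ("store") extensions per the IndexFileNames diff in this commit:
    // these are the files a shared .cfx compound file may contain.
    static final String[] DOC_STORE_EXTENSIONS = {
      "fdx", "fdt",         // stored fields index and data
      "tvx", "tvd", "tvf"   // term vectors index, documents and fields
    };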
@ -1653,7 +1697,7 @@ document.write("Last Published: " + document.lastModified);
|
|||
</li>
|
||||
|
||||
</ol>
|
||||
<a name="N105A3"></a><a name="Term Dictionary"></a>
|
||||
<a name="N105CD"></a><a name="Term Dictionary"></a>
|
||||
<h3 class="boxed">Term Dictionary</h3>
|
||||
<p>
|
||||
The term dictionary is represented as two files:
|
||||
|
@ -1839,7 +1883,7 @@ document.write("Last Published: " + document.lastModified);
|
|||
</li>
|
||||
|
||||
</ol>
|
||||
<a name="N10623"></a><a name="Frequencies"></a>
|
||||
<a name="N1064D"></a><a name="Frequencies"></a>
|
||||
<h3 class="boxed">Frequencies</h3>
|
||||
<p>
|
||||
The .frq file contains the lists of documents
|
||||
|
@ -1957,7 +2001,7 @@ document.write("Last Published: " + document.lastModified);
|
|||
entry in level-1. In the example has entry 15 on level 1 a pointer to entry 15 on level 0 and entry 31 on level 1 a pointer
|
||||
to entry 31 on level 0.
|
||||
</p>
|
||||
<a name="N106A5"></a><a name="Positions"></a>
|
||||
<a name="N106CF"></a><a name="Positions"></a>
|
||||
<h3 class="boxed">Positions</h3>
|
||||
<p>
|
||||
The .prx file contains the lists of positions that
|
||||
|
@ -2023,7 +2067,7 @@ document.write("Last Published: " + document.lastModified);
|
|||
Payload. If PayloadLength is not stored, then this Payload has the same
|
||||
length as the Payload at the previous position.
|
||||
</p>
|
||||
<a name="N106E1"></a><a name="Normalization Factors"></a>
|
||||
<a name="N1070B"></a><a name="Normalization Factors"></a>
|
||||
<h3 class="boxed">Normalization Factors</h3>
|
||||
<p>
|
||||
|
||||
|
@ -2127,7 +2171,7 @@ document.write("Last Published: " + document.lastModified);
|
|||
<b>2.1 and above:</b>
|
||||
Separate norm files are created (when adequate) for both compound and non compound segments.
|
||||
</p>
|
||||
<a name="N1074A"></a><a name="Term Vectors"></a>
|
||||
<a name="N10774"></a><a name="Term Vectors"></a>
|
||||
<h3 class="boxed">Term Vectors</h3>
|
||||
<ol>
|
||||
|
||||
|
@ -2253,7 +2297,7 @@ document.write("Last Published: " + document.lastModified);
|
|||
</li>
|
||||
|
||||
</ol>
|
||||
<a name="N107DD"></a><a name="Deleted Documents"></a>
|
||||
<a name="N10807"></a><a name="Deleted Documents"></a>
|
||||
<h3 class="boxed">Deleted Documents</h3>
|
||||
<p>The .del file is
|
||||
optional, and only exists when a segment contains deletions.
|
||||
|
@ -2325,7 +2369,7 @@ document.write("Last Published: " + document.lastModified);
|
|||
</div>
|
||||
|
||||
|
||||
<a name="N10820"></a><a name="Limitations"></a>
|
||||
<a name="N1084A"></a><a name="Limitations"></a>
|
||||
<h2 class="boxed">Limitations</h2>
|
||||
<div class="section">
|
||||
<p>There
|
||||
|
|
File diff suppressed because it is too large
@@ -55,6 +55,13 @@ public class Token implements Cloneable {

  Payload payload;

  // For better indexing speed, use termBuffer (and
  // termBufferOffset/termBufferLength) instead of termText
  // to save new'ing a String per token
  char[] termBuffer;
  int termBufferOffset;
  int termBufferLength;

  private int positionIncrement = 1;

  /** Constructs a Token with the given term text, and start & end offsets.

@@ -65,6 +72,17 @@
    endOffset = end;
  }

  /** Constructs a Token with the given term text buffer
   * starting at offset for length characters, and start & end offsets.
   * The type defaults to "word." */
  public Token(char[] text, int offset, int length, int start, int end) {
    termBuffer = text;
    termBufferOffset = offset;
    termBufferLength = length;
    startOffset = start;
    endOffset = end;
  }

  /** Constructs a Token with the given text, start and end offsets, & type. */
  public Token(String text, int start, int end, String typ) {
    termText = text;

@@ -73,6 +91,19 @@
    type = typ;
  }

  /** Constructs a Token with the given term text buffer
   * starting at offset for length characters, and start & end
   * offsets, & type. */
  public Token(char[] text, int offset, int length, int start, int end, String typ) {
    termBuffer = text;
    termBufferOffset = offset;
    termBufferLength = length;
    startOffset = start;
    endOffset = end;
    type = typ;
  }


  /** Set the position increment. This determines the position of this token
   * relative to the previous Token in a {@link TokenStream}, used in phrase
   * searching.

@@ -117,6 +148,19 @@

  /** Returns the Token's term text. */
  public final String termText() { return termText; }
  public final char[] termBuffer() { return termBuffer; }
  public final int termBufferOffset() { return termBufferOffset; }
  public final int termBufferLength() { return termBufferLength; }

  public void setStartOffset(int offset) {this.startOffset = offset;}
  public void setEndOffset(int offset) {this.endOffset = offset;}

  public final void setTermBuffer(char[] buffer, int offset, int length) {
    this.termBuffer = buffer;
    this.termBufferOffset = offset;
    this.termBufferLength = length;
  }


  /** Returns this Token's starting offset, the position of the first character
    corresponding to this token in the source text.
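These accessors let an analysis chain hand the indexer a slice of a shared char[] instead of allocating a String per token. A minimal, hypothetical whitespace splitter illustrating the reuse pattern (raw collections to match the Java 1.4-era style of this code; not part of the commit):

    import java.util.ArrayList;
    import java.util.List;
    import org.apache.lucene.analysis.Token;

    class BufferSlicingExample {
      // Every Token points into the single shared buffer; no per-token String is created.
      static List tokenize(String text) {
        char[] buf = text.toCharArray();
        List tokens = new ArrayList();
        int pos = 0;
        while (pos < buf.length) {
          while (pos < buf.length && buf[pos] == ' ') pos++;   // skip separators
          int start = pos;
          while (pos < buf.length && buf[pos] != ' ') pos++;
          if (pos > start)
            tokens.add(new Token(buf, start, pos - start, start, pos));
        }
        return tokens;
      }
    }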
File diff suppressed because it is too large
@ -43,4 +43,9 @@ final class FieldInfo {
|
|||
this.omitNorms = omitNorms;
|
||||
this.storePayloads = storePayloads;
|
||||
}
|
||||
|
||||
public Object clone() {
|
||||
return new FieldInfo(name, isIndexed, number, storeTermVector, storePositionWithTermVector,
|
||||
storeOffsetWithTermVector, omitNorms, storePayloads);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -62,6 +62,20 @@ final class FieldInfos {
|
|||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a deep clone of this FieldInfos instance.
|
||||
*/
|
||||
public Object clone() {
|
||||
FieldInfos fis = new FieldInfos();
|
||||
final int numField = byNumber.size();
|
||||
for(int i=0;i<numField;i++) {
|
||||
FieldInfo fi = (FieldInfo) ((FieldInfo) byNumber.get(i)).clone();
|
||||
fis.byNumber.add(fi);
|
||||
fis.byName.put(fi.name, fi);
|
||||
}
|
||||
return fis;
|
||||
}
|
||||
|
||||
/** Adds field info for a Document. */
|
||||
public void add(Document doc) {
|
||||
List fields = doc.getFields();
|
||||
|
|
|
@ -51,19 +51,39 @@ final class FieldsReader {
|
|||
private int size;
|
||||
private boolean closed;
|
||||
|
||||
// The docID offset where our docs begin in the index
|
||||
// file. This will be 0 if we have our own private file.
|
||||
private int docStoreOffset;
|
||||
|
||||
private ThreadLocal fieldsStreamTL = new ThreadLocal();
|
||||
|
||||
FieldsReader(Directory d, String segment, FieldInfos fn) throws IOException {
|
||||
this(d, segment, fn, BufferedIndexInput.BUFFER_SIZE);
|
||||
this(d, segment, fn, BufferedIndexInput.BUFFER_SIZE, -1, 0);
|
||||
}
|
||||
|
||||
FieldsReader(Directory d, String segment, FieldInfos fn, int readBufferSize) throws IOException {
|
||||
this(d, segment, fn, readBufferSize, -1, 0);
|
||||
}
|
||||
|
||||
FieldsReader(Directory d, String segment, FieldInfos fn, int readBufferSize, int docStoreOffset, int size) throws IOException {
|
||||
fieldInfos = fn;
|
||||
|
||||
cloneableFieldsStream = d.openInput(segment + ".fdt", readBufferSize);
|
||||
fieldsStream = (IndexInput)cloneableFieldsStream.clone();
|
||||
indexStream = d.openInput(segment + ".fdx", readBufferSize);
|
||||
size = (int) (indexStream.length() / 8);
|
||||
|
||||
if (docStoreOffset != -1) {
|
||||
// We read only a slice out of this shared fields file
|
||||
this.docStoreOffset = docStoreOffset;
|
||||
this.size = size;
|
||||
|
||||
// Verify the file is long enough to hold all of our
|
||||
// docs
|
||||
assert ((int) (indexStream.length()/8)) >= size + this.docStoreOffset;
|
||||
} else {
|
||||
this.docStoreOffset = 0;
|
||||
this.size = (int) (indexStream.length() / 8);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -100,7 +120,7 @@ final class FieldsReader {
|
|||
}
|
||||
|
||||
final Document doc(int n, FieldSelector fieldSelector) throws CorruptIndexException, IOException {
|
||||
indexStream.seek(n * 8L);
|
||||
indexStream.seek((n + docStoreOffset) * 8L);
|
||||
long position = indexStream.readLong();
|
||||
fieldsStream.seek(position);
|
||||
|
||||
|
|
|
@ -24,6 +24,7 @@ import java.util.zip.Deflater;
|
|||
import org.apache.lucene.document.Document;
|
||||
import org.apache.lucene.document.Fieldable;
|
||||
import org.apache.lucene.store.Directory;
|
||||
import org.apache.lucene.store.RAMOutputStream;
|
||||
import org.apache.lucene.store.IndexOutput;
|
||||
|
||||
final class FieldsWriter
|
||||
|
@ -38,15 +39,92 @@ final class FieldsWriter
|
|||
|
||||
private IndexOutput indexStream;
|
||||
|
||||
private boolean doClose;
|
||||
|
||||
FieldsWriter(Directory d, String segment, FieldInfos fn) throws IOException {
|
||||
fieldInfos = fn;
|
||||
fieldsStream = d.createOutput(segment + ".fdt");
|
||||
indexStream = d.createOutput(segment + ".fdx");
|
||||
doClose = true;
|
||||
}
|
||||
|
||||
FieldsWriter(IndexOutput fdx, IndexOutput fdt, FieldInfos fn) throws IOException {
|
||||
fieldInfos = fn;
|
||||
fieldsStream = fdt;
|
||||
indexStream = fdx;
|
||||
doClose = false;
|
||||
}
|
||||
|
||||
// Writes the contents of buffer into the fields stream
|
||||
// and adds a new entry for this document into the index
|
||||
// stream. This assumes the buffer was already written
|
||||
// in the correct fields format.
|
||||
void flushDocument(RAMOutputStream buffer) throws IOException {
|
||||
indexStream.writeLong(fieldsStream.getFilePointer());
|
||||
buffer.writeTo(fieldsStream);
|
||||
}
|
||||
|
||||
void flush() throws IOException {
|
||||
indexStream.flush();
|
||||
fieldsStream.flush();
|
||||
}
|
||||
|
||||
final void close() throws IOException {
|
||||
if (doClose) {
|
||||
fieldsStream.close();
|
||||
indexStream.close();
|
||||
}
|
||||
}
|
||||
|
||||
final void writeField(FieldInfo fi, Fieldable field) throws IOException {
|
||||
// if the field as an instanceof FieldsReader.FieldForMerge, we're in merge mode
|
||||
// and field.binaryValue() already returns the compressed value for a field
|
||||
// with isCompressed()==true, so we disable compression in that case
|
||||
boolean disableCompression = (field instanceof FieldsReader.FieldForMerge);
|
||||
fieldsStream.writeVInt(fi.number);
|
||||
byte bits = 0;
|
||||
if (field.isTokenized())
|
||||
bits |= FieldsWriter.FIELD_IS_TOKENIZED;
|
||||
if (field.isBinary())
|
||||
bits |= FieldsWriter.FIELD_IS_BINARY;
|
||||
if (field.isCompressed())
|
||||
bits |= FieldsWriter.FIELD_IS_COMPRESSED;
|
||||
|
||||
fieldsStream.writeByte(bits);
|
||||
|
||||
if (field.isCompressed()) {
|
||||
// compression is enabled for the current field
|
||||
byte[] data = null;
|
||||
|
||||
if (disableCompression) {
|
||||
// optimized case for merging, the data
|
||||
// is already compressed
|
||||
data = field.binaryValue();
|
||||
} else {
|
||||
// check if it is a binary field
|
||||
if (field.isBinary()) {
|
||||
data = compress(field.binaryValue());
|
||||
}
|
||||
else {
|
||||
data = compress(field.stringValue().getBytes("UTF-8"));
|
||||
}
|
||||
}
|
||||
final int len = data.length;
|
||||
fieldsStream.writeVInt(len);
|
||||
fieldsStream.writeBytes(data, len);
|
||||
}
|
||||
else {
|
||||
// compression is disabled for the current field
|
||||
if (field.isBinary()) {
|
||||
byte[] data = field.binaryValue();
|
||||
final int len = data.length;
|
||||
fieldsStream.writeVInt(len);
|
||||
fieldsStream.writeBytes(data, len);
|
||||
}
|
||||
else {
|
||||
fieldsStream.writeString(field.stringValue());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
final void addDocument(Document doc) throws IOException {
|
||||
|
@ -64,57 +142,8 @@ final class FieldsWriter
|
|||
fieldIterator = doc.getFields().iterator();
|
||||
while (fieldIterator.hasNext()) {
|
||||
Fieldable field = (Fieldable) fieldIterator.next();
|
||||
// if the field as an instanceof FieldsReader.FieldForMerge, we're in merge mode
|
||||
// and field.binaryValue() already returns the compressed value for a field
|
||||
// with isCompressed()==true, so we disable compression in that case
|
||||
boolean disableCompression = (field instanceof FieldsReader.FieldForMerge);
|
||||
if (field.isStored()) {
|
||||
fieldsStream.writeVInt(fieldInfos.fieldNumber(field.name()));
|
||||
|
||||
byte bits = 0;
|
||||
if (field.isTokenized())
|
||||
bits |= FieldsWriter.FIELD_IS_TOKENIZED;
|
||||
if (field.isBinary())
|
||||
bits |= FieldsWriter.FIELD_IS_BINARY;
|
||||
if (field.isCompressed())
|
||||
bits |= FieldsWriter.FIELD_IS_COMPRESSED;
|
||||
|
||||
fieldsStream.writeByte(bits);
|
||||
|
||||
if (field.isCompressed()) {
|
||||
// compression is enabled for the current field
|
||||
byte[] data = null;
|
||||
|
||||
if (disableCompression) {
|
||||
// optimized case for merging, the data
|
||||
// is already compressed
|
||||
data = field.binaryValue();
|
||||
} else {
|
||||
// check if it is a binary field
|
||||
if (field.isBinary()) {
|
||||
data = compress(field.binaryValue());
|
||||
}
|
||||
else {
|
||||
data = compress(field.stringValue().getBytes("UTF-8"));
|
||||
}
|
||||
}
|
||||
final int len = data.length;
|
||||
fieldsStream.writeVInt(len);
|
||||
fieldsStream.writeBytes(data, len);
|
||||
}
|
||||
else {
|
||||
// compression is disabled for the current field
|
||||
if (field.isBinary()) {
|
||||
byte[] data = field.binaryValue();
|
||||
final int len = data.length;
|
||||
fieldsStream.writeVInt(len);
|
||||
fieldsStream.writeBytes(data, len);
|
||||
}
|
||||
else {
|
||||
fieldsStream.writeString(field.stringValue());
|
||||
}
|
||||
}
|
||||
}
|
||||
if (field.isStored())
|
||||
writeField(fieldInfos.fieldInfo(field.name()), field);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -97,6 +97,7 @@ final class IndexFileDeleter {
|
|||
private PrintStream infoStream;
|
||||
private Directory directory;
|
||||
private IndexDeletionPolicy policy;
|
||||
private DocumentsWriter docWriter;
|
||||
|
||||
void setInfoStream(PrintStream infoStream) {
|
||||
this.infoStream = infoStream;
|
||||
|
@ -116,10 +117,12 @@ final class IndexFileDeleter {
|
|||
* @throws CorruptIndexException if the index is corrupt
|
||||
* @throws IOException if there is a low-level IO error
|
||||
*/
|
||||
public IndexFileDeleter(Directory directory, IndexDeletionPolicy policy, SegmentInfos segmentInfos, PrintStream infoStream)
|
||||
public IndexFileDeleter(Directory directory, IndexDeletionPolicy policy, SegmentInfos segmentInfos, PrintStream infoStream, DocumentsWriter docWriter)
|
||||
throws CorruptIndexException, IOException {
|
||||
|
||||
this.docWriter = docWriter;
|
||||
this.infoStream = infoStream;
|
||||
|
||||
this.policy = policy;
|
||||
this.directory = directory;
|
||||
|
||||
|
@ -294,7 +297,7 @@ final class IndexFileDeleter {
|
|||
public void checkpoint(SegmentInfos segmentInfos, boolean isCommit) throws IOException {
|
||||
|
||||
if (infoStream != null) {
|
||||
message("now checkpoint \"" + segmentInfos.getCurrentSegmentFileName() + "\" [isCommit = " + isCommit + "]");
|
||||
message("now checkpoint \"" + segmentInfos.getCurrentSegmentFileName() + "\" [" + segmentInfos.size() + " segments " + "; isCommit = " + isCommit + "]");
|
||||
}
|
||||
|
||||
// Try again now to delete any previously un-deletable
|
||||
|
@ -310,6 +313,8 @@ final class IndexFileDeleter {
|
|||
|
||||
// Incref the files:
|
||||
incRef(segmentInfos, isCommit);
|
||||
if (docWriter != null)
|
||||
incRef(docWriter.files());
|
||||
|
||||
if (isCommit) {
|
||||
// Append to our commits list:
|
||||
|
@ -325,9 +330,8 @@ final class IndexFileDeleter {
|
|||
// DecRef old files from the last checkpoint, if any:
|
||||
int size = lastFiles.size();
|
||||
if (size > 0) {
|
||||
for(int i=0;i<size;i++) {
|
||||
for(int i=0;i<size;i++)
|
||||
decRef((List) lastFiles.get(i));
|
||||
}
|
||||
lastFiles.clear();
|
||||
}
|
||||
|
||||
|
@ -340,6 +344,8 @@ final class IndexFileDeleter {
|
|||
lastFiles.add(segmentInfo.files());
|
||||
}
|
||||
}
|
||||
if (docWriter != null)
|
||||
lastFiles.add(docWriter.files());
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -38,18 +38,54 @@ final class IndexFileNames {
|
|||
/** Extension of norms file */
|
||||
static final String NORMS_EXTENSION = "nrm";
|
||||
|
||||
/** Extension of freq postings file */
|
||||
static final String FREQ_EXTENSION = "frq";
|
||||
|
||||
/** Extension of prox postings file */
|
||||
static final String PROX_EXTENSION = "prx";
|
||||
|
||||
/** Extension of terms file */
|
||||
static final String TERMS_EXTENSION = "tis";
|
||||
|
||||
/** Extension of terms index file */
|
||||
static final String TERMS_INDEX_EXTENSION = "tii";
|
||||
|
||||
/** Extension of stored fields index file */
|
||||
static final String FIELDS_INDEX_EXTENSION = "fdx";
|
||||
|
||||
/** Extension of stored fields file */
|
||||
static final String FIELDS_EXTENSION = "fdt";
|
||||
|
||||
/** Extension of vectors fields file */
|
||||
static final String VECTORS_FIELDS_EXTENSION = "tvf";
|
||||
|
||||
/** Extension of vectors documents file */
|
||||
static final String VECTORS_DOCUMENTS_EXTENSION = "tvd";
|
||||
|
||||
/** Extension of vectors index file */
|
||||
static final String VECTORS_INDEX_EXTENSION = "tvx";
|
||||
|
||||
/** Extension of compound file */
|
||||
static final String COMPOUND_FILE_EXTENSION = "cfs";
|
||||
|
||||
/** Extension of compound file for doc store files*/
|
||||
static final String COMPOUND_FILE_STORE_EXTENSION = "cfx";
|
||||
|
||||
/** Extension of deletes */
|
||||
static final String DELETES_EXTENSION = "del";
|
||||
|
||||
/** Extension of field infos */
|
||||
static final String FIELD_INFOS_EXTENSION = "fnm";
|
||||
|
||||
/** Extension of plain norms */
|
||||
static final String PLAIN_NORMS_EXTENSION = "f";
|
||||
|
||||
/** Extension of separate norms */
|
||||
static final String SEPARATE_NORMS_EXTENSION = "s";
|
||||
|
||||
/** Extension of gen file */
|
||||
static final String GEN_EXTENSION = "gen";
|
||||
|
||||
/**
|
||||
* This array contains all filename extensions used by
|
||||
* Lucene's index files, with two exceptions, namely the
|
||||
|
@ -59,25 +95,72 @@ final class IndexFileNames {
|
|||
* filename extension.
|
||||
*/
|
||||
static final String INDEX_EXTENSIONS[] = new String[] {
|
||||
"cfs", "fnm", "fdx", "fdt", "tii", "tis", "frq", "prx", "del",
|
||||
"tvx", "tvd", "tvf", "gen", "nrm"
|
||||
COMPOUND_FILE_EXTENSION,
|
||||
FIELD_INFOS_EXTENSION,
|
||||
FIELDS_INDEX_EXTENSION,
|
||||
FIELDS_EXTENSION,
|
||||
TERMS_INDEX_EXTENSION,
|
||||
TERMS_EXTENSION,
|
||||
FREQ_EXTENSION,
|
||||
PROX_EXTENSION,
|
||||
DELETES_EXTENSION,
|
||||
VECTORS_INDEX_EXTENSION,
|
||||
VECTORS_DOCUMENTS_EXTENSION,
|
||||
VECTORS_FIELDS_EXTENSION,
|
||||
GEN_EXTENSION,
|
||||
NORMS_EXTENSION,
|
||||
COMPOUND_FILE_STORE_EXTENSION,
|
||||
};
|
||||
|
||||
/** File extensions that are added to a compound file
|
||||
* (same as above, minus "del", "gen", "cfs"). */
|
||||
static final String[] INDEX_EXTENSIONS_IN_COMPOUND_FILE = new String[] {
|
||||
"fnm", "fdx", "fdt", "tii", "tis", "frq", "prx",
|
||||
"tvx", "tvd", "tvf", "nrm"
|
||||
FIELD_INFOS_EXTENSION,
|
||||
FIELDS_INDEX_EXTENSION,
|
||||
FIELDS_EXTENSION,
|
||||
TERMS_INDEX_EXTENSION,
|
||||
TERMS_EXTENSION,
|
||||
FREQ_EXTENSION,
|
||||
PROX_EXTENSION,
|
||||
VECTORS_INDEX_EXTENSION,
|
||||
VECTORS_DOCUMENTS_EXTENSION,
|
||||
VECTORS_FIELDS_EXTENSION,
|
||||
NORMS_EXTENSION
|
||||
};
|
||||
|
||||
static final String[] STORE_INDEX_EXTENSIONS = new String[] {
|
||||
VECTORS_INDEX_EXTENSION,
|
||||
VECTORS_FIELDS_EXTENSION,
|
||||
VECTORS_DOCUMENTS_EXTENSION,
|
||||
FIELDS_INDEX_EXTENSION,
|
||||
FIELDS_EXTENSION
|
||||
};
|
||||
|
||||
static final String[] NON_STORE_INDEX_EXTENSIONS = new String[] {
|
||||
FIELD_INFOS_EXTENSION,
|
||||
FREQ_EXTENSION,
|
||||
PROX_EXTENSION,
|
||||
TERMS_EXTENSION,
|
||||
TERMS_INDEX_EXTENSION,
|
||||
NORMS_EXTENSION
|
||||
};
|
||||
|
||||
/** File extensions of old-style index files */
|
||||
static final String COMPOUND_EXTENSIONS[] = new String[] {
|
||||
"fnm", "frq", "prx", "fdx", "fdt", "tii", "tis"
|
||||
FIELD_INFOS_EXTENSION,
|
||||
FREQ_EXTENSION,
|
||||
PROX_EXTENSION,
|
||||
FIELDS_INDEX_EXTENSION,
|
||||
FIELDS_EXTENSION,
|
||||
TERMS_INDEX_EXTENSION,
|
||||
TERMS_EXTENSION
|
||||
};
|
||||
|
||||
/** File extensions for term vector support */
|
||||
static final String VECTOR_EXTENSIONS[] = new String[] {
|
||||
"tvx", "tvd", "tvf"
|
||||
VECTORS_INDEX_EXTENSION,
|
||||
VECTORS_DOCUMENTS_EXTENSION,
|
||||
VECTORS_FIELDS_EXTENSION
|
||||
};
|
||||
|
||||
/**
|
||||
|
|
|
@ -203,7 +203,8 @@ public class IndexModifier {
|
|||
indexWriter = new IndexWriter(directory, analyzer, false);
|
||||
indexWriter.setInfoStream(infoStream);
|
||||
indexWriter.setUseCompoundFile(useCompoundFile);
|
||||
indexWriter.setMaxBufferedDocs(maxBufferedDocs);
|
||||
if (maxBufferedDocs != 0)
|
||||
indexWriter.setMaxBufferedDocs(maxBufferedDocs);
|
||||
indexWriter.setMaxFieldLength(maxFieldLength);
|
||||
indexWriter.setMergeFactor(mergeFactor);
|
||||
}
|
||||
|
|
|
@ -783,7 +783,7 @@ public abstract class IndexReader {
|
|||
// KeepOnlyLastCommitDeleter:
|
||||
IndexFileDeleter deleter = new IndexFileDeleter(directory,
|
||||
deletionPolicy == null ? new KeepOnlyLastCommitDeletionPolicy() : deletionPolicy,
|
||||
segmentInfos, null);
|
||||
segmentInfos, null, null);
|
||||
|
||||
// Checkpoint the state we are about to change, in
|
||||
// case we have to roll back:
|
||||
|
|
|
@@ -61,14 +61,19 @@ import java.util.Map.Entry;
  When finished adding, deleting and updating documents, <a href="#close()"><b>close</b></a> should be called.</p>

  <p>These changes are buffered in memory and periodically
  flushed to the {@link Directory} (during the above method calls). A flush is triggered when there are
  enough buffered deletes (see {@link
  #setMaxBufferedDeleteTerms}) or enough added documents
  (see {@link #setMaxBufferedDocs}) since the last flush,
  whichever is sooner. You can also force a flush by
  calling {@link #flush}. When a flush occurs, both pending
  deletes and added documents are flushed to the index. A
  flush may also trigger one or more segment merges.</p>
  flushed to the {@link Directory} (during the above method
  calls). A flush is triggered when there are enough
  buffered deletes (see {@link #setMaxBufferedDeleteTerms})
  or enough added documents since the last flush, whichever
  is sooner. For the added documents, flushing is triggered
  either by RAM usage of the documents (see {@link
  #setRAMBufferSizeMB}) or the number of added documents
  (this is the default; see {@link #setMaxBufferedDocs}).
  For best indexing speed you should flush by RAM usage with
  a large RAM buffer. You can also force a flush by calling
  {@link #flush}. When a flush occurs, both pending deletes
  and added documents are flushed to the index. A flush may
  also trigger one or more segment merges.</p>

  <a name="autoCommit"></a>
  <p>The optional <code>autoCommit</code> argument to the
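Putting the knobs from the revised javadoc together, a writer that flushes by RAM usage might be configured roughly like this (a hedged configuration sketch; the directory, analyzer, document, and all values are illustrative only):

    IndexWriter writer = new IndexWriter(dir, analyzer, true);
    writer.setRAMBufferSizeMB(48.0);          // flush added documents by RAM usage
    writer.setMaxBufferedDeleteTerms(1000);   // flush buffered delete terms by count
    writer.addDocument(doc);
    writer.deleteDocuments(new Term("id", "42"));
    writer.flush();                           // force pending adds and deletes to the index
    writer.close();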
@ -181,7 +186,20 @@ public class IndexWriter {
|
|||
/**
|
||||
* Default value is 10. Change using {@link #setMaxBufferedDocs(int)}.
|
||||
*/
|
||||
|
||||
public final static int DEFAULT_MAX_BUFFERED_DOCS = 10;
|
||||
/* new merge policy
|
||||
public final static int DEFAULT_MAX_BUFFERED_DOCS = 0;
|
||||
*/
|
||||
|
||||
/**
|
||||
* Default value is 0 MB (which means flush only by doc
|
||||
* count). Change using {@link #setRAMBufferSizeMB}.
|
||||
*/
|
||||
public final static double DEFAULT_RAM_BUFFER_SIZE_MB = 0.0;
|
||||
/* new merge policy
|
||||
public final static double DEFAULT_RAM_BUFFER_SIZE_MB = 16.0;
|
||||
*/
|
||||
|
||||
/**
|
||||
* Default value is 1000. Change using {@link #setMaxBufferedDeleteTerms(int)}.
|
||||
|
@ -224,8 +242,7 @@ public class IndexWriter {
|
|||
private boolean autoCommit = true; // false if we should commit only on close
|
||||
|
||||
SegmentInfos segmentInfos = new SegmentInfos(); // the segments
|
||||
SegmentInfos ramSegmentInfos = new SegmentInfos(); // the segments in ramDirectory
|
||||
private final RAMDirectory ramDirectory = new RAMDirectory(); // for temp segs
|
||||
private DocumentsWriter docWriter;
|
||||
private IndexFileDeleter deleter;
|
||||
|
||||
private Lock writeLock;
|
||||
|
@ -621,11 +638,14 @@ public class IndexWriter {
|
|||
rollbackSegmentInfos = (SegmentInfos) segmentInfos.clone();
|
||||
}
|
||||
|
||||
docWriter = new DocumentsWriter(directory, this);
|
||||
docWriter.setInfoStream(infoStream);
|
||||
|
||||
// Default deleter (for backwards compatibility) is
|
||||
// KeepOnlyLastCommitDeleter:
|
||||
deleter = new IndexFileDeleter(directory,
|
||||
deletionPolicy == null ? new KeepOnlyLastCommitDeletionPolicy() : deletionPolicy,
|
||||
segmentInfos, infoStream);
|
||||
segmentInfos, infoStream, docWriter);
|
||||
|
||||
} catch (IOException e) {
|
||||
this.writeLock.release();
|
||||
|
@ -683,31 +703,64 @@ public class IndexWriter {
|
|||
return maxFieldLength;
|
||||
}
|
||||
|
||||
/** Determines the minimal number of documents required before the buffered
|
||||
* in-memory documents are merged and a new Segment is created.
|
||||
* Since Documents are merged in a {@link org.apache.lucene.store.RAMDirectory},
|
||||
* large value gives faster indexing. At the same time, mergeFactor limits
|
||||
* the number of files open in a FSDirectory.
|
||||
/** Determines the minimal number of documents required
|
||||
* before the buffered in-memory documents are flushed as
|
||||
* a new Segment. Large values generally gives faster
|
||||
* indexing.
|
||||
*
|
||||
* <p> The default value is 10.
|
||||
* <p>When this is set, the writer will flush every
|
||||
* maxBufferedDocs added documents and never flush by RAM
|
||||
* usage.</p>
|
||||
*
|
||||
* @throws IllegalArgumentException if maxBufferedDocs is smaller than 2
|
||||
* <p> The default value is 0 (writer flushes by RAM
|
||||
* usage).</p>
|
||||
*
|
||||
* @throws IllegalArgumentException if maxBufferedDocs is
|
||||
* smaller than 2
|
||||
* @see #setRAMBufferSizeMB
|
||||
*/
|
||||
public void setMaxBufferedDocs(int maxBufferedDocs) {
|
||||
ensureOpen();
|
||||
if (maxBufferedDocs < 2)
|
||||
throw new IllegalArgumentException("maxBufferedDocs must at least be 2");
|
||||
this.minMergeDocs = maxBufferedDocs;
|
||||
docWriter.setMaxBufferedDocs(maxBufferedDocs);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the number of buffered added documents that will
|
||||
* Returns 0 if this writer is flushing by RAM usage, else
|
||||
* returns the number of buffered added documents that will
|
||||
* trigger a flush.
|
||||
* @see #setMaxBufferedDocs
|
||||
*/
|
||||
public int getMaxBufferedDocs() {
|
||||
ensureOpen();
|
||||
return minMergeDocs;
|
||||
return docWriter.getMaxBufferedDocs();
|
||||
}
|
||||
|
||||
/** Determines the amount of RAM that may be used for
|
||||
* buffering added documents before they are flushed as a
|
||||
* new Segment. Generally for faster indexing performance
|
||||
* it's best to flush by RAM usage instead of document
|
||||
* count and use as large a RAM buffer as you can.
|
||||
*
|
||||
* <p>When this is set, the writer will flush whenever
|
||||
* buffered documents use this much RAM.</p>
|
||||
*
|
||||
* <p> The default value is {@link #DEFAULT_RAM_BUFFER_SIZE_MB}.</p>
|
||||
*/
|
||||
public void setRAMBufferSizeMB(double mb) {
|
||||
if (mb <= 0.0)
|
||||
throw new IllegalArgumentException("ramBufferSize should be > 0.0 MB");
|
||||
docWriter.setRAMBufferSizeMB(mb);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns 0.0 if this writer is flushing by document
|
||||
* count, else returns the value set by {@link
|
||||
* #setRAMBufferSizeMB}.
|
||||
*/
|
||||
public double getRAMBufferSizeMB() {
|
||||
return docWriter.getRAMBufferSizeMB();
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -788,6 +841,7 @@ public class IndexWriter {
|
|||
public void setInfoStream(PrintStream infoStream) {
|
||||
ensureOpen();
|
||||
this.infoStream = infoStream;
|
||||
docWriter.setInfoStream(infoStream);
|
||||
deleter.setInfoStream(infoStream);
|
||||
}
|
||||
|
||||
|
@ -871,7 +925,7 @@ public class IndexWriter {
|
|||
*/
|
||||
public synchronized void close() throws CorruptIndexException, IOException {
|
||||
if (!closed) {
|
||||
flushRamSegments();
|
||||
flush(true, true);
|
||||
|
||||
if (commitPending) {
|
||||
segmentInfos.write(directory); // now commit changes
|
||||
|
@ -880,18 +934,79 @@ public class IndexWriter {
|
|||
rollbackSegmentInfos = null;
|
||||
}
|
||||
|
||||
ramDirectory.close();
|
||||
if (writeLock != null) {
|
||||
writeLock.release(); // release write lock
|
||||
writeLock = null;
|
||||
}
|
||||
closed = true;
|
||||
docWriter = null;
|
||||
|
||||
if(closeDir)
|
||||
directory.close();
|
||||
}
|
||||
}
|
||||
|
||||
/** Tells the docWriter to close its currently open shared
|
||||
* doc stores (stored fields & vectors files). */
|
||||
private void flushDocStores() throws IOException {
|
||||
|
||||
List files = docWriter.files();
|
||||
|
||||
if (files.size() > 0) {
|
||||
String docStoreSegment;
|
||||
|
||||
boolean success = false;
|
||||
try {
|
||||
docStoreSegment = docWriter.closeDocStore();
|
||||
success = true;
|
||||
} finally {
|
||||
if (!success)
|
||||
docWriter.abort();
|
||||
}
|
||||
|
||||
if (useCompoundFile && docStoreSegment != null) {
|
||||
// Now build compound doc store file
|
||||
checkpoint();
|
||||
|
||||
success = false;
|
||||
|
||||
final int numSegments = segmentInfos.size();
|
||||
|
||||
try {
|
||||
CompoundFileWriter cfsWriter = new CompoundFileWriter(directory, docStoreSegment + "." + IndexFileNames.COMPOUND_FILE_STORE_EXTENSION);
|
||||
final int size = files.size();
|
||||
for(int i=0;i<size;i++)
|
||||
cfsWriter.addFile((String) files.get(i));
|
||||
|
||||
// Perform the merge
|
||||
cfsWriter.close();
|
||||
|
||||
for(int i=0;i<numSegments;i++) {
|
||||
SegmentInfo si = segmentInfos.info(i);
|
||||
if (si.getDocStoreOffset() != -1 &&
|
||||
si.getDocStoreSegment().equals(docStoreSegment))
|
||||
si.setDocStoreIsCompoundFile(true);
|
||||
}
|
||||
checkpoint();
|
||||
success = true;
|
||||
} finally {
|
||||
if (!success) {
|
||||
// Rollback to no compound file
|
||||
for(int i=0;i<numSegments;i++) {
|
||||
SegmentInfo si = segmentInfos.info(i);
|
||||
if (si.getDocStoreOffset() != -1 &&
|
||||
si.getDocStoreSegment().equals(docStoreSegment))
|
||||
si.setDocStoreIsCompoundFile(false);
|
||||
}
|
||||
deleter.refresh();
|
||||
}
|
||||
}
|
||||
|
||||
deleter.checkpoint(segmentInfos, false);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/** Release the write lock, if needed. */
|
||||
protected void finalize() throws Throwable {
|
||||
try {
|
||||
|
@ -916,11 +1031,10 @@ public class IndexWriter {
|
|||
return analyzer;
|
||||
}
|
||||
|
||||
|
||||
/** Returns the number of documents currently in this index. */
|
||||
public synchronized int docCount() {
|
||||
ensureOpen();
|
||||
int count = ramSegmentInfos.size();
|
||||
int count = docWriter.getNumDocsInRAM();
|
||||
for (int i = 0; i < segmentInfos.size(); i++) {
|
||||
SegmentInfo si = segmentInfos.info(i);
|
||||
count += si.docCount;
|
||||
|
@ -998,22 +1112,8 @@ public class IndexWriter {
|
|||
*/
|
||||
public void addDocument(Document doc, Analyzer analyzer) throws CorruptIndexException, IOException {
|
||||
ensureOpen();
|
||||
SegmentInfo newSegmentInfo = buildSingleDocSegment(doc, analyzer);
|
||||
synchronized (this) {
|
||||
ramSegmentInfos.addElement(newSegmentInfo);
|
||||
maybeFlushRamSegments();
|
||||
}
|
||||
}
|
||||
|
||||
SegmentInfo buildSingleDocSegment(Document doc, Analyzer analyzer)
|
||||
throws CorruptIndexException, IOException {
|
||||
DocumentWriter dw = new DocumentWriter(ramDirectory, analyzer, this);
|
||||
dw.setInfoStream(infoStream);
|
||||
String segmentName = newRamSegmentName();
|
||||
dw.addDocument(segmentName, doc);
|
||||
SegmentInfo si = new SegmentInfo(segmentName, 1, ramDirectory, false, false);
|
||||
si.setNumFields(dw.getNumFields());
|
||||
return si;
|
||||
if (docWriter.addDocument(doc, analyzer))
|
||||
flush(true, false);
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -1025,7 +1125,7 @@ public class IndexWriter {
|
|||
public synchronized void deleteDocuments(Term term) throws CorruptIndexException, IOException {
|
||||
ensureOpen();
|
||||
bufferDeleteTerm(term);
|
||||
maybeFlushRamSegments();
|
||||
maybeFlush();
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -1041,7 +1141,7 @@ public class IndexWriter {
|
|||
for (int i = 0; i < terms.length; i++) {
|
||||
bufferDeleteTerm(terms[i]);
|
||||
}
|
||||
maybeFlushRamSegments();
|
||||
maybeFlush();
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -1077,16 +1177,13 @@ public class IndexWriter {
|
|||
public void updateDocument(Term term, Document doc, Analyzer analyzer)
|
||||
throws CorruptIndexException, IOException {
|
||||
ensureOpen();
|
||||
SegmentInfo newSegmentInfo = buildSingleDocSegment(doc, analyzer);
|
||||
synchronized (this) {
|
||||
bufferDeleteTerm(term);
|
||||
ramSegmentInfos.addElement(newSegmentInfo);
|
||||
maybeFlushRamSegments();
|
||||
}
|
||||
}
|
||||
|
||||
final synchronized String newRamSegmentName() {
|
||||
return "_ram_" + Integer.toString(ramSegmentInfos.counter++, Character.MAX_RADIX);
|
||||
if (docWriter.addDocument(doc, analyzer))
|
||||
flush(true, false);
|
||||
else
|
||||
maybeFlush();
|
||||
}
|
||||
|
||||
// for test purpose
|
||||
|
@ -1095,8 +1192,8 @@ public class IndexWriter {
|
|||
}
|
||||
|
||||
// for test purpose
|
||||
final synchronized int getRamSegmentCount(){
|
||||
return ramSegmentInfos.size();
|
||||
final synchronized int getNumBufferedDocuments(){
|
||||
return docWriter.getNumDocsInRAM();
|
||||
}
|
||||
|
||||
// for test purpose
|
||||
|
@ -1108,7 +1205,7 @@ public class IndexWriter {
|
|||
}
|
||||
}
|
||||
|
||||
final synchronized String newSegmentName() {
|
||||
final String newSegmentName() {
|
||||
return "_" + Integer.toString(segmentInfos.counter++, Character.MAX_RADIX);
|
||||
}
|
||||
|
||||
|
@ -1125,17 +1222,10 @@ public class IndexWriter {
|
|||
*/
|
||||
private int mergeFactor = DEFAULT_MERGE_FACTOR;
|
||||
|
||||
/** Determines the minimal number of documents required before the buffered
|
||||
* in-memory documents are merging and a new Segment is created.
|
||||
* Since Documents are merged in a {@link org.apache.lucene.store.RAMDirectory},
|
||||
* large value gives faster indexing. At the same time, mergeFactor limits
|
||||
* the number of files open in a FSDirectory.
|
||||
*
|
||||
* <p> The default value is {@link #DEFAULT_MAX_BUFFERED_DOCS}.
|
||||
|
||||
/** Determines amount of RAM usage by the buffered docs at
|
||||
* which point we trigger a flush to the index.
|
||||
*/
|
||||
private int minMergeDocs = DEFAULT_MAX_BUFFERED_DOCS;
|
||||
|
||||
private double ramBufferSize = DEFAULT_RAM_BUFFER_SIZE_MB*1024F*1024F;
|
||||
|
||||
/** Determines the largest number of documents ever merged by addDocument().
|
||||
* Small values (e.g., less than 10,000) are best for interactive indexing,
|
||||
|
@ -1151,6 +1241,7 @@ public class IndexWriter {
|
|||
|
||||
*/
|
||||
private PrintStream infoStream = null;
|
||||
|
||||
private static PrintStream defaultInfoStream = null;
|
||||
|
||||
/** Merges all segments together into a single segment,
|
||||
|
@ -1219,16 +1310,16 @@ public class IndexWriter {
|
|||
*/
|
||||
public synchronized void optimize() throws CorruptIndexException, IOException {
|
||||
ensureOpen();
|
||||
flushRamSegments();
|
||||
flush();
|
||||
while (segmentInfos.size() > 1 ||
|
||||
(segmentInfos.size() == 1 &&
|
||||
(SegmentReader.hasDeletions(segmentInfos.info(0)) ||
|
||||
SegmentReader.hasSeparateNorms(segmentInfos.info(0)) ||
|
||||
segmentInfos.info(0).dir != directory ||
|
||||
(useCompoundFile &&
|
||||
(!SegmentReader.usesCompoundFile(segmentInfos.info(0))))))) {
|
||||
!segmentInfos.info(0).getUseCompoundFile())))) {
|
||||
int minSegment = segmentInfos.size() - mergeFactor;
|
||||
mergeSegments(segmentInfos, minSegment < 0 ? 0 : minSegment, segmentInfos.size());
|
||||
mergeSegments(minSegment < 0 ? 0 : minSegment, segmentInfos.size());
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -1245,7 +1336,7 @@ public class IndexWriter {
|
|||
localRollbackSegmentInfos = (SegmentInfos) segmentInfos.clone();
|
||||
localAutoCommit = autoCommit;
|
||||
if (localAutoCommit) {
|
||||
flushRamSegments();
|
||||
flush();
|
||||
// Turn off auto-commit during our local transaction:
|
||||
autoCommit = false;
|
||||
} else
|
||||
|
@ -1335,16 +1426,18 @@ public class IndexWriter {
|
|||
segmentInfos.clear();
|
||||
segmentInfos.addAll(rollbackSegmentInfos);
|
||||
|
||||
docWriter.abort();
|
||||
|
||||
// Ask deleter to locate unreferenced files & remove
|
||||
// them:
|
||||
deleter.checkpoint(segmentInfos, false);
|
||||
deleter.refresh();
|
||||
|
||||
ramSegmentInfos = new SegmentInfos();
|
||||
bufferedDeleteTerms.clear();
|
||||
numBufferedDeleteTerms = 0;
|
||||
|
||||
commitPending = false;
|
||||
docWriter.abort();
|
||||
close();
|
||||
|
||||
} else {
|
||||
|
@ -1439,7 +1532,7 @@ public class IndexWriter {
|
|||
for (int base = start; base < segmentInfos.size(); base++) {
|
||||
int end = Math.min(segmentInfos.size(), base+mergeFactor);
|
||||
if (end-base > 1) {
|
||||
mergeSegments(segmentInfos, base, end);
|
||||
mergeSegments(base, end);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -1479,7 +1572,7 @@ public class IndexWriter {
|
|||
// segments in S may not since they could come from multiple indexes.
|
||||
// Here is the merge algorithm for addIndexesNoOptimize():
|
||||
//
|
||||
// 1 Flush ram segments.
|
||||
// 1 Flush ram.
|
||||
// 2 Consider a combined sequence with segments from T followed
|
||||
// by segments from S (same as current addIndexes(Directory[])).
|
||||
// 3 Assume the highest level for segments in S is h. Call
|
||||
|
@ -1500,13 +1593,18 @@ public class IndexWriter {
|
|||
// copy a segment, which may cause doc count to change because deleted
|
||||
// docs are garbage collected.
|
||||
|
||||
// 1 flush ram segments
|
||||
// 1 flush ram
|
||||
|
||||
ensureOpen();
|
||||
flushRamSegments();
|
||||
flush();
|
||||
|
||||
// 2 copy segment infos and find the highest level from dirs
|
||||
int startUpperBound = minMergeDocs;
|
||||
int startUpperBound = docWriter.getMaxBufferedDocs();
|
||||
|
||||
/* new merge policy
|
||||
if (startUpperBound == 0)
|
||||
startUpperBound = 10;
|
||||
*/
|
||||
|
||||
boolean success = false;
|
||||
|
||||
|
@ -1566,7 +1664,7 @@ public class IndexWriter {
|
|||
|
||||
// copy those segments from S
|
||||
for (int i = segmentCount - numSegmentsToCopy; i < segmentCount; i++) {
|
||||
mergeSegments(segmentInfos, i, i + 1);
|
||||
mergeSegments(i, i + 1);
|
||||
}
|
||||
if (checkNonDecreasingLevels(segmentCount - numSegmentsToCopy)) {
|
||||
success = true;
|
||||
|
@ -1575,7 +1673,7 @@ public class IndexWriter {
|
|||
}
|
||||
|
||||
// invariants do not hold, simply merge those segments
|
||||
mergeSegments(segmentInfos, segmentCount - numTailSegments, segmentCount);
|
||||
mergeSegments(segmentCount - numTailSegments, segmentCount);
|
||||
|
||||
// maybe merge segments again if necessary
|
||||
if (segmentInfos.info(segmentInfos.size() - 1).docCount > startUpperBound) {
|
||||
|
@ -1637,7 +1735,8 @@ public class IndexWriter {
|
|||
}
|
||||
|
||||
segmentInfos.setSize(0); // pop old infos & add new
|
||||
info = new SegmentInfo(mergedName, docCount, directory, false, true);
|
||||
info = new SegmentInfo(mergedName, docCount, directory, false, true,
|
||||
-1, null, false);
|
||||
segmentInfos.addElement(info);
|
||||
|
||||
success = true;
|
||||
|
@ -1720,27 +1819,17 @@ public class IndexWriter {
|
|||
* buffered added documents or buffered deleted terms are
|
||||
* large enough.
|
||||
*/
|
||||
protected final void maybeFlushRamSegments() throws CorruptIndexException, IOException {
|
||||
// A flush is triggered if enough new documents are buffered or
|
||||
// if enough delete terms are buffered
|
||||
if (ramSegmentInfos.size() >= minMergeDocs || numBufferedDeleteTerms >= maxBufferedDeleteTerms) {
|
||||
flushRamSegments();
|
||||
}
|
||||
protected final synchronized void maybeFlush() throws CorruptIndexException, IOException {
|
||||
// We only check for flush due to number of buffered
|
||||
// delete terms, because triggering of a flush due to
|
||||
// too many added documents is handled by
|
||||
// DocumentsWriter
|
||||
if (numBufferedDeleteTerms >= maxBufferedDeleteTerms && docWriter.setFlushPending())
|
||||
flush(true, false);
|
||||
}
|
||||
|
||||
/** Expert: Flushes all RAM-resident segments (buffered documents), then may merge segments. */
|
||||
private final synchronized void flushRamSegments() throws CorruptIndexException, IOException {
|
||||
flushRamSegments(true);
|
||||
}
|
||||
|
||||
/** Expert: Flushes all RAM-resident segments (buffered documents),
|
||||
* then may merge segments if triggerMerge==true. */
|
||||
protected final synchronized void flushRamSegments(boolean triggerMerge)
|
||||
throws CorruptIndexException, IOException {
|
||||
if (ramSegmentInfos.size() > 0 || bufferedDeleteTerms.size() > 0) {
|
||||
mergeSegments(ramSegmentInfos, 0, ramSegmentInfos.size());
|
||||
if (triggerMerge) maybeMergeSegments(minMergeDocs);
|
||||
}
|
||||
public final synchronized void flush() throws CorruptIndexException, IOException {
|
||||
flush(true, false);
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -1751,9 +1840,158 @@ public class IndexWriter {
|
|||
* @throws CorruptIndexException if the index is corrupt
|
||||
* @throws IOException if there is a low-level IO error
|
||||
*/
|
||||
public final synchronized void flush() throws CorruptIndexException, IOException {
|
||||
public final synchronized void flush(boolean triggerMerge, boolean flushDocStores) throws CorruptIndexException, IOException {
|
||||
ensureOpen();
|
||||
flushRamSegments();
|
||||
|
||||
// Make sure no threads are actively adding a document
|
||||
docWriter.pauseAllThreads();
|
||||
|
||||
try {
|
||||
|
||||
SegmentInfo newSegment = null;
|
||||
|
||||
final int numDocs = docWriter.getNumDocsInRAM();
|
||||
|
||||
// Always flush docs if there are any
|
||||
boolean flushDocs = numDocs > 0;
|
||||
|
||||
// With autoCommit=true we always must flush the doc
|
||||
// stores when we flush
|
||||
flushDocStores |= autoCommit;
|
||||
String docStoreSegment = docWriter.getDocStoreSegment();
|
||||
if (docStoreSegment == null)
|
||||
flushDocStores = false;
|
||||
|
||||
// Always flush deletes if there are any delete terms.
|
||||
// TODO: when autoCommit=false we don't have to flush
|
||||
// deletes with every flushed segment; we can save
|
||||
// CPU/IO by buffering longer & flushing deletes only
|
||||
// when they are full or writer is being closed. We
|
||||
// have to fix the "applyDeletesSelectively" logic to
|
||||
// apply to more than just the last flushed segment
|
||||
boolean flushDeletes = bufferedDeleteTerms.size() > 0;
|
||||
|
||||
if (infoStream != null)
|
||||
infoStream.println(" flush: flushDocs=" + flushDocs +
|
||||
" flushDeletes=" + flushDeletes +
|
||||
" flushDocStores=" + flushDocStores +
|
||||
" numDocs=" + numDocs);
|
||||
|
||||
int docStoreOffset = docWriter.getDocStoreOffset();
|
||||
boolean docStoreIsCompoundFile = false;
|
||||
|
||||
// Check if the doc stores must be separately flushed
|
||||
// because other segments, besides the one we are about
|
||||
// to flush, reference it
|
||||
if (flushDocStores && (!flushDocs || !docWriter.getSegment().equals(docWriter.getDocStoreSegment()))) {
|
||||
// We must separately flush the doc store
|
||||
if (infoStream != null)
|
||||
infoStream.println(" flush shared docStore segment " + docStoreSegment);
|
||||
|
||||
flushDocStores();
|
||||
flushDocStores = false;
|
||||
docStoreIsCompoundFile = useCompoundFile;
|
||||
}
|
||||
|
||||
String segment = docWriter.getSegment();
|
||||
|
||||
if (flushDocs || flushDeletes) {
|
||||
|
||||
SegmentInfos rollback = null;
|
||||
|
||||
if (flushDeletes)
|
||||
rollback = (SegmentInfos) segmentInfos.clone();
|
||||
|
||||
boolean success = false;
|
||||
|
||||
try {
|
||||
if (flushDocs) {
|
||||
|
||||
if (0 == docStoreOffset && flushDocStores) {
|
||||
// This means we are flushing private doc stores
|
||||
// with this segment, so it will not be shared
|
||||
// with other segments
|
||||
assert docStoreSegment != null;
|
||||
assert docStoreSegment.equals(segment);
|
||||
docStoreOffset = -1;
|
||||
docStoreIsCompoundFile = false;
|
||||
docStoreSegment = null;
|
||||
}
|
||||
|
||||
int flushedDocCount = docWriter.flush(flushDocStores);
|
||||
|
||||
newSegment = new SegmentInfo(segment,
|
||||
flushedDocCount,
|
||||
directory, false, true,
|
||||
docStoreOffset, docStoreSegment,
|
||||
docStoreIsCompoundFile);
|
||||
segmentInfos.addElement(newSegment);
|
||||
}
|
||||
|
||||
if (flushDeletes) {
|
||||
// we should be able to change this so we can
|
||||
// buffer deletes longer and then flush them to
|
||||
// multiple flushed segments, when
|
||||
// autoCommit=false
|
||||
applyDeletes(flushDocs);
|
||||
doAfterFlush();
|
||||
}
|
||||
|
||||
checkpoint();
|
||||
success = true;
|
||||
} finally {
|
||||
if (!success) {
|
||||
if (flushDeletes) {
|
||||
// Fully replace the segmentInfos since flushed
|
||||
// deletes could have changed any of the
|
||||
// SegmentInfo instances:
|
||||
segmentInfos.clear();
|
||||
segmentInfos.addAll(rollback);
|
||||
} else {
|
||||
// Remove segment we added, if any:
|
||||
if (newSegment != null &&
|
||||
segmentInfos.size() > 0 &&
|
||||
segmentInfos.info(segmentInfos.size()-1) == newSegment)
|
||||
segmentInfos.remove(segmentInfos.size()-1);
|
||||
}
|
||||
if (flushDocs)
|
||||
docWriter.abort();
|
||||
deleter.checkpoint(segmentInfos, false);
|
||||
deleter.refresh();
|
||||
}
|
||||
}
|
||||
|
||||
deleter.checkpoint(segmentInfos, autoCommit);
|
||||
|
||||
if (flushDocs && useCompoundFile) {
|
||||
success = false;
|
||||
try {
|
||||
docWriter.createCompoundFile(segment);
|
||||
newSegment.setUseCompoundFile(true);
|
||||
checkpoint();
|
||||
success = true;
|
||||
} finally {
|
||||
if (!success) {
|
||||
newSegment.setUseCompoundFile(false);
|
||||
deleter.refresh();
|
||||
}
|
||||
}
|
||||
|
||||
deleter.checkpoint(segmentInfos, autoCommit);
|
||||
}
|
||||
|
||||
/* new merge policy
|
||||
if (0 == docWriter.getMaxBufferedDocs())
|
||||
maybeMergeSegments(mergeFactor * numDocs / 2);
|
||||
else
|
||||
maybeMergeSegments(docWriter.getMaxBufferedDocs());
|
||||
*/
|
||||
maybeMergeSegments(docWriter.getMaxBufferedDocs());
|
||||
}
|
||||
} finally {
|
||||
docWriter.clearFlushPending();
|
||||
docWriter.resumeAllThreads();
|
||||
}
|
||||
}
|
||||
|
||||
/** Expert: Return the total size of all index files currently cached in memory.
|
||||
|
@ -1761,15 +1999,15 @@ public class IndexWriter {
|
|||
*/
|
||||
public final long ramSizeInBytes() {
|
||||
ensureOpen();
|
||||
return ramDirectory.sizeInBytes();
|
||||
return docWriter.getRAMUsed();
|
||||
}
|
||||
|
||||
/** Expert: Return the number of documents whose segments are currently cached in memory.
|
||||
* Useful when calling flushRamSegments()
|
||||
* Useful when calling flush()
|
||||
*/
|
||||
public final synchronized int numRamDocs() {
|
||||
ensureOpen();
|
||||
return ramSegmentInfos.size();
|
||||
return docWriter.getNumDocsInRAM();
|
||||
}
|
||||
|
||||
/** Incremental segment merger. */
|
||||
|
@ -1777,6 +2015,10 @@ public class IndexWriter {
|
|||
long lowerBound = -1;
|
||||
long upperBound = startUpperBound;
|
||||
|
||||
/* new merge policy
|
||||
if (upperBound == 0) upperBound = 10;
|
||||
*/
|
||||
|
||||
while (upperBound < maxMergeDocs) {
|
||||
int minSegment = segmentInfos.size();
|
||||
int maxSegment = -1;
|
||||
|
@ -1808,7 +2050,7 @@ public class IndexWriter {
|
|||
while (numSegments >= mergeFactor) {
|
||||
// merge the leftmost* mergeFactor segments
|
||||
|
||||
int docCount = mergeSegments(segmentInfos, minSegment, minSegment + mergeFactor);
|
||||
int docCount = mergeSegments(minSegment, minSegment + mergeFactor);
|
||||
numSegments -= mergeFactor;
|
||||
|
||||
if (docCount > upperBound) {
|
||||
|
@ -1837,39 +2079,108 @@ public class IndexWriter {
|
|||
* Merges the named range of segments, replacing them in the stack with a
|
||||
* single segment.
|
||||
*/
|
||||
private final int mergeSegments(SegmentInfos sourceSegments, int minSegment, int end)
|
||||
|
||||
private final int mergeSegments(int minSegment, int end)
|
||||
throws CorruptIndexException, IOException {
|
||||
|
||||
// We may be called solely because there are deletes
|
||||
// pending, in which case doMerge is false:
|
||||
boolean doMerge = end > 0;
|
||||
final String mergedName = newSegmentName();
|
||||
|
||||
SegmentMerger merger = null;
|
||||
|
||||
final List ramSegmentsToDelete = new ArrayList();
|
||||
|
||||
SegmentInfo newSegment = null;
|
||||
|
||||
int mergedDocCount = 0;
|
||||
boolean anyDeletes = (bufferedDeleteTerms.size() != 0);
|
||||
|
||||
// This is try/finally to make sure merger's readers are closed:
|
||||
try {
|
||||
|
||||
if (doMerge) {
|
||||
if (infoStream != null) infoStream.print("merging segments");
|
||||
merger = new SegmentMerger(this, mergedName);
|
||||
if (infoStream != null) infoStream.print("merging segments");
|
||||
|
||||
for (int i = minSegment; i < end; i++) {
|
||||
SegmentInfo si = sourceSegments.info(i);
|
||||
if (infoStream != null)
|
||||
infoStream.print(" " + si.name + " (" + si.docCount + " docs)");
|
||||
IndexReader reader = SegmentReader.get(si, MERGE_READ_BUFFER_SIZE); // no need to set deleter (yet)
|
||||
merger.add(reader);
|
||||
if (reader.directory() == this.ramDirectory) {
|
||||
ramSegmentsToDelete.add(si);
|
||||
}
|
||||
}
|
||||
// Check whether this merge will allow us to skip
|
||||
// merging the doc stores (stored field & vectors).
|
||||
// This is a very substantial optimization (saves tons
|
||||
// of IO) that can only be applied with
|
||||
// autoCommit=false.
|
||||
|
||||
Directory lastDir = directory;
|
||||
String lastDocStoreSegment = null;
|
||||
boolean mergeDocStores = false;
|
||||
boolean doFlushDocStore = false;
|
||||
int next = -1;
|
||||
|
||||
// Test each segment to be merged
|
||||
for (int i = minSegment; i < end; i++) {
|
||||
SegmentInfo si = segmentInfos.info(i);
|
||||
|
||||
// If it has deletions we must merge the doc stores
|
||||
if (si.hasDeletions())
|
||||
mergeDocStores = true;
|
||||
|
||||
// If it has its own (private) doc stores we must
|
||||
// merge the doc stores
|
||||
if (-1 == si.getDocStoreOffset())
|
||||
mergeDocStores = true;
|
||||
|
||||
// If it has a different doc store segment than
|
||||
// previous segments, we must merge the doc stores
|
||||
String docStoreSegment = si.getDocStoreSegment();
|
||||
if (docStoreSegment == null)
|
||||
mergeDocStores = true;
|
||||
else if (lastDocStoreSegment == null)
|
||||
lastDocStoreSegment = docStoreSegment;
|
||||
else if (!lastDocStoreSegment.equals(docStoreSegment))
|
||||
mergeDocStores = true;
|
||||
|
||||
// Segments' docStoreOffsets must be in-order,
|
||||
// contiguous. For the default merge policy now
|
||||
// this will always be the case but for an arbitrary
|
||||
// merge policy this may not be the case
|
||||
if (-1 == next)
|
||||
next = si.getDocStoreOffset() + si.docCount;
|
||||
else if (next != si.getDocStoreOffset())
|
||||
mergeDocStores = true;
|
||||
else
|
||||
next = si.getDocStoreOffset() + si.docCount;
|
||||
|
||||
// If the segment comes from a different directory
|
||||
// we must merge
|
||||
if (lastDir != si.dir)
|
||||
mergeDocStores = true;
|
||||
|
||||
// If the segment is referencing the current "live"
|
||||
// doc store outputs then we must merge
|
||||
if (si.getDocStoreOffset() != -1 && si.getDocStoreSegment().equals(docWriter.getDocStoreSegment()))
|
||||
doFlushDocStore = true;
|
||||
}
|
||||
|
||||
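Condensed into one predicate, the per-segment checks in the loop above say that a segment only keeps the shared-doc-store optimization alive if all of the following hold (an illustrative paraphrase, not code from this patch; the helper class, method name and parameters are made up, and the first-iteration bookkeeping of lastDocStoreSegment/next is glossed over):

class DocStoreMergeSketch {
  // Any violation of these conditions forces mergeDocStores = true above.
  static boolean keepsSharedDocStores(boolean hasDeletions, int docStoreOffset,
                                      String docStoreSegment, String lastDocStoreSegment,
                                      int expectedNextOffset, boolean sameDirectory) {
    return !hasDeletions                                 // deletions force a doc store merge
        && docStoreOffset != -1                          // segment must use a shared store
        && docStoreSegment != null
        && docStoreSegment.equals(lastDocStoreSegment)   // same store as the other segments
        && docStoreOffset == expectedNextOffset          // contiguous, in-order doc range
        && sameDirectory;                                // same Directory as the writer
  }
}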
final int docStoreOffset;
|
||||
final String docStoreSegment;
|
||||
final boolean docStoreIsCompoundFile;
|
||||
if (mergeDocStores) {
|
||||
docStoreOffset = -1;
|
||||
docStoreSegment = null;
|
||||
docStoreIsCompoundFile = false;
|
||||
} else {
|
||||
SegmentInfo si = segmentInfos.info(minSegment);
|
||||
docStoreOffset = si.getDocStoreOffset();
|
||||
docStoreSegment = si.getDocStoreSegment();
|
||||
docStoreIsCompoundFile = si.getDocStoreIsCompoundFile();
|
||||
}
|
||||
|
||||
if (mergeDocStores && doFlushDocStore)
|
||||
// SegmentMerger intends to merge the doc stores
|
||||
// (stored fields, vectors), and at least one of the
|
||||
// segments to be merged refers to the currently
|
||||
// live doc stores.
|
||||
flushDocStores();
|
||||
|
||||
merger = new SegmentMerger(this, mergedName);
|
||||
|
||||
for (int i = minSegment; i < end; i++) {
|
||||
SegmentInfo si = segmentInfos.info(i);
|
||||
if (infoStream != null)
|
||||
infoStream.print(" " + si.name + " (" + si.docCount + " docs)");
|
||||
IndexReader reader = SegmentReader.get(si, MERGE_READ_BUFFER_SIZE, mergeDocStores); // no need to set deleter (yet)
|
||||
merger.add(reader);
|
||||
}
|
||||
|
||||
SegmentInfos rollback = null;
|
||||
|
@ -1879,65 +2190,32 @@ public class IndexWriter {
|
|||
// if we hit exception when doing the merge:
|
||||
try {
|
||||
|
||||
if (doMerge) {
|
||||
mergedDocCount = merger.merge();
|
||||
mergedDocCount = merger.merge(mergeDocStores);
|
||||
|
||||
if (infoStream != null) {
|
||||
infoStream.println(" into "+mergedName+" ("+mergedDocCount+" docs)");
|
||||
}
|
||||
|
||||
newSegment = new SegmentInfo(mergedName, mergedDocCount,
|
||||
directory, false, true);
|
||||
if (infoStream != null) {
|
||||
infoStream.println(" into "+mergedName+" ("+mergedDocCount+" docs)");
|
||||
}
|
||||
|
||||
if (sourceSegments != ramSegmentInfos || anyDeletes) {
|
||||
// Now save the SegmentInfo instances that
|
||||
// we are replacing:
|
||||
rollback = (SegmentInfos) segmentInfos.clone();
|
||||
}
|
||||
newSegment = new SegmentInfo(mergedName, mergedDocCount,
|
||||
directory, false, true,
|
||||
docStoreOffset,
|
||||
docStoreSegment,
|
||||
docStoreIsCompoundFile);
|
||||
|
||||
if (doMerge) {
|
||||
if (sourceSegments == ramSegmentInfos) {
|
||||
segmentInfos.addElement(newSegment);
|
||||
} else {
|
||||
for (int i = end-1; i > minSegment; i--) // remove old infos & add new
|
||||
sourceSegments.remove(i);
|
||||
rollback = (SegmentInfos) segmentInfos.clone();
|
||||
|
||||
segmentInfos.set(minSegment, newSegment);
|
||||
}
|
||||
}
|
||||
for (int i = end-1; i > minSegment; i--) // remove old infos & add new
|
||||
segmentInfos.remove(i);
|
||||
|
||||
if (sourceSegments == ramSegmentInfos) {
|
||||
maybeApplyDeletes(doMerge);
|
||||
doAfterFlush();
|
||||
}
|
||||
segmentInfos.set(minSegment, newSegment);
|
||||
|
||||
checkpoint();
|
||||
|
||||
success = true;
|
||||
|
||||
} finally {
|
||||
|
||||
if (success) {
|
||||
// The non-ram-segments case is already committed
|
||||
// (above), so all the remains for ram segments case
|
||||
// is to clear the ram segments:
|
||||
if (sourceSegments == ramSegmentInfos) {
|
||||
ramSegmentInfos.removeAllElements();
|
||||
}
|
||||
} else {
|
||||
|
||||
// Must rollback so our state matches index:
|
||||
if (sourceSegments == ramSegmentInfos && !anyDeletes) {
|
||||
// Simple case: newSegment may or may not have
|
||||
// been added to the end of our segment infos,
|
||||
// so just check & remove if so:
|
||||
if (newSegment != null &&
|
||||
segmentInfos.size() > 0 &&
|
||||
segmentInfos.info(segmentInfos.size()-1) == newSegment) {
|
||||
segmentInfos.remove(segmentInfos.size()-1);
|
||||
}
|
||||
} else if (rollback != null) {
|
||||
if (!success) {
|
||||
if (rollback != null) {
|
||||
// Rollback the individual SegmentInfo
|
||||
// instances, but keep original SegmentInfos
|
||||
// instance (so we don't try to write again the
|
||||
|
@ -1952,16 +2230,13 @@ public class IndexWriter {
|
|||
}
|
||||
} finally {
|
||||
// close readers before we attempt to delete now-obsolete segments
|
||||
if (doMerge) merger.closeReaders();
|
||||
merger.closeReaders();
|
||||
}
|
||||
|
||||
// Delete the RAM segments
|
||||
deleter.deleteDirect(ramDirectory, ramSegmentsToDelete);
|
||||
|
||||
// Give deleter a chance to remove files now.
|
||||
deleter.checkpoint(segmentInfos, autoCommit);
|
||||
|
||||
if (useCompoundFile && doMerge) {
|
||||
if (useCompoundFile) {
|
||||
|
||||
boolean success = false;
|
||||
|
||||
|
@ -1988,19 +2263,23 @@ public class IndexWriter {
|
|||
}
|
||||
|
||||
// Called during flush to apply any buffered deletes. If
|
||||
// doMerge is true then a new segment was just created and
|
||||
// flushed from the ram segments.
|
||||
private final void maybeApplyDeletes(boolean doMerge) throws CorruptIndexException, IOException {
|
||||
// flushedNewSegment is true then a new segment was just
|
||||
// created and flushed from the ram segments, so we will
|
||||
// selectively apply the deletes to that new segment.
|
||||
private final void applyDeletes(boolean flushedNewSegment) throws CorruptIndexException, IOException {
|
||||
|
||||
if (bufferedDeleteTerms.size() > 0) {
|
||||
if (infoStream != null)
|
||||
infoStream.println("flush " + numBufferedDeleteTerms + " buffered deleted terms on "
|
||||
+ segmentInfos.size() + " segments.");
|
||||
|
||||
if (doMerge) {
|
||||
if (flushedNewSegment) {
|
||||
IndexReader reader = null;
|
||||
try {
|
||||
reader = SegmentReader.get(segmentInfos.info(segmentInfos.size() - 1));
|
||||
// Open readers w/o opening the stored fields /
|
||||
// vectors because these files may still be held
|
||||
// open for writing by docWriter
|
||||
reader = SegmentReader.get(segmentInfos.info(segmentInfos.size() - 1), false);
|
||||
|
||||
// Apply delete terms to the segment just flushed from ram
|
||||
// apply appropriately so that a delete term is only applied to
|
||||
|
@ -2018,14 +2297,14 @@ public class IndexWriter {
|
|||
}
|
||||
|
||||
int infosEnd = segmentInfos.size();
|
||||
if (doMerge) {
|
||||
if (flushedNewSegment) {
|
||||
infosEnd--;
|
||||
}
|
||||
|
||||
for (int i = 0; i < infosEnd; i++) {
|
||||
IndexReader reader = null;
|
||||
try {
|
||||
reader = SegmentReader.get(segmentInfos.info(i));
|
||||
reader = SegmentReader.get(segmentInfos.info(i), false);
|
||||
|
||||
// Apply delete terms to disk segments
|
||||
// except the one just flushed from ram.
|
||||
|
@ -2049,7 +2328,12 @@ public class IndexWriter {
|
|||
|
||||
private final boolean checkNonDecreasingLevels(int start) {
|
||||
int lowerBound = -1;
|
||||
int upperBound = minMergeDocs;
|
||||
int upperBound = docWriter.getMaxBufferedDocs();
|
||||
|
||||
/* new merge policy
|
||||
if (upperBound == 0)
|
||||
upperBound = 10;
|
||||
*/
|
||||
|
||||
for (int i = segmentInfos.size() - 1; i >= start; i--) {
|
||||
int docCount = segmentInfos.info(i).docCount;
|
||||
|
@ -2098,10 +2382,11 @@ public class IndexWriter {
|
|||
// well as the disk segments.
|
||||
private void bufferDeleteTerm(Term term) {
|
||||
Num num = (Num) bufferedDeleteTerms.get(term);
|
||||
int numDoc = docWriter.getNumDocsInRAM();
|
||||
if (num == null) {
|
||||
bufferedDeleteTerms.put(term, new Num(ramSegmentInfos.size()));
|
||||
bufferedDeleteTerms.put(term, new Num(numDoc));
|
||||
} else {
|
||||
num.setNum(ramSegmentInfos.size());
|
||||
num.setNum(numDoc);
|
||||
}
|
||||
numBufferedDeleteTerms++;
|
||||
}
|
||||
|
|
|
@ -65,6 +65,12 @@ final class SegmentInfo {
|
|||
private List files; // cached list of files that this segment uses
|
||||
// in the Directory
|
||||
|
||||
private int docStoreOffset; // if this segment shares stored fields & vectors, this
|
||||
// offset is where in that file this segment's docs begin
|
||||
private String docStoreSegment; // name used to derive fields/vectors file we share with
|
||||
// other segments
|
||||
private boolean docStoreIsCompoundFile; // whether doc store files are stored in compound file (*.cfx)
|
||||
|
||||
public SegmentInfo(String name, int docCount, Directory dir) {
|
||||
this.name = name;
|
||||
this.docCount = docCount;
|
||||
|
@ -73,13 +79,25 @@ final class SegmentInfo {
|
|||
isCompoundFile = CHECK_DIR;
|
||||
preLockless = true;
|
||||
hasSingleNormFile = false;
|
||||
docStoreOffset = -1;
|
||||
docStoreSegment = name;
|
||||
docStoreIsCompoundFile = false;
|
||||
}
|
||||
|
||||
public SegmentInfo(String name, int docCount, Directory dir, boolean isCompoundFile, boolean hasSingleNormFile) {
|
||||
this(name, docCount, dir, isCompoundFile, hasSingleNormFile, -1, null, false);
|
||||
}
|
||||
|
||||
public SegmentInfo(String name, int docCount, Directory dir, boolean isCompoundFile, boolean hasSingleNormFile,
|
||||
int docStoreOffset, String docStoreSegment, boolean docStoreIsCompoundFile) {
|
||||
this(name, docCount, dir);
|
||||
this.isCompoundFile = (byte) (isCompoundFile ? YES : NO);
|
||||
this.hasSingleNormFile = hasSingleNormFile;
|
||||
preLockless = false;
|
||||
this.docStoreOffset = docStoreOffset;
|
||||
this.docStoreSegment = docStoreSegment;
|
||||
this.docStoreIsCompoundFile = docStoreIsCompoundFile;
|
||||
assert docStoreOffset == -1 || docStoreSegment != null;
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -92,6 +110,8 @@ final class SegmentInfo {
|
|||
dir = src.dir;
|
||||
preLockless = src.preLockless;
|
||||
delGen = src.delGen;
|
||||
docStoreOffset = src.docStoreOffset;
|
||||
docStoreIsCompoundFile = src.docStoreIsCompoundFile;
|
||||
if (src.normGen == null) {
|
||||
normGen = null;
|
||||
} else {
|
||||
|
@ -116,6 +136,20 @@ final class SegmentInfo {
|
|||
docCount = input.readInt();
|
||||
if (format <= SegmentInfos.FORMAT_LOCKLESS) {
|
||||
delGen = input.readLong();
|
||||
if (format <= SegmentInfos.FORMAT_SHARED_DOC_STORE) {
|
||||
docStoreOffset = input.readInt();
|
||||
if (docStoreOffset != -1) {
|
||||
docStoreSegment = input.readString();
|
||||
docStoreIsCompoundFile = (1 == input.readByte());
|
||||
} else {
|
||||
docStoreSegment = name;
|
||||
docStoreIsCompoundFile = false;
|
||||
}
|
||||
} else {
|
||||
docStoreOffset = -1;
|
||||
docStoreSegment = name;
|
||||
docStoreIsCompoundFile = false;
|
||||
}
|
||||
if (format <= SegmentInfos.FORMAT_SINGLE_NORM_FILE) {
|
||||
hasSingleNormFile = (1 == input.readByte());
|
||||
} else {
|
||||
|
@ -138,6 +172,9 @@ final class SegmentInfo {
|
|||
isCompoundFile = CHECK_DIR;
|
||||
preLockless = true;
|
||||
hasSingleNormFile = false;
|
||||
docStoreOffset = -1;
|
||||
docStoreIsCompoundFile = false;
|
||||
docStoreSegment = null;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -369,6 +406,28 @@ final class SegmentInfo {
|
|||
}
|
||||
}
|
||||
|
||||
int getDocStoreOffset() {
|
||||
return docStoreOffset;
|
||||
}
|
||||
|
||||
boolean getDocStoreIsCompoundFile() {
|
||||
return docStoreIsCompoundFile;
|
||||
}
|
||||
|
||||
void setDocStoreIsCompoundFile(boolean v) {
|
||||
docStoreIsCompoundFile = v;
|
||||
files = null;
|
||||
}
|
||||
|
||||
String getDocStoreSegment() {
|
||||
return docStoreSegment;
|
||||
}
|
||||
|
||||
void setDocStoreOffset(int offset) {
|
||||
docStoreOffset = offset;
|
||||
files = null;
|
||||
}
|
||||
|
||||
/**
|
||||
* Save this segment's info.
|
||||
*/
|
||||
|
@ -377,6 +436,12 @@ final class SegmentInfo {
|
|||
output.writeString(name);
|
||||
output.writeInt(docCount);
|
||||
output.writeLong(delGen);
|
||||
output.writeInt(docStoreOffset);
|
||||
if (docStoreOffset != -1) {
|
||||
output.writeString(docStoreSegment);
|
||||
output.writeByte((byte) (docStoreIsCompoundFile ? 1:0));
|
||||
}
|
||||
|
||||
output.writeByte((byte) (hasSingleNormFile ? 1:0));
|
||||
if (normGen == null) {
|
||||
output.writeInt(NO);
|
||||
|
@ -389,6 +454,11 @@ final class SegmentInfo {
|
|||
output.writeByte(isCompoundFile);
|
||||
}
|
||||
|
||||
private void addIfExists(List files, String fileName) throws IOException {
|
||||
if (dir.fileExists(fileName))
|
||||
files.add(fileName);
|
||||
}
|
||||
|
||||
/*
|
||||
* Return all files referenced by this SegmentInfo. The
|
||||
* returned List is a locally cached List so you should not
|
||||
|
@ -409,13 +479,28 @@ final class SegmentInfo {
|
|||
if (useCompoundFile) {
|
||||
files.add(name + "." + IndexFileNames.COMPOUND_FILE_EXTENSION);
|
||||
} else {
|
||||
for (int i = 0; i < IndexFileNames.INDEX_EXTENSIONS_IN_COMPOUND_FILE.length; i++) {
|
||||
String ext = IndexFileNames.INDEX_EXTENSIONS_IN_COMPOUND_FILE[i];
|
||||
String fileName = name + "." + ext;
|
||||
if (dir.fileExists(fileName)) {
|
||||
files.add(fileName);
|
||||
}
|
||||
final String[] exts = IndexFileNames.NON_STORE_INDEX_EXTENSIONS;
|
||||
for(int i=0;i<exts.length;i++)
|
||||
addIfExists(files, name + "." + exts[i]);
|
||||
}
|
||||
|
||||
if (docStoreOffset != -1) {
|
||||
// We are sharing doc stores (stored fields, term
|
||||
// vectors) with other segments
|
||||
assert docStoreSegment != null;
|
||||
if (docStoreIsCompoundFile) {
|
||||
files.add(docStoreSegment + "." + IndexFileNames.COMPOUND_FILE_STORE_EXTENSION);
|
||||
} else {
|
||||
final String[] exts = IndexFileNames.STORE_INDEX_EXTENSIONS;
|
||||
for(int i=0;i<exts.length;i++)
|
||||
addIfExists(files, docStoreSegment + "." + exts[i]);
|
||||
}
|
||||
} else if (!useCompoundFile) {
|
||||
// We are not sharing, and, these files were not
|
||||
// included in the compound file
|
||||
final String[] exts = IndexFileNames.STORE_INDEX_EXTENSIONS;
|
||||
for(int i=0;i<exts.length;i++)
|
||||
addIfExists(files, name + "." + exts[i]);
|
||||
}
|
||||
|
||||
String delFileName = IndexFileNames.fileNameFromGeneration(name, "." + IndexFileNames.DELETES_EXTENSION, delGen);
|
||||
|
|
|
@@ -51,8 +51,12 @@ final class SegmentInfos extends Vector {
*/
public static final int FORMAT_SINGLE_NORM_FILE = -3;

/** This format allows multiple segments to share a single
* vectors and stored fields file. */
public static final int FORMAT_SHARED_DOC_STORE = -4;

/* This must always point to the most recent file format. */
private static final int CURRENT_FORMAT = FORMAT_SINGLE_NORM_FILE;
private static final int CURRENT_FORMAT = FORMAT_SHARED_DOC_STORE;

public int counter = 0; // used to name new segments
/**

@@ -52,6 +52,12 @@ final class SegmentMerger {

private int mergedDocs;

// Whether we should merge doc stores (stored fields and
// vectors files). When all segments we are merging
// already share the same doc store files, we don't need
// to merge the doc stores.
private boolean mergeDocStores;

/** This ctor used only by test code.
*
* @param dir The Directory to merge the other segments into
@@ -92,13 +98,27 @@ final class SegmentMerger {
* @throws IOException if there is a low-level IO error
*/
final int merge() throws CorruptIndexException, IOException {
int value;
return merge(true);
}

/**
* Merges the readers specified by the {@link #add} method
* into the directory passed to the constructor.
* @param mergeDocStores if false, we will not merge the
* stored fields nor vectors files
* @return The number of documents that were merged
* @throws CorruptIndexException if the index is corrupt
* @throws IOException if there is a low-level IO error
*/
final int merge(boolean mergeDocStores) throws CorruptIndexException, IOException {

this.mergeDocStores = mergeDocStores;

mergedDocs = mergeFields();
mergeTerms();
mergeNorms();

if (fieldInfos.hasVectors())
if (mergeDocStores && fieldInfos.hasVectors())
mergeVectors();

return mergedDocs;
@ -126,7 +146,10 @@ final class SegmentMerger {
|
|||
|
||||
// Basic files
|
||||
for (int i = 0; i < IndexFileNames.COMPOUND_EXTENSIONS.length; i++) {
|
||||
files.add(segment + "." + IndexFileNames.COMPOUND_EXTENSIONS[i]);
|
||||
String ext = IndexFileNames.COMPOUND_EXTENSIONS[i];
|
||||
if (mergeDocStores || (!ext.equals(IndexFileNames.FIELDS_EXTENSION) &&
|
||||
!ext.equals(IndexFileNames.FIELDS_INDEX_EXTENSION)))
|
||||
files.add(segment + "." + ext);
|
||||
}
|
||||
|
||||
// Fieldable norm files
|
||||
|
@ -139,7 +162,7 @@ final class SegmentMerger {
|
|||
}
|
||||
|
||||
// Vector files
|
||||
if (fieldInfos.hasVectors()) {
|
||||
if (fieldInfos.hasVectors() && mergeDocStores) {
|
||||
for (int i = 0; i < IndexFileNames.VECTOR_EXTENSIONS.length; i++) {
|
||||
files.add(segment + "." + IndexFileNames.VECTOR_EXTENSIONS[i]);
|
||||
}
|
||||
|
@ -173,7 +196,20 @@ final class SegmentMerger {
|
|||
* @throws IOException if there is a low-level IO error
|
||||
*/
|
||||
private final int mergeFields() throws CorruptIndexException, IOException {
|
||||
fieldInfos = new FieldInfos(); // merge field names
|
||||
|
||||
if (!mergeDocStores) {
// When we are not merging the doc stores, all segments were
// written as part of a single autoCommit=false IndexWriter
// session, so their field name -> number mappings are the
// same. So, we start with the fieldInfos of the last segment
// in this case, to keep that numbering.
final SegmentReader sr = (SegmentReader) readers.elementAt(readers.size()-1);
fieldInfos = (FieldInfos) sr.fieldInfos.clone();
} else {
fieldInfos = new FieldInfos(); // merge field names
}
|
||||
int docCount = 0;
|
||||
for (int i = 0; i < readers.size(); i++) {
|
||||
IndexReader reader = (IndexReader) readers.elementAt(i);
|
||||
|
@ -187,30 +223,40 @@ final class SegmentMerger {
|
|||
}
|
||||
fieldInfos.write(directory, segment + ".fnm");
|
||||
|
||||
FieldsWriter fieldsWriter = // merge field values
|
||||
new FieldsWriter(directory, segment, fieldInfos);
|
||||
if (mergeDocStores) {
|
||||
|
||||
// for merging we don't want to compress/uncompress the data, so to tell the FieldsReader that we're
|
||||
// in merge mode, we use this FieldSelector
|
||||
FieldSelector fieldSelectorMerge = new FieldSelector() {
|
||||
public FieldSelectorResult accept(String fieldName) {
|
||||
return FieldSelectorResult.LOAD_FOR_MERGE;
|
||||
}
|
||||
};
|
||||
FieldsWriter fieldsWriter = // merge field values
|
||||
new FieldsWriter(directory, segment, fieldInfos);
|
||||
|
||||
try {
|
||||
for (int i = 0; i < readers.size(); i++) {
|
||||
IndexReader reader = (IndexReader) readers.elementAt(i);
|
||||
int maxDoc = reader.maxDoc();
|
||||
for (int j = 0; j < maxDoc; j++)
|
||||
if (!reader.isDeleted(j)) { // skip deleted docs
|
||||
fieldsWriter.addDocument(reader.document(j, fieldSelectorMerge));
|
||||
docCount++;
|
||||
// for merging we don't want to compress/uncompress the data, so to tell the FieldsReader that we're
|
||||
// in merge mode, we use this FieldSelector
|
||||
FieldSelector fieldSelectorMerge = new FieldSelector() {
|
||||
public FieldSelectorResult accept(String fieldName) {
|
||||
return FieldSelectorResult.LOAD_FOR_MERGE;
|
||||
}
|
||||
};
|
||||
|
||||
try {
|
||||
for (int i = 0; i < readers.size(); i++) {
|
||||
IndexReader reader = (IndexReader) readers.elementAt(i);
|
||||
int maxDoc = reader.maxDoc();
|
||||
for (int j = 0; j < maxDoc; j++)
|
||||
if (!reader.isDeleted(j)) { // skip deleted docs
|
||||
fieldsWriter.addDocument(reader.document(j, fieldSelectorMerge));
|
||||
docCount++;
|
||||
}
|
||||
}
|
||||
} finally {
|
||||
fieldsWriter.close();
|
||||
}
|
||||
} finally {
|
||||
fieldsWriter.close();
|
||||
}
|
||||
|
||||
} else
|
||||
// If we are skipping the doc stores, that means there
|
||||
// are no deletions in any of these segments, so we
|
||||
// just sum numDocs() of each segment to get total docCount
|
||||
for (int i = 0; i < readers.size(); i++)
|
||||
docCount += ((IndexReader) readers.elementAt(i)).numDocs();
|
||||
|
||||
return docCount;
|
||||
}
|
||||
|
||||
|
@ -355,6 +401,7 @@ final class SegmentMerger {
|
|||
for (int i = 0; i < n; i++) {
|
||||
SegmentMergeInfo smi = smis[i];
|
||||
TermPositions postings = smi.getPositions();
|
||||
assert postings != null;
|
||||
int base = smi.base;
|
||||
int[] docMap = smi.getDocMap();
|
||||
postings.seek(smi.termEnum);
|
||||
|
|
|
@ -60,6 +60,7 @@ class SegmentReader extends IndexReader {
|
|||
|
||||
// Compound File Reader when based on a compound file segment
|
||||
CompoundFileReader cfsReader = null;
|
||||
CompoundFileReader storeCFSReader = null;
|
||||
|
||||
private class Norm {
|
||||
public Norm(IndexInput in, int number, long normSeek)
|
||||
|
@ -128,7 +129,15 @@ class SegmentReader extends IndexReader {
|
|||
* @throws IOException if there is a low-level IO error
|
||||
*/
|
||||
public static SegmentReader get(SegmentInfo si) throws CorruptIndexException, IOException {
|
||||
return get(si.dir, si, null, false, false, BufferedIndexInput.BUFFER_SIZE);
|
||||
return get(si.dir, si, null, false, false, BufferedIndexInput.BUFFER_SIZE, true);
|
||||
}
|
||||
|
||||
/**
|
||||
* @throws CorruptIndexException if the index is corrupt
|
||||
* @throws IOException if there is a low-level IO error
|
||||
*/
|
||||
public static SegmentReader get(SegmentInfo si, boolean doOpenStores) throws CorruptIndexException, IOException {
|
||||
return get(si.dir, si, null, false, false, BufferedIndexInput.BUFFER_SIZE, doOpenStores);
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -136,7 +145,15 @@ class SegmentReader extends IndexReader {
|
|||
* @throws IOException if there is a low-level IO error
|
||||
*/
|
||||
public static SegmentReader get(SegmentInfo si, int readBufferSize) throws CorruptIndexException, IOException {
|
||||
return get(si.dir, si, null, false, false, readBufferSize);
|
||||
return get(si.dir, si, null, false, false, readBufferSize, true);
|
||||
}
|
||||
|
||||
/**
|
||||
* @throws CorruptIndexException if the index is corrupt
|
||||
* @throws IOException if there is a low-level IO error
|
||||
*/
|
||||
public static SegmentReader get(SegmentInfo si, int readBufferSize, boolean doOpenStores) throws CorruptIndexException, IOException {
|
||||
return get(si.dir, si, null, false, false, readBufferSize, doOpenStores);
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -145,7 +162,7 @@ class SegmentReader extends IndexReader {
|
|||
*/
|
||||
public static SegmentReader get(SegmentInfos sis, SegmentInfo si,
|
||||
boolean closeDir) throws CorruptIndexException, IOException {
|
||||
return get(si.dir, si, sis, closeDir, true, BufferedIndexInput.BUFFER_SIZE);
|
||||
return get(si.dir, si, sis, closeDir, true, BufferedIndexInput.BUFFER_SIZE, true);
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -157,6 +174,19 @@ class SegmentReader extends IndexReader {
|
|||
boolean closeDir, boolean ownDir,
|
||||
int readBufferSize)
|
||||
throws CorruptIndexException, IOException {
|
||||
return get(dir, si, sis, closeDir, ownDir, readBufferSize, true);
|
||||
}
|
||||
|
||||
/**
|
||||
* @throws CorruptIndexException if the index is corrupt
|
||||
* @throws IOException if there is a low-level IO error
|
||||
*/
|
||||
public static SegmentReader get(Directory dir, SegmentInfo si,
|
||||
SegmentInfos sis,
|
||||
boolean closeDir, boolean ownDir,
|
||||
int readBufferSize,
|
||||
boolean doOpenStores)
|
||||
throws CorruptIndexException, IOException {
|
||||
SegmentReader instance;
|
||||
try {
|
||||
instance = (SegmentReader)IMPL.newInstance();
|
||||
|
@ -164,11 +194,11 @@ class SegmentReader extends IndexReader {
|
|||
throw new RuntimeException("cannot load SegmentReader class: " + e, e);
|
||||
}
|
||||
instance.init(dir, sis, closeDir, ownDir);
|
||||
instance.initialize(si, readBufferSize);
|
||||
instance.initialize(si, readBufferSize, doOpenStores);
|
||||
return instance;
|
||||
}
|
||||
|
||||
private void initialize(SegmentInfo si, int readBufferSize) throws CorruptIndexException, IOException {
|
||||
private void initialize(SegmentInfo si, int readBufferSize, boolean doOpenStores) throws CorruptIndexException, IOException {
|
||||
segment = si.name;
|
||||
this.si = si;
|
||||
|
||||
|
@ -178,17 +208,45 @@ class SegmentReader extends IndexReader {
|
|||
// Use compound file directory for some files, if it exists
|
||||
Directory cfsDir = directory();
|
||||
if (si.getUseCompoundFile()) {
|
||||
cfsReader = new CompoundFileReader(directory(), segment + ".cfs", readBufferSize);
|
||||
cfsReader = new CompoundFileReader(directory(), segment + "." + IndexFileNames.COMPOUND_FILE_EXTENSION, readBufferSize);
|
||||
cfsDir = cfsReader;
|
||||
}
|
||||
|
||||
final Directory storeDir;
|
||||
|
||||
if (doOpenStores) {
|
||||
if (si.getDocStoreOffset() != -1) {
|
||||
if (si.getDocStoreIsCompoundFile()) {
|
||||
storeCFSReader = new CompoundFileReader(directory(), si.getDocStoreSegment() + "." + IndexFileNames.COMPOUND_FILE_STORE_EXTENSION, readBufferSize);
|
||||
storeDir = storeCFSReader;
|
||||
} else {
|
||||
storeDir = directory();
|
||||
}
|
||||
} else {
|
||||
storeDir = cfsDir;
|
||||
}
|
||||
} else
|
||||
storeDir = null;
|
||||
|
||||
// No compound file exists - use the multi-file format
|
||||
fieldInfos = new FieldInfos(cfsDir, segment + ".fnm");
|
||||
fieldsReader = new FieldsReader(cfsDir, segment, fieldInfos, readBufferSize);
|
||||
|
||||
// Verify two sources of "maxDoc" agree:
|
||||
if (fieldsReader.size() != si.docCount) {
|
||||
throw new CorruptIndexException("doc counts differ for segment " + si.name + ": fieldsReader shows " + fieldsReader.size() + " but segmentInfo shows " + si.docCount);
|
||||
final String fieldsSegment;
|
||||
final Directory dir;
|
||||
|
||||
if (si.getDocStoreOffset() != -1)
|
||||
fieldsSegment = si.getDocStoreSegment();
|
||||
else
|
||||
fieldsSegment = segment;
|
||||
|
||||
if (doOpenStores) {
|
||||
fieldsReader = new FieldsReader(storeDir, fieldsSegment, fieldInfos, readBufferSize,
|
||||
si.getDocStoreOffset(), si.docCount);
|
||||
|
||||
// Verify two sources of "maxDoc" agree:
|
||||
if (si.getDocStoreOffset() == -1 && fieldsReader.size() != si.docCount) {
|
||||
throw new CorruptIndexException("doc counts differ for segment " + si.name + ": fieldsReader shows " + fieldsReader.size() + " but segmentInfo shows " + si.docCount);
|
||||
}
|
||||
}
|
||||
|
||||
tis = new TermInfosReader(cfsDir, segment, fieldInfos, readBufferSize);
|
||||
|
@ -209,8 +267,13 @@ class SegmentReader extends IndexReader {
|
|||
proxStream = cfsDir.openInput(segment + ".prx", readBufferSize);
|
||||
openNorms(cfsDir, readBufferSize);
|
||||
|
||||
if (fieldInfos.hasVectors()) { // open term vector files only as needed
|
||||
termVectorsReaderOrig = new TermVectorsReader(cfsDir, segment, fieldInfos, readBufferSize);
|
||||
if (doOpenStores && fieldInfos.hasVectors()) { // open term vector files only as needed
|
||||
final String vectorsSegment;
|
||||
if (si.getDocStoreOffset() != -1)
|
||||
vectorsSegment = si.getDocStoreSegment();
|
||||
else
|
||||
vectorsSegment = segment;
|
||||
termVectorsReaderOrig = new TermVectorsReader(storeDir, vectorsSegment, fieldInfos, readBufferSize, si.getDocStoreOffset(), si.docCount);
|
||||
}
|
||||
success = true;
|
||||
} finally {
|
||||
|
@ -273,6 +336,9 @@ class SegmentReader extends IndexReader {
|
|||
|
||||
if (cfsReader != null)
|
||||
cfsReader.close();
|
||||
|
||||
if (storeCFSReader != null)
|
||||
storeCFSReader.close();
|
||||
}
|
||||
|
||||
static boolean hasDeletions(SegmentInfo si) throws IOException {
|
||||
|
|
|
@ -34,6 +34,10 @@ class TermVectorsReader implements Cloneable {
|
|||
private IndexInput tvf;
|
||||
private int size;
|
||||
|
||||
// The docID offset where our docs begin in the index
|
||||
// file. This will be 0 if we have our own private file.
|
||||
private int docStoreOffset;
|
||||
|
||||
private int tvdFormat;
|
||||
private int tvfFormat;
|
||||
|
||||
|
@ -43,6 +47,11 @@ class TermVectorsReader implements Cloneable {
|
|||
}
|
||||
|
||||
TermVectorsReader(Directory d, String segment, FieldInfos fieldInfos, int readBufferSize)
|
||||
throws CorruptIndexException, IOException {
|
||||
this(d, segment, fieldInfos, BufferedIndexInput.BUFFER_SIZE, -1, 0);
|
||||
}
|
||||
|
||||
TermVectorsReader(Directory d, String segment, FieldInfos fieldInfos, int readBufferSize, int docStoreOffset, int size)
|
||||
throws CorruptIndexException, IOException {
|
||||
if (d.fileExists(segment + TermVectorsWriter.TVX_EXTENSION)) {
|
||||
tvx = d.openInput(segment + TermVectorsWriter.TVX_EXTENSION, readBufferSize);
|
||||
|
@ -51,7 +60,16 @@ class TermVectorsReader implements Cloneable {
|
|||
tvdFormat = checkValidFormat(tvd);
|
||||
tvf = d.openInput(segment + TermVectorsWriter.TVF_EXTENSION, readBufferSize);
|
||||
tvfFormat = checkValidFormat(tvf);
|
||||
size = (int) tvx.length() / 8;
|
||||
if (-1 == docStoreOffset) {
|
||||
this.docStoreOffset = 0;
|
||||
this.size = (int) (tvx.length() / 8);
|
||||
} else {
|
||||
this.docStoreOffset = docStoreOffset;
|
||||
this.size = size;
|
||||
// Verify the file is long enough to hold all of our
|
||||
// docs
|
||||
assert ((int) (tvx.length()/8)) >= size + docStoreOffset;
|
||||
}
|
||||
}
|
||||
|
||||
this.fieldInfos = fieldInfos;
|
||||
|
@ -102,7 +120,7 @@ class TermVectorsReader implements Cloneable {
|
|||
//We don't need to do this in other seeks because we already have the
|
||||
// file pointer
|
||||
//that was written in another file
|
||||
tvx.seek((docNum * 8L) + TermVectorsWriter.FORMAT_SIZE);
|
||||
tvx.seek(((docNum + docStoreOffset) * 8L) + TermVectorsWriter.FORMAT_SIZE);
|
||||
//System.out.println("TVX Pointer: " + tvx.getFilePointer());
|
||||
long position = tvx.readLong();
|
||||
|
||||
|
@ -154,7 +172,7 @@ class TermVectorsReader implements Cloneable {
|
|||
// Check if no term vectors are available for this segment at all
|
||||
if (tvx != null) {
|
||||
//We need to offset by
|
||||
tvx.seek((docNum * 8L) + TermVectorsWriter.FORMAT_SIZE);
|
||||
tvx.seek(((docNum + docStoreOffset) * 8L) + TermVectorsWriter.FORMAT_SIZE);
|
||||
long position = tvx.readLong();
|
||||
|
||||
tvd.seek(position);
|
||||
|
|
|
@@ -125,6 +125,31 @@
}
}

/** Writes a sequence of UTF-8 encoded characters from a char[].
* @param s the source of the characters
* @param start the first character in the sequence
* @param length the number of characters in the sequence
* @see IndexInput#readChars(char[],int,int)
*/
public void writeChars(char[] s, int start, int length)
throws IOException {
final int end = start + length;
for (int i = start; i < end; i++) {
final int code = (int)s[i];
if (code >= 0x01 && code <= 0x7F)
writeByte((byte)code);
else if (((code >= 0x80) && (code <= 0x7FF)) || code == 0) {
writeByte((byte)(0xC0 | (code >> 6)));
writeByte((byte)(0x80 | (code & 0x3F)));
} else {
writeByte((byte)(0xE0 | (code >>> 12)));
writeByte((byte)(0x80 | ((code >> 6) & 0x3F)));
writeByte((byte)(0x80 | (code & 0x3F)));
}
}
}


/** Forces any buffered output to be written. */
public abstract void flush() throws IOException;

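A small round-trip sketch of the new method (the file name and the standalone class are illustrative, not part of this patch); IndexInput.readChars, referenced in the javadoc above, decodes the same byte sequence:

import org.apache.lucene.store.IndexInput;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.store.RAMDirectory;

public class WriteCharsSketch {
  public static void main(String[] args) throws Exception {
    RAMDirectory dir = new RAMDirectory();
    IndexOutput out = dir.createOutput("chars.test");   // illustrative file name
    char[] buffer = "doc store".toCharArray();
    out.writeChars(buffer, 0, buffer.length);           // new char[] overload
    out.close();

    IndexInput in = dir.openInput("chars.test");
    char[] read = new char[buffer.length];
    in.readChars(read, 0, read.length);                 // reads the chars back
    in.close();
  }
}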
@@ -60,6 +60,15 @@
Lucene will not be able to read the index.
</p>

<p>
In version 2.3, the file format was changed to allow
segments to share a single set of doc store (vectors &
stored fields) files. This allows for faster indexing
in certain cases. The change is fully backwards
compatible (in the same way as the lock-less commits
change in 2.1).
</p>

</section>

<section id="Definitions"><title>Definitions</title>
@@ -809,9 +818,15 @@
NormGen<sup>NumField</sup>,
IsCompoundFile><sup>SegCount</sup>
</p>
<p>
<b>2.3 and above:</b>
Segments --> Format, Version, NameCounter, SegCount, <SegName, SegSize, DelGen, DocStoreOffset, [DocStoreSegment, DocStoreIsCompoundFile], HasSingleNormFile, NumField,
NormGen<sup>NumField</sup>,
IsCompoundFile><sup>SegCount</sup>
</p>

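To make the new entry layout concrete, here is a condensed sketch of how one segment's entry is read in the 2.3 format; it paraphrases the SegmentInfo constructor change earlier in this patch rather than quoting it (the class, method name and field declarations are illustrative):

import java.io.IOException;
import org.apache.lucene.store.IndexInput;

class SegmentEntrySketch {
  String name, docStoreSegment;
  int docCount, docStoreOffset;
  long delGen;
  boolean docStoreIsCompoundFile, hasSingleNormFile;

  // Condensed paraphrase of reading one 2.3-format segment entry.
  void readEntry(IndexInput input) throws IOException {
    name = input.readString();                            // SegName
    docCount = input.readInt();                           // SegSize
    delGen = input.readLong();                            // DelGen
    docStoreOffset = input.readInt();                     // DocStoreOffset
    if (docStoreOffset != -1) {                           // shared doc store
      docStoreSegment = input.readString();               // DocStoreSegment
      docStoreIsCompoundFile = (1 == input.readByte());   // DocStoreIsCompoundFile
    } else {                                              // private doc store
      docStoreSegment = name;
      docStoreIsCompoundFile = false;
    }
    hasSingleNormFile = (1 == input.readByte());          // HasSingleNormFile
    // NumField, NormGen and IsCompoundFile follow as in the 2.1 format.
  }
}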
<p>
Format, NameCounter, SegCount, SegSize, NumField --> Int32
Format, NameCounter, SegCount, SegSize, NumField, DocStoreOffset --> Int32
</p>

<p>
@@ -819,11 +834,11 @@
</p>

<p>
SegName --> String
SegName, DocStoreSegment --> String
</p>

<p>
IsCompoundFile, HasSingleNormFile --> Int8
IsCompoundFile, HasSingleNormFile, DocStoreIsCompoundFile --> Int8
</p>

<p>
@@ -889,6 +904,29 @@
"Normalization Factors" below for details.
</p>

<p>
DocStoreOffset, DocStoreSegment,
DocStoreIsCompoundFile: If DocStoreOffset is -1,
this segment has its own doc store (stored fields
values and term vectors) files and DocStoreSegment
and DocStoreIsCompoundFile are not stored. In
this case all files for stored field values
(<tt>*.fdt</tt> and <tt>*.fdx</tt>) and term
vectors (<tt>*.tvf</tt>, <tt>*.tvd</tt> and
<tt>*.tvx</tt>) will be stored with this segment.
Otherwise, DocStoreSegment is the name of the
segment that has the shared doc store files;
DocStoreIsCompoundFile is 1 if that segment is
stored in compound file format (as a <tt>.cfx</tt>
file); and DocStoreOffset is the starting document
in the shared doc store files where this segment's
documents begin. In this case, this segment does
not store its own doc store files but instead
shares a single set of these files with other
segments.
</p>
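For illustration, this is how those three values determine which files hold a segment's stored fields and term vectors; it mirrors the shared-doc-store branch of the SegmentInfo.files() change earlier in this patch (the helper class and method are illustrative, and the per-segment compound-file case is ignored for brevity):

import java.util.ArrayList;
import java.util.List;

class DocStoreFilesSketch {
  // Resolve the doc store file names for one segment from DocStoreOffset,
  // DocStoreSegment and DocStoreIsCompoundFile.
  static List docStoreFiles(String segName, int docStoreOffset,
                            String docStoreSegment, boolean docStoreIsCompoundFile) {
    List files = new ArrayList();
    if (docStoreOffset != -1 && docStoreIsCompoundFile) {
      files.add(docStoreSegment + ".cfx");   // shared compound doc store
      return files;
    }
    String base = (docStoreOffset != -1) ? docStoreSegment : segName;
    files.add(base + ".fdt");                // stored fields data
    files.add(base + ".fdx");                // stored fields index
    files.add(base + ".tvx");                // term vectors index
    files.add(base + ".tvd");                // term vectors documents
    files.add(base + ".tvf");                // term vectors fields
    return files;
  }
}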

</section>

<section id="Lock File"><title>Lock File</title>
@@ -947,6 +985,14 @@
<p>FileData --> raw file data</p>
<p>The raw file data is the data from the individual files named above.</p>

<p>Starting with Lucene 2.3, doc store files (stored
field values and term vectors) can be shared in a
single set of files for more than one segment. When
compound file is enabled, these shared files will be
added into a single compound file (same format as
above) but with the extension <tt>.cfx</tt>.
</p>

</section>

</section>

@ -106,8 +106,12 @@ public class TestBackwardsCompatibility extends TestCase
|
|||
rmDir(dirName);
|
||||
}
|
||||
|
||||
final String[] oldNames = {"prelockless.cfs",
|
||||
"prelockless.nocfs",
|
||||
"presharedstores.cfs",
|
||||
"presharedstores.nocfs"};
|
||||
|
||||
public void testSearchOldIndex() throws IOException {
|
||||
String[] oldNames = {"prelockless.cfs", "prelockless.nocfs"};
|
||||
for(int i=0;i<oldNames.length;i++) {
|
||||
String dirName = "src/test/org/apache/lucene/index/index." + oldNames[i];
|
||||
unzip(dirName, oldNames[i]);
|
||||
|
@ -117,7 +121,6 @@ public class TestBackwardsCompatibility extends TestCase
|
|||
}
|
||||
|
||||
public void testIndexOldIndexNoAdds() throws IOException {
|
||||
String[] oldNames = {"prelockless.cfs", "prelockless.nocfs"};
|
||||
for(int i=0;i<oldNames.length;i++) {
|
||||
String dirName = "src/test/org/apache/lucene/index/index." + oldNames[i];
|
||||
unzip(dirName, oldNames[i]);
|
||||
|
@ -131,7 +134,6 @@ public class TestBackwardsCompatibility extends TestCase
|
|||
}
|
||||
|
||||
public void testIndexOldIndex() throws IOException {
|
||||
String[] oldNames = {"prelockless.cfs", "prelockless.nocfs"};
|
||||
for(int i=0;i<oldNames.length;i++) {
|
||||
String dirName = "src/test/org/apache/lucene/index/index." + oldNames[i];
|
||||
unzip(dirName, oldNames[i]);
|
||||
|
@ -314,6 +316,7 @@ public class TestBackwardsCompatibility extends TestCase
|
|||
boolean autoCommit = 0 == pass;
|
||||
|
||||
IndexWriter writer = new IndexWriter(dir, autoCommit, new WhitespaceAnalyzer(), true);
|
||||
writer.setRAMBufferSizeMB(16.0);
|
||||
//IndexWriter writer = new IndexWriter(dir, new WhitespaceAnalyzer(), true);
|
||||
for(int i=0;i<35;i++) {
|
||||
addDoc(writer, i);
|
||||
|
@ -337,8 +340,8 @@ public class TestBackwardsCompatibility extends TestCase
|
|||
// figure out which field number corresponds to
|
||||
// "content", and then set our expected file names below
|
||||
// accordingly:
|
||||
CompoundFileReader cfsReader = new CompoundFileReader(dir, "_2.cfs");
|
||||
FieldInfos fieldInfos = new FieldInfos(cfsReader, "_2.fnm");
|
||||
CompoundFileReader cfsReader = new CompoundFileReader(dir, "_0.cfs");
|
||||
FieldInfos fieldInfos = new FieldInfos(cfsReader, "_0.fnm");
|
||||
int contentFieldIndex = -1;
|
||||
for(int i=0;i<fieldInfos.size();i++) {
|
||||
FieldInfo fi = fieldInfos.fieldInfo(i);
|
||||
|
@ -351,17 +354,15 @@ public class TestBackwardsCompatibility extends TestCase
|
|||
assertTrue("could not locate the 'content' field number in the _2.cfs segment", contentFieldIndex != -1);
|
||||
|
||||
// Now verify file names:
|
||||
String[] expected = {"_0.cfs",
|
||||
"_0_1.del",
|
||||
"_1.cfs",
|
||||
"_2.cfs",
|
||||
"_2_1.s" + contentFieldIndex,
|
||||
"_3.cfs",
|
||||
"segments_a",
|
||||
"segments.gen"};
|
||||
if (!autoCommit) {
|
||||
expected[6] = "segments_3";
|
||||
}
|
||||
String[] expected;
|
||||
expected = new String[] {"_0.cfs",
|
||||
"_0_1.del",
|
||||
"_0_1.s" + contentFieldIndex,
|
||||
"segments_4",
|
||||
"segments.gen"};
|
||||
|
||||
if (!autoCommit)
|
||||
expected[3] = "segments_3";
|
||||
|
||||
String[] actual = dir.list();
|
||||
Arrays.sort(expected);
|
||||
|
|
|
@ -256,6 +256,7 @@ public class TestDeletionPolicy extends TestCase
|
|||
Directory dir = new RAMDirectory();
|
||||
|
||||
IndexWriter writer = new IndexWriter(dir, autoCommit, new WhitespaceAnalyzer(), true, policy);
|
||||
writer.setMaxBufferedDocs(10);
|
||||
writer.setUseCompoundFile(useCompoundFile);
|
||||
for(int i=0;i<107;i++) {
|
||||
addDoc(writer);
|
||||
|
@ -318,6 +319,7 @@ public class TestDeletionPolicy extends TestCase
|
|||
Directory dir = new RAMDirectory();
|
||||
|
||||
IndexWriter writer = new IndexWriter(dir, autoCommit, new WhitespaceAnalyzer(), true, policy);
|
||||
writer.setMaxBufferedDocs(10);
|
||||
writer.setUseCompoundFile(useCompoundFile);
|
||||
for(int i=0;i<107;i++) {
|
||||
addDoc(writer);
|
||||
|
@ -365,6 +367,7 @@ public class TestDeletionPolicy extends TestCase
|
|||
|
||||
for(int j=0;j<N+1;j++) {
|
||||
IndexWriter writer = new IndexWriter(dir, autoCommit, new WhitespaceAnalyzer(), true, policy);
|
||||
writer.setMaxBufferedDocs(10);
|
||||
writer.setUseCompoundFile(useCompoundFile);
|
||||
for(int i=0;i<17;i++) {
|
||||
addDoc(writer);
|
||||
|
@ -525,6 +528,7 @@ public class TestDeletionPolicy extends TestCase
|
|||
|
||||
Directory dir = new RAMDirectory();
|
||||
IndexWriter writer = new IndexWriter(dir, autoCommit, new WhitespaceAnalyzer(), true, policy);
|
||||
writer.setMaxBufferedDocs(10);
|
||||
writer.setUseCompoundFile(useCompoundFile);
|
||||
writer.close();
|
||||
Term searchTerm = new Term("content", "aaa");
|
||||
|
@ -533,6 +537,7 @@ public class TestDeletionPolicy extends TestCase
|
|||
for(int i=0;i<N+1;i++) {
|
||||
|
||||
writer = new IndexWriter(dir, autoCommit, new WhitespaceAnalyzer(), false, policy);
|
||||
writer.setMaxBufferedDocs(10);
|
||||
writer.setUseCompoundFile(useCompoundFile);
|
||||
for(int j=0;j<17;j++) {
|
||||
addDoc(writer);
|
||||
|
|
|
@ -51,6 +51,7 @@ public class TestIndexFileDeleter extends TestCase
|
|||
Directory dir = new RAMDirectory();
|
||||
|
||||
IndexWriter writer = new IndexWriter(dir, new WhitespaceAnalyzer(), true);
|
||||
writer.setMaxBufferedDocs(10);
|
||||
int i;
|
||||
for(i=0;i<35;i++) {
|
||||
addDoc(writer, i);
|
||||
|
|
|
@ -74,6 +74,9 @@ public class TestIndexModifier extends TestCase {
|
|||
// Lucene defaults:
|
||||
assertNull(i.getInfoStream());
|
||||
assertTrue(i.getUseCompoundFile());
|
||||
/* new merge policy
|
||||
assertEquals(0, i.getMaxBufferedDocs());
|
||||
*/
|
||||
assertEquals(10, i.getMaxBufferedDocs());
|
||||
assertEquals(10000, i.getMaxFieldLength());
|
||||
assertEquals(10, i.getMergeFactor());
|
||||
|
|
|
@ -803,7 +803,7 @@ public class TestIndexReader extends TestCase
|
|||
String[] startFiles = dir.list();
|
||||
SegmentInfos infos = new SegmentInfos();
|
||||
infos.read(dir);
|
||||
IndexFileDeleter d = new IndexFileDeleter(dir, new KeepOnlyLastCommitDeletionPolicy(), infos, null);
|
||||
IndexFileDeleter d = new IndexFileDeleter(dir, new KeepOnlyLastCommitDeletionPolicy(), infos, null, null);
|
||||
String[] endFiles = dir.list();
|
||||
|
||||
Arrays.sort(startFiles);
|
||||
|
|
|
@ -20,6 +20,7 @@ package org.apache.lucene.index;
|
|||
import java.io.IOException;
|
||||
import java.io.File;
|
||||
import java.util.Arrays;
|
||||
import java.util.Random;
|
||||
|
||||
import junit.framework.TestCase;
|
||||
|
||||
|
@ -478,7 +479,7 @@ public class TestIndexWriter extends TestCase
|
|||
String[] startFiles = dir.list();
|
||||
SegmentInfos infos = new SegmentInfos();
|
||||
infos.read(dir);
|
||||
IndexFileDeleter d = new IndexFileDeleter(dir, new KeepOnlyLastCommitDeletionPolicy(), infos, null);
|
||||
IndexFileDeleter d = new IndexFileDeleter(dir, new KeepOnlyLastCommitDeletionPolicy(), infos, null, null);
|
||||
String[] endFiles = dir.list();
|
||||
|
||||
Arrays.sort(startFiles);
|
||||
|
@ -859,6 +860,7 @@ public class TestIndexWriter extends TestCase
|
|||
public void testCommitOnCloseAbort() throws IOException {
|
||||
Directory dir = new RAMDirectory();
|
||||
IndexWriter writer = new IndexWriter(dir, new WhitespaceAnalyzer(), true);
|
||||
writer.setMaxBufferedDocs(10);
|
||||
for (int i = 0; i < 14; i++) {
|
||||
addDoc(writer);
|
||||
}
|
||||
|
@ -871,6 +873,7 @@ public class TestIndexWriter extends TestCase
|
|||
searcher.close();
|
||||
|
||||
writer = new IndexWriter(dir, false, new WhitespaceAnalyzer(), false);
|
||||
writer.setMaxBufferedDocs(10);
|
||||
for(int j=0;j<17;j++) {
|
||||
addDoc(writer);
|
||||
}
|
||||
|
@ -895,6 +898,7 @@ public class TestIndexWriter extends TestCase
|
|||
// Now make sure we can re-open the index, add docs,
|
||||
// and all is good:
|
||||
writer = new IndexWriter(dir, false, new WhitespaceAnalyzer(), false);
|
||||
writer.setMaxBufferedDocs(10);
|
||||
for(int i=0;i<12;i++) {
|
||||
for(int j=0;j<17;j++) {
|
||||
addDoc(writer);
|
||||
|
@ -962,6 +966,7 @@ public class TestIndexWriter extends TestCase
|
|||
public void testCommitOnCloseOptimize() throws IOException {
|
||||
RAMDirectory dir = new RAMDirectory();
|
||||
IndexWriter writer = new IndexWriter(dir, new WhitespaceAnalyzer(), true);
|
||||
writer.setMaxBufferedDocs(10);
|
||||
for(int j=0;j<17;j++) {
|
||||
addDocWithIndex(writer, j);
|
||||
}
|
||||
|
@ -1002,6 +1007,255 @@ public class TestIndexWriter extends TestCase
|
|||
reader.close();
|
||||
}
|
||||
|
||||
public void testIndexNoDocuments() throws IOException {
|
||||
RAMDirectory dir = new RAMDirectory();
|
||||
IndexWriter writer = new IndexWriter(dir, new WhitespaceAnalyzer(), true);
|
||||
writer.flush();
|
||||
writer.close();
|
||||
|
||||
IndexReader reader = IndexReader.open(dir);
|
||||
assertEquals(0, reader.maxDoc());
|
||||
assertEquals(0, reader.numDocs());
|
||||
reader.close();
|
||||
|
||||
writer = new IndexWriter(dir, new WhitespaceAnalyzer(), false);
|
||||
writer.flush();
|
||||
writer.close();
|
||||
|
||||
reader = IndexReader.open(dir);
|
||||
assertEquals(0, reader.maxDoc());
|
||||
assertEquals(0, reader.numDocs());
|
||||
reader.close();
|
||||
}
|
||||
|
||||
public void testManyFields() throws IOException {
|
||||
RAMDirectory dir = new RAMDirectory();
|
||||
IndexWriter writer = new IndexWriter(dir, new WhitespaceAnalyzer(), true);
|
||||
writer.setMaxBufferedDocs(10);
|
||||
for(int j=0;j<100;j++) {
|
||||
Document doc = new Document();
|
||||
doc.add(new Field("a"+j, "aaa" + j, Field.Store.YES, Field.Index.TOKENIZED));
|
||||
doc.add(new Field("b"+j, "aaa" + j, Field.Store.YES, Field.Index.TOKENIZED));
|
||||
doc.add(new Field("c"+j, "aaa" + j, Field.Store.YES, Field.Index.TOKENIZED));
|
||||
doc.add(new Field("d"+j, "aaa", Field.Store.YES, Field.Index.TOKENIZED));
|
||||
doc.add(new Field("e"+j, "aaa", Field.Store.YES, Field.Index.TOKENIZED));
|
||||
doc.add(new Field("f"+j, "aaa", Field.Store.YES, Field.Index.TOKENIZED));
|
||||
writer.addDocument(doc);
|
||||
}
|
||||
writer.close();
|
||||
|
||||
IndexReader reader = IndexReader.open(dir);
|
||||
assertEquals(100, reader.maxDoc());
|
||||
assertEquals(100, reader.numDocs());
|
||||
for(int j=0;j<100;j++) {
|
||||
assertEquals(1, reader.docFreq(new Term("a"+j, "aaa"+j)));
|
||||
assertEquals(1, reader.docFreq(new Term("b"+j, "aaa"+j)));
|
||||
assertEquals(1, reader.docFreq(new Term("c"+j, "aaa"+j)));
|
||||
assertEquals(1, reader.docFreq(new Term("d"+j, "aaa")));
|
||||
assertEquals(1, reader.docFreq(new Term("e"+j, "aaa")));
|
||||
assertEquals(1, reader.docFreq(new Term("f"+j, "aaa")));
|
||||
}
|
||||
reader.close();
|
||||
dir.close();
|
||||
}
|
||||
|
||||
public void testSmallRAMBuffer() throws IOException {
|
||||
RAMDirectory dir = new RAMDirectory();
|
||||
IndexWriter writer = new IndexWriter(dir, new WhitespaceAnalyzer(), true);
|
||||
writer.setRAMBufferSizeMB(0.000001);
|
||||
int lastNumFile = dir.list().length;
|
||||
for(int j=0;j<9;j++) {
|
||||
Document doc = new Document();
|
||||
doc.add(new Field("field", "aaa" + j, Field.Store.YES, Field.Index.TOKENIZED));
|
||||
writer.addDocument(doc);
|
||||
int numFile = dir.list().length;
|
||||
// Verify that with a tiny RAM buffer we see new
|
||||
// segment after every doc
|
||||
assertTrue(numFile > lastNumFile);
|
||||
lastNumFile = numFile;
|
||||
}
|
||||
writer.close();
|
||||
dir.close();
|
||||
}
|
||||
|
||||
// Make sure it's OK to change RAM buffer size and
|
||||
// maxBufferedDocs in a write session
|
||||
public void testChangingRAMBuffer() throws IOException {
|
||||
RAMDirectory dir = new RAMDirectory();
|
||||
IndexWriter writer = new IndexWriter(dir, new WhitespaceAnalyzer(), true);
|
||||
writer.setMaxBufferedDocs(10);
|
||||
int lastNumFile = dir.list().length;
|
||||
long lastGen = -1;
|
||||
for(int j=1;j<52;j++) {
|
||||
Document doc = new Document();
|
||||
doc.add(new Field("field", "aaa" + j, Field.Store.YES, Field.Index.TOKENIZED));
|
||||
writer.addDocument(doc);
|
||||
long gen = SegmentInfos.generationFromSegmentsFileName(SegmentInfos.getCurrentSegmentFileName(dir.list()));
|
||||
if (j == 1)
|
||||
lastGen = gen;
|
||||
else if (j < 10)
|
||||
// No new files should be created
|
||||
assertEquals(gen, lastGen);
|
||||
else if (10 == j) {
|
||||
assertTrue(gen > lastGen);
|
||||
lastGen = gen;
|
||||
writer.setRAMBufferSizeMB(0.000001);
|
||||
} else if (j < 20) {
|
||||
assertTrue(gen > lastGen);
|
||||
lastGen = gen;
|
||||
} else if (20 == j) {
|
||||
writer.setRAMBufferSizeMB(16);
|
||||
lastGen = gen;
|
||||
} else if (j < 30) {
|
||||
assertEquals(gen, lastGen);
|
||||
} else if (30 == j) {
|
||||
writer.setRAMBufferSizeMB(0.000001);
|
||||
} else if (j < 40) {
|
||||
assertTrue(gen> lastGen);
|
||||
lastGen = gen;
|
||||
} else if (40 == j) {
|
||||
writer.setMaxBufferedDocs(10);
|
||||
lastGen = gen;
|
||||
} else if (j < 50) {
|
||||
assertEquals(gen, lastGen);
|
||||
writer.setMaxBufferedDocs(10);
|
||||
} else if (50 == j) {
|
||||
assertTrue(gen > lastGen);
|
||||
}
|
||||
}
|
||||
writer.close();
|
||||
dir.close();
|
||||
}
|
||||
|
||||
  public void testDiverseDocs() throws IOException {
    RAMDirectory dir = new RAMDirectory();
    IndexWriter writer = new IndexWriter(dir, new WhitespaceAnalyzer(), true);
    // writer.setInfoStream(System.out);
    long t0 = System.currentTimeMillis();
    writer.setRAMBufferSizeMB(0.5);
    Random rand = new Random(31415);
    for(int i=0;i<3;i++) {
      // First, docs where every term is unique (heavy on
      // Posting instances)
      for(int j=0;j<100;j++) {
        Document doc = new Document();
        for(int k=0;k<100;k++) {
          doc.add(new Field("field", Integer.toString(rand.nextInt()), Field.Store.YES, Field.Index.TOKENIZED));
        }
        writer.addDocument(doc);
      }

      // Next, many single term docs where only one term
      // occurs (heavy on byte blocks)
      for(int j=0;j<100;j++) {
        Document doc = new Document();
        doc.add(new Field("field", "aaa aaa aaa aaa aaa aaa aaa aaa aaa aaa", Field.Store.YES, Field.Index.TOKENIZED));
        writer.addDocument(doc);
      }

      // Next, many single term docs where only one term
      // occurs but the terms are very long (heavy on
      // char[] arrays)
      for(int j=0;j<100;j++) {
        StringBuffer b = new StringBuffer();
        String x = Integer.toString(j) + ".";
        for(int k=0;k<1000;k++)
          b.append(x);
        String longTerm = b.toString();

        Document doc = new Document();
        doc.add(new Field("field", longTerm, Field.Store.YES, Field.Index.TOKENIZED));
        writer.addDocument(doc);
      }
    }
    writer.close();

    long t1 = System.currentTimeMillis();
    IndexSearcher searcher = new IndexSearcher(dir);
    Hits hits = searcher.search(new TermQuery(new Term("field", "aaa")));
    assertEquals(300, hits.length());
    searcher.close();

    dir.close();
  }

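  // Norms are enabled for only one of the buffered docs (all others set
  // omitNorms); verify flushing such a mix still yields a searchable index
  // with the expected hit counts, both pre- and post-flush.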
  public void testEnablingNorms() throws IOException {
    RAMDirectory dir = new RAMDirectory();
    IndexWriter writer = new IndexWriter(dir, new WhitespaceAnalyzer(), true);
    writer.setMaxBufferedDocs(10);
    // Enable norms for only 1 doc, pre flush
    for(int j=0;j<10;j++) {
      Document doc = new Document();
      Field f = new Field("field", "aaa", Field.Store.YES, Field.Index.TOKENIZED);
      if (j != 8) {
        f.setOmitNorms(true);
      }
      doc.add(f);
      writer.addDocument(doc);
    }
    writer.close();

    Term searchTerm = new Term("field", "aaa");

    IndexSearcher searcher = new IndexSearcher(dir);
    Hits hits = searcher.search(new TermQuery(searchTerm));
    assertEquals(10, hits.length());
    searcher.close();

    writer = new IndexWriter(dir, new WhitespaceAnalyzer(), true);
    writer.setMaxBufferedDocs(10);
    // Enable norms for only 1 doc, post flush
    for(int j=0;j<27;j++) {
      Document doc = new Document();
      Field f = new Field("field", "aaa", Field.Store.YES, Field.Index.TOKENIZED);
      if (j != 26) {
        f.setOmitNorms(true);
      }
      doc.add(f);
      writer.addDocument(doc);
    }
    writer.close();
    searcher = new IndexSearcher(dir);
    hits = searcher.search(new TermQuery(searchTerm));
    assertEquals(27, hits.length());
    searcher.close();

    IndexReader reader = IndexReader.open(dir);
    reader.close();

    dir.close();
  }

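  // A single huge document (128K occurrences of the same term) indexed with
  // a small RAM buffer and full term vectors; verifies the term frequency
  // survives flushing intact.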
  public void testHighFreqTerm() throws IOException {
    RAMDirectory dir = new RAMDirectory();
    IndexWriter writer = new IndexWriter(dir, new WhitespaceAnalyzer(), true);
    writer.setRAMBufferSizeMB(0.01);
    writer.setMaxFieldLength(100000000);
    // Massive doc that has 128 K a's
    StringBuffer b = new StringBuffer(1024*1024);
    for(int i=0;i<4096;i++) {
      b.append(" a a a a a a a a");
      b.append(" a a a a a a a a");
      b.append(" a a a a a a a a");
      b.append(" a a a a a a a a");
    }
    Document doc = new Document();
    doc.add(new Field("field", b.toString(), Field.Store.YES, Field.Index.TOKENIZED, Field.TermVector.WITH_POSITIONS_OFFSETS));
    writer.addDocument(doc);
    writer.close();

    IndexReader reader = IndexReader.open(dir);
    assertEquals(1, reader.maxDoc());
    assertEquals(1, reader.numDocs());
    Term t = new Term("field", "a");
    assertEquals(1, reader.docFreq(t));
    TermDocs td = reader.termDocs(t);
    td.next();
    assertEquals(128*1024, td.freq());
    reader.close();
    dir.close();
  }

  // Make sure that a Directory implementation that does
  // not use LockFactory at all (ie overrides makeLock and
  // implements its own private locking) works OK. This

@@ -110,7 +110,7 @@ public class TestIndexWriterDelete extends TestCase {
      }
      modifier.flush();

      assertEquals(0, modifier.getRamSegmentCount());
      assertEquals(0, modifier.getNumBufferedDocuments());
      assertTrue(0 < modifier.getSegmentCount());

      if (!autoCommit) {

@@ -452,7 +452,7 @@ public class TestIndexWriterDelete extends TestCase {
      String[] startFiles = dir.list();
      SegmentInfos infos = new SegmentInfos();
      infos.read(dir);
      IndexFileDeleter d = new IndexFileDeleter(dir, new KeepOnlyLastCommitDeletionPolicy(), infos, null);
      IndexFileDeleter d = new IndexFileDeleter(dir, new KeepOnlyLastCommitDeletionPolicy(), infos, null, null);
      String[] endFiles = dir.list();

      Arrays.sort(startFiles);

@@ -57,7 +57,7 @@ public class TestIndexWriterMergePolicy extends TestCase {
    for (int i = 0; i < 100; i++) {
      addDoc(writer);
      checkInvariants(writer);
      if (writer.getRamSegmentCount() + writer.getSegmentCount() >= 18) {
      if (writer.getNumBufferedDocuments() + writer.getSegmentCount() >= 18) {
        noOverMerge = true;
      }
    }

@@ -195,7 +195,7 @@ public class TestIndexWriterMergePolicy extends TestCase {
    int mergeFactor = writer.getMergeFactor();
    int maxMergeDocs = writer.getMaxMergeDocs();

    int ramSegmentCount = writer.getRamSegmentCount();
    int ramSegmentCount = writer.getNumBufferedDocuments();
    assertTrue(ramSegmentCount < maxBufferedDocs);

    int lowerBound = -1;

@@ -50,7 +50,7 @@ public class TestLazyProxSkipping extends TestCase {

        Directory directory = new RAMDirectory();
        IndexWriter writer = new IndexWriter(directory, new WhitespaceAnalyzer(), true);
        writer.setMaxBufferedDocs(10);
        for (int i = 0; i < numDocs; i++) {
            Document doc = new Document();
            String content;

@@ -467,7 +467,8 @@ public class TestPayloads extends TestCase {
                        d.add(new Field(field, new PoolingPayloadTokenStream(pool)));
                        writer.addDocument(d);
                    }
                } catch (IOException e) {
                } catch (Exception e) {
                    e.printStackTrace();
                    fail(e.toString());
                }
            }

@@ -480,7 +481,6 @@ public class TestPayloads extends TestCase {
                ingesters[i].join();
            } catch (InterruptedException e) {}
        }

        writer.close();
        IndexReader reader = IndexReader.open(dir);
        TermEnum terms = reader.terms();

@@ -74,8 +74,6 @@ public class TestStressIndexing extends TestCase {
          count++;
        }

        modifier.close();

      } catch (Exception e) {
        System.out.println(e.toString());
        e.printStackTrace();

@@ -125,6 +123,9 @@ public class TestStressIndexing extends TestCase {
    IndexerThread indexerThread = new IndexerThread(modifier);
    indexerThread.start();

    IndexerThread indexerThread2 = new IndexerThread(modifier);
    indexerThread2.start();

    // Two searchers that constantly just re-instantiate the searcher:
    SearcherThread searcherThread1 = new SearcherThread(directory);
    searcherThread1.start();

@@ -133,9 +134,14 @@ public class TestStressIndexing extends TestCase {
    searcherThread2.start();

    indexerThread.join();
    indexerThread2.join();
    searcherThread1.join();
    searcherThread2.join();

    modifier.close();

    assertTrue("hit unexpected exception in indexer", !indexerThread.failed);
    assertTrue("hit unexpected exception in indexer 2", !indexerThread2.failed);
    assertTrue("hit unexpected exception in search1", !searcherThread1.failed);
    assertTrue("hit unexpected exception in search2", !searcherThread2.failed);
    //System.out.println(" Writer: " + indexerThread.count + " iterations");

@@ -292,5 +292,79 @@ public class TestTermVectors extends TestCase {
      //System.out.println("Document: " + doc);
    }
  }

  // Test only a few docs having vectors
  public void testRareVectors() throws IOException {
    IndexWriter writer = new IndexWriter(directory, new SimpleAnalyzer(), true);
    for(int i=0;i<100;i++) {
      Document doc = new Document();
      doc.add(new Field("field", English.intToEnglish(i),
                        Field.Store.YES, Field.Index.TOKENIZED, Field.TermVector.NO));
      writer.addDocument(doc);
    }
    for(int i=0;i<10;i++) {
      Document doc = new Document();
      doc.add(new Field("field", English.intToEnglish(100+i),
                        Field.Store.YES, Field.Index.TOKENIZED, Field.TermVector.WITH_POSITIONS_OFFSETS));
      writer.addDocument(doc);
    }

    writer.close();
    searcher = new IndexSearcher(directory);

    Query query = new TermQuery(new Term("field", "hundred"));
    Hits hits = searcher.search(query);
    assertEquals(10, hits.length());
    for (int i = 0; i < hits.length(); i++) {
      TermFreqVector [] vector = searcher.reader.getTermFreqVectors(hits.id(i));
      assertTrue(vector != null);
      assertTrue(vector.length == 1);
    }
  }


  // In a single doc, for the same field, mix the term
  // vectors up
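  // (The asserts below expect the combined vector to carry both positions
  // and offsets, i.e. the most inclusive term-vector setting wins.)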
  public void testMixedVectrosVectors() throws IOException {
    IndexWriter writer = new IndexWriter(directory, new SimpleAnalyzer(), true);
    Document doc = new Document();
    doc.add(new Field("field", "one",
                      Field.Store.YES, Field.Index.TOKENIZED, Field.TermVector.NO));
    doc.add(new Field("field", "one",
                      Field.Store.YES, Field.Index.TOKENIZED, Field.TermVector.YES));
    doc.add(new Field("field", "one",
                      Field.Store.YES, Field.Index.TOKENIZED, Field.TermVector.WITH_POSITIONS));
    doc.add(new Field("field", "one",
                      Field.Store.YES, Field.Index.TOKENIZED, Field.TermVector.WITH_OFFSETS));
    doc.add(new Field("field", "one",
                      Field.Store.YES, Field.Index.TOKENIZED, Field.TermVector.WITH_POSITIONS_OFFSETS));
    writer.addDocument(doc);
    writer.close();

    searcher = new IndexSearcher(directory);

    Query query = new TermQuery(new Term("field", "one"));
    Hits hits = searcher.search(query);
    assertEquals(1, hits.length());

    TermFreqVector [] vector = searcher.reader.getTermFreqVectors(hits.id(0));
    assertTrue(vector != null);
    assertTrue(vector.length == 1);
    TermPositionVector tfv = (TermPositionVector) vector[0];
    assertTrue(tfv.getField().equals("field"));
    String[] terms = tfv.getTerms();
    assertEquals(1, terms.length);
    assertEquals(terms[0], "one");
    assertEquals(5, tfv.getTermFrequencies()[0]);

    int[] positions = tfv.getTermPositions(0);
    assertEquals(5, positions.length);
    for(int i=0;i<5;i++)
      assertEquals(i, positions[i]);
    TermVectorOffsetInfo[] offsets = tfv.getOffsets(0);
    assertEquals(5, offsets.length);
    for(int i=0;i<5;i++) {
      assertEquals(4*i, offsets[i].getStartOffset());
      assertEquals(4*i+3, offsets[i].getEndOffset());
    }
  }
}