mirror of https://github.com/apache/lucene.git
LUCENE-1172: some small additional speedups for DocumentsWriter
git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@627707 13f79535-47bb-0310-9956-ffa450edef68
commit 9f2e3cf7bc (parent ac32a540c3)
src/java/org/apache/lucene/index/DocumentsWriter.java
@@ -1020,6 +1020,59 @@ final class DocumentsWriter {
       quickSort(postings, left + 1, hi);
     }
 
+    void quickSort(FieldData[] array, int lo, int hi) {
+      if (lo >= hi)
+        return;
+
+      int mid = (lo + hi) >>> 1;
+
+      if (array[lo].compareTo(array[mid]) > 0) {
+        FieldData tmp = array[lo];
+        array[lo] = array[mid];
+        array[mid] = tmp;
+      }
+
+      if (array[mid].compareTo(array[hi]) > 0) {
+        FieldData tmp = array[mid];
+        array[mid] = array[hi];
+        array[hi] = tmp;
+
+        if (array[lo].compareTo(array[mid]) > 0) {
+          FieldData tmp2 = array[lo];
+          array[lo] = array[mid];
+          array[mid] = tmp2;
+        }
+      }
+
+      int left = lo + 1;
+      int right = hi - 1;
+
+      if (left >= right)
+        return;
+
+      FieldData partition = array[mid];
+
+      for (; ;) {
+        while (array[right].compareTo(partition) > 0)
+          --right;
+
+        while (left < right && array[left].compareTo(partition) <= 0)
+          ++left;
+
+        if (left < right) {
+          FieldData tmp = array[left];
+          array[left] = array[right];
+          array[right] = tmp;
+          --right;
+        } else {
+          break;
+        }
+      }
+
+      quickSort(array, lo, left);
+      quickSort(array, left + 1, hi);
+    }
+
     /** If there are fields we've seen but did not see again
      * in the last run, then free them up.  Also reduce
      * postings hash size. */
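A note on this hunk: Arrays.sort on an Object[] is a merge sort that first
clones the range into a scratch array, so calling it once per document costs
an allocation; the hand-rolled quicksort above sorts the FieldData[] in place.
The same median-of-three scheme, restated as a compact standalone sketch (the
generic signature and the swap helper are illustrative only, not part of the
commit):

    // In-place median-of-three quicksort over any Comparable array.
    static <T extends Comparable<T>> void quickSort(T[] a, int lo, int hi) {
      if (lo >= hi) return;
      int mid = (lo + hi) >>> 1;
      // Order a[lo] <= a[mid] <= a[hi]; a[lo] then bounds the right-to-left scan.
      if (a[lo].compareTo(a[mid]) > 0) swap(a, lo, mid);
      if (a[mid].compareTo(a[hi]) > 0) {
        swap(a, mid, hi);
        if (a[lo].compareTo(a[mid]) > 0) swap(a, lo, mid);
      }
      int left = lo + 1, right = hi - 1;
      if (left >= right) return;   // three elements or fewer: already ordered
      T pivot = a[mid];
      for (;;) {
        while (a[right].compareTo(pivot) > 0) --right;
        while (left < right && a[left].compareTo(pivot) <= 0) ++left;
        if (left < right) { swap(a, left, right); --right; }
        else break;
      }
      quickSort(a, lo, left);      // recurse on both partitions
      quickSort(a, left + 1, hi);
    }

    static <T> void swap(T[] a, int i, int j) { T t = a[i]; a[i] = a[j]; a[j] = t; }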
@@ -1098,6 +1151,7 @@ final class DocumentsWriter {
       throws IOException, AbortException {
 
       final int numFields = numFieldData;
+      assert clearLastVectorFieldName();
 
       assert 0 == fdtLocal.length();
 
@@ -1108,7 +1162,7 @@ final class DocumentsWriter {
       // sort the subset of fields that have vectors
       // enabled; we could save [small amount of] CPU
       // here.
-      Arrays.sort(fieldDataArray, 0, numFields);
+      quickSort(fieldDataArray, 0, numFields-1);
 
       // We process the document one field at a time
       for(int i=0;i<numFields;i++)
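Note the changed upper bound at the call site: Arrays.sort takes an exclusive
end index, while the new quickSort overload takes an inclusive hi, hence the
numFields-1:

    Arrays.sort(fieldDataArray, 0, numFields);     // old: toIndex is exclusive
    quickSort(fieldDataArray, 0, numFields - 1);   // new: hi is inclusive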
@@ -1116,10 +1170,6 @@ final class DocumentsWriter {
 
       if (maxTermPrefix != null && infoStream != null)
         infoStream.println("WARNING: document contains at least one immense term (longer than the max length " + MAX_TERM_LENGTH + "), all of which were skipped.  Please correct the analyzer to not produce such terms.  The prefix of the first immense term is: '" + maxTermPrefix + "...'");
-
-      if (ramBufferSize != IndexWriter.DISABLE_AUTO_FLUSH
-          && numBytesUsed > 0.95 * ramBufferSize)
-        balanceRAM();
     }
 
     final ByteBlockPool postingsPool = new ByteBlockPool();
@@ -1295,6 +1345,26 @@ final class DocumentsWriter {
         pos[posUpto++] = b;
     }
 
+    String lastVectorFieldName;
+
+    // Called only by assert
+    final boolean clearLastVectorFieldName() {
+      lastVectorFieldName = null;
+      return true;
+    }
+
+    // Called only by assert
+    final boolean vectorFieldsInOrder(FieldInfo fi) {
+      try {
+        if (lastVectorFieldName != null)
+          return lastVectorFieldName.compareTo(fi.name) < 0;
+        else
+          return true;
+      } finally {
+        lastVectorFieldName = fi.name;
+      }
+    }
+
     PostingVector[] postingsVectors = new PostingVector[1];
     int maxPostingsVectors;
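Both new helpers return boolean so that they can sit inside assert statements:
when assertions are disabled (the default, without -ea) the expressions are
never evaluated, so the order-checking bookkeeping costs nothing in
production. The calling pattern, as the other hunks in this commit use it:

    assert clearLastVectorFieldName();      // reset once per document (hunk at 1151)
    assert vectorFieldsInOrder(fieldInfo);  // verify ascending field-name order (hunk at 1906)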
@@ -1360,7 +1430,6 @@ final class DocumentsWriter {
         postingsHash = new Posting[postingsHashSize];
       }
 
-      /** So Arrays.sort can sort us. */
       public int compareTo(Object o) {
         return fieldInfo.name.compareTo(((FieldData) o).fieldInfo.name);
       }
@@ -1535,9 +1604,9 @@ final class DocumentsWriter {
 
     /** Only called when term vectors are enabled.  This
      * is called the first time we see a given term for
-     * each * document, to allocate a PostingVector
-     * instance that * is used to record data needed to
-     * write the posting * vectors. */
+     * each document, to allocate a PostingVector
+     * instance that is used to record data needed to
+     * write the posting vectors. */
     private PostingVector addNewVector() {
 
       if (postingsVectorsUpto == postingsVectors.length) {
@@ -1837,6 +1906,7 @@ final class DocumentsWriter {
     void writeVectors(FieldInfo fieldInfo) throws IOException {
 
       assert fieldInfo.storeTermVector;
+      assert vectorFieldsInOrder(fieldInfo);
 
       vectorFieldNumbers[numVectorFields] = fieldInfo.number;
       vectorFieldPointers[numVectorFields] = tvfLocal.getFilePointer();
@@ -2586,6 +2656,10 @@ final class DocumentsWriter {
         return;
       }
 
+      if (ramBufferSize != IndexWriter.DISABLE_AUTO_FLUSH
+          && numBytesUsed >= ramBufferSize)
+        balanceRAM();
+
       // Now write the indexed document to the real files.
       if (nextWriteDocID == state.docID) {
        // It's my turn, so write everything now:
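This hunk appears to be the counterpart of the deletion at old line 1116: the
RAM check moves out of the per-document inversion path and into the
document-write path, and the trigger tightens from a 0.95 fraction of the
buffer to the full buffer size:

    // old, during document processing:
    numBytesUsed > 0.95 * ramBufferSize
    // new, just before writing the document:
    numBytesUsed >= ramBufferSize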
@@ -2649,7 +2723,7 @@ final class DocumentsWriter {
         out.writeByte(b);
     }
 
-  byte[] copyByteBuffer = new byte[4096];
+  final byte[] copyByteBuffer = new byte[4096];
 
   /** Copy numBytes from srcIn to destIn */
   void copyBytes(IndexInput srcIn, IndexOutput destIn, long numBytes) throws IOException {
@@ -3138,7 +3212,7 @@ final class DocumentsWriter {
    * the other two.  This method just frees allocations from
    * the pools once we are over-budget, which balances the
    * pools to match the current docs. */
-  private synchronized void balanceRAM() {
+  synchronized void balanceRAM() {
 
     if (ramBufferSize == IndexWriter.DISABLE_AUTO_FLUSH || bufferIsFull)
       return;
src/test/org/apache/lucene/index/TestTermVectors.java
@@ -23,7 +23,7 @@ import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
 import org.apache.lucene.index.*;
 import org.apache.lucene.store.Directory;
-import org.apache.lucene.store.RAMDirectory;
+import org.apache.lucene.store.MockRAMDirectory;
 import org.apache.lucene.util.English;
 
 import java.io.IOException;
@@ -34,7 +34,7 @@ import java.util.SortedSet;
 
 public class TestTermVectors extends LuceneTestCase {
   private IndexSearcher searcher;
-  private RAMDirectory directory = new RAMDirectory();
+  private Directory directory = new MockRAMDirectory();
   public TestTermVectors(String s) {
     super(s);
   }
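MockRAMDirectory, from Lucene's test utilities, is a RAMDirectory subclass
that adds extra checks, such as tracking files that were opened but never
closed, so a resource leak fails the test. It is a drop-in replacement:

    Directory directory = new MockRAMDirectory();   // instead of new RAMDirectory()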
@@ -91,6 +91,37 @@ public class TestTermVectors extends LuceneTestCase {
     }
   }
 
+  public void testTermVectorsFieldOrder() throws IOException {
+    Directory dir = new MockRAMDirectory();
+    IndexWriter writer = new IndexWriter(dir, new SimpleAnalyzer(), true);
+    Document doc = new Document();
+    doc.add(new Field("c", "some content here", Field.Store.YES, Field.Index.TOKENIZED, Field.TermVector.WITH_POSITIONS_OFFSETS));
+    doc.add(new Field("a", "some content here", Field.Store.YES, Field.Index.TOKENIZED, Field.TermVector.WITH_POSITIONS_OFFSETS));
+    doc.add(new Field("b", "some content here", Field.Store.YES, Field.Index.TOKENIZED, Field.TermVector.WITH_POSITIONS_OFFSETS));
+    doc.add(new Field("x", "some content here", Field.Store.YES, Field.Index.TOKENIZED, Field.TermVector.WITH_POSITIONS_OFFSETS));
+    writer.addDocument(doc);
+    writer.close();
+    IndexReader reader = IndexReader.open(dir);
+    TermFreqVector[] v = reader.getTermFreqVectors(0);
+    assertEquals(4, v.length);
+    String[] expectedFields = new String[]{"a", "b", "c", "x"};
+    int[] expectedPositions = new int[]{1, 2, 0};
+    for(int i=0;i<v.length;i++) {
+      TermPositionVector posVec = (TermPositionVector) v[i];
+      assertEquals(expectedFields[i], posVec.getField());
+      String[] terms = posVec.getTerms();
+      assertEquals(3, terms.length);
+      assertEquals("content", terms[0]);
+      assertEquals("here", terms[1]);
+      assertEquals("some", terms[2]);
+      for(int j=0;j<3;j++) {
+        int[] positions = posVec.getTermPositions(j);
+        assertEquals(1, positions.length);
+        assertEquals(expectedPositions[j], positions[0]);
+      }
+    }
+  }
+
   public void testTermPositionVectors() {
     Query query = new TermQuery(new Term("field", "zero"));
     try {
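Why expectedPositions is {1, 2, 0} in the new test: SimpleAnalyzer tokenizes
"some content here" into positions 0 (some), 1 (content), 2 (here), but
getTerms() returns terms in sorted order, so the positions read back permuted:

    terms (sorted):  "content", "here", "some"
    positions:              1,      2,      0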
@@ -198,7 +229,7 @@ public class TestTermVectors extends LuceneTestCase {
     Document testDoc4 = new Document();
     setupDoc(testDoc4, test4);
 
-    Directory dir = new RAMDirectory();
+    Directory dir = new MockRAMDirectory();
 
     try {
       IndexWriter writer = new IndexWriter(dir, new SimpleAnalyzer(), true,
|
@ -310,10 +341,10 @@ public class TestTermVectors extends LuceneTestCase {
|
||||||
|
|
||||||
private void setupDoc(Document doc, String text)
|
private void setupDoc(Document doc, String text)
|
||||||
{
|
{
|
||||||
doc.add(new Field("field", text, Field.Store.YES,
|
|
||||||
Field.Index.TOKENIZED, Field.TermVector.YES));
|
|
||||||
doc.add(new Field("field2", text, Field.Store.YES,
|
doc.add(new Field("field2", text, Field.Store.YES,
|
||||||
Field.Index.TOKENIZED, Field.TermVector.WITH_POSITIONS_OFFSETS));
|
Field.Index.TOKENIZED, Field.TermVector.WITH_POSITIONS_OFFSETS));
|
||||||
|
doc.add(new Field("field", text, Field.Store.YES,
|
||||||
|
Field.Index.TOKENIZED, Field.TermVector.YES));
|
||||||
//System.out.println("Document: " + doc);
|
//System.out.println("Document: " + doc);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
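The swap in setupDoc looks deliberate: fields are now added with "field2"
before "field", i.e. out of name order, so every document built by these tests
exercises the writer's new field-sorting path before term vectors are written:

    // "field" sorts before "field2", yet is added second:
    assert "field".compareTo("field2") < 0;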