mirror of https://github.com/apache/lucene.git
LUCENE-1172: some small additional speedups for DocumentsWriter
git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@627707 13f79535-47bb-0310-9956-ffa450edef68
commit 9f2e3cf7bc (parent ac32a540c3)
src/java/org/apache/lucene/index/DocumentsWriter.java
@@ -1020,6 +1020,59 @@ final class DocumentsWriter {
       quickSort(postings, left + 1, hi);
     }
 
+    void quickSort(FieldData[] array, int lo, int hi) {
+      if (lo >= hi)
+        return;
+
+      int mid = (lo + hi) >>> 1;
+
+      if (array[lo].compareTo(array[mid]) > 0) {
+        FieldData tmp = array[lo];
+        array[lo] = array[mid];
+        array[mid] = tmp;
+      }
+
+      if (array[mid].compareTo(array[hi]) > 0) {
+        FieldData tmp = array[mid];
+        array[mid] = array[hi];
+        array[hi] = tmp;
+
+        if (array[lo].compareTo(array[mid]) > 0) {
+          FieldData tmp2 = array[lo];
+          array[lo] = array[mid];
+          array[mid] = tmp2;
+        }
+      }
+
+      int left = lo + 1;
+      int right = hi - 1;
+
+      if (left >= right)
+        return;
+
+      FieldData partition = array[mid];
+
+      for (; ;) {
+        while (array[right].compareTo(partition) > 0)
+          --right;
+
+        while (left < right && array[left].compareTo(partition) <= 0)
+          ++left;
+
+        if (left < right) {
+          FieldData tmp = array[left];
+          array[left] = array[right];
+          array[right] = tmp;
+          --right;
+        } else {
+          break;
+        }
+      }
+
+      quickSort(array, lo, left);
+      quickSort(array, left + 1, hi);
+    }
+
     /** If there are fields we've seen but did not see again
      * in the last run, then free them up.  Also reduce
      * postings hash size. */
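A note on this hunk: Arrays.sort on an Object[] is a merge sort that first
clones the range into a scratch array, so calling it once per document costs
an allocation; the hand-rolled quicksort above sorts the FieldData[] in place.
The same median-of-three scheme, restated as a compact standalone sketch (the
generic signature and the swap helper are illustrative only, not part of the
commit):

    // In-place median-of-three quicksort over any Comparable array.
    static <T extends Comparable<T>> void quickSort(T[] a, int lo, int hi) {
      if (lo >= hi) return;
      int mid = (lo + hi) >>> 1;
      // Order a[lo] <= a[mid] <= a[hi]; a[lo] then bounds the right-to-left scan.
      if (a[lo].compareTo(a[mid]) > 0) swap(a, lo, mid);
      if (a[mid].compareTo(a[hi]) > 0) {
        swap(a, mid, hi);
        if (a[lo].compareTo(a[mid]) > 0) swap(a, lo, mid);
      }
      int left = lo + 1, right = hi - 1;
      if (left >= right) return;   // three elements or fewer: already ordered
      T pivot = a[mid];
      for (;;) {
        while (a[right].compareTo(pivot) > 0) --right;
        while (left < right && a[left].compareTo(pivot) <= 0) ++left;
        if (left < right) { swap(a, left, right); --right; }
        else break;
      }
      quickSort(a, lo, left);      // recurse on both partitions
      quickSort(a, left + 1, hi);
    }

    static <T> void swap(T[] a, int i, int j) { T t = a[i]; a[i] = a[j]; a[j] = t; }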
@@ -1098,6 +1151,7 @@ final class DocumentsWriter {
       throws IOException, AbortException {
 
       final int numFields = numFieldData;
+      assert clearLastVectorFieldName();
 
       assert 0 == fdtLocal.length();
 
@@ -1108,7 +1162,7 @@ final class DocumentsWriter {
       // sort the subset of fields that have vectors
       // enabled; we could save [small amount of] CPU
       // here.
-      Arrays.sort(fieldDataArray, 0, numFields);
+      quickSort(fieldDataArray, 0, numFields-1);
 
       // We process the document one field at a time
       for(int i=0;i<numFields;i++)
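Note the changed upper bound at the call site: Arrays.sort takes an exclusive
end index, while the new quickSort overload takes an inclusive hi, hence the
numFields-1:

    Arrays.sort(fieldDataArray, 0, numFields);     // old: toIndex is exclusive
    quickSort(fieldDataArray, 0, numFields - 1);   // new: hi is inclusive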
@@ -1116,10 +1170,6 @@ final class DocumentsWriter {
 
       if (maxTermPrefix != null && infoStream != null)
         infoStream.println("WARNING: document contains at least one immense term (longer than the max length " + MAX_TERM_LENGTH + "), all of which were skipped.  Please correct the analyzer to not produce such terms.  The prefix of the first immense term is: '" + maxTermPrefix + "...'");
-
-      if (ramBufferSize != IndexWriter.DISABLE_AUTO_FLUSH
-          && numBytesUsed > 0.95 * ramBufferSize)
-        balanceRAM();
     }
 
     final ByteBlockPool postingsPool = new ByteBlockPool();
@@ -1295,6 +1345,26 @@ final class DocumentsWriter {
         pos[posUpto++] = b;
     }
 
+    String lastVectorFieldName;
+
+    // Called only by assert
+    final boolean clearLastVectorFieldName() {
+      lastVectorFieldName = null;
+      return true;
+    }
+
+    // Called only by assert
+    final boolean vectorFieldsInOrder(FieldInfo fi) {
+      try {
+        if (lastVectorFieldName != null)
+          return lastVectorFieldName.compareTo(fi.name) < 0;
+        else
+          return true;
+      } finally {
+        lastVectorFieldName = fi.name;
+      }
+    }
+
     PostingVector[] postingsVectors = new PostingVector[1];
     int maxPostingsVectors;
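Both new helpers return boolean so that they can sit inside assert statements:
when assertions are disabled (the default, without -ea) the expressions are
never evaluated, so the order-checking bookkeeping costs nothing in
production. The calling pattern, as the other hunks in this commit use it:

    assert clearLastVectorFieldName();      // reset once per document (hunk at 1151)
    assert vectorFieldsInOrder(fieldInfo);  // verify ascending field-name order (hunk at 1906)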
@@ -1360,7 +1430,6 @@ final class DocumentsWriter {
         postingsHash = new Posting[postingsHashSize];
       }
 
-      /** So Arrays.sort can sort us. */
       public int compareTo(Object o) {
         return fieldInfo.name.compareTo(((FieldData) o).fieldInfo.name);
       }
@@ -1535,9 +1604,9 @@ final class DocumentsWriter {
 
     /** Only called when term vectors are enabled.  This
      * is called the first time we see a given term for
-     * each * document, to allocate a PostingVector
-     * instance that * is used to record data needed to
-     * write the posting * vectors. */
+     * each document, to allocate a PostingVector
+     * instance that is used to record data needed to
+     * write the posting vectors. */
     private PostingVector addNewVector() {
 
       if (postingsVectorsUpto == postingsVectors.length) {
@@ -1837,6 +1906,7 @@ final class DocumentsWriter {
     void writeVectors(FieldInfo fieldInfo) throws IOException {
 
       assert fieldInfo.storeTermVector;
+      assert vectorFieldsInOrder(fieldInfo);
 
       vectorFieldNumbers[numVectorFields] = fieldInfo.number;
       vectorFieldPointers[numVectorFields] = tvfLocal.getFilePointer();
@@ -2586,6 +2656,10 @@ final class DocumentsWriter {
         return;
       }
 
+      if (ramBufferSize != IndexWriter.DISABLE_AUTO_FLUSH
+          && numBytesUsed >= ramBufferSize)
+        balanceRAM();
+
       // Now write the indexed document to the real files.
       if (nextWriteDocID == state.docID) {
        // It's my turn, so write everything now:
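This hunk appears to be the counterpart of the deletion at old line 1116: the
RAM check moves out of the per-document inversion path and into the
document-write path, and the trigger tightens from a 0.95 fraction of the
buffer to the full buffer size:

    // old, during document processing:
    numBytesUsed > 0.95 * ramBufferSize
    // new, just before writing the document:
    numBytesUsed >= ramBufferSize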
@@ -2649,7 +2723,7 @@ final class DocumentsWriter {
         out.writeByte(b);
     }
 
-  byte[] copyByteBuffer = new byte[4096];
+  final byte[] copyByteBuffer = new byte[4096];
 
   /** Copy numBytes from srcIn to destIn */
   void copyBytes(IndexInput srcIn, IndexOutput destIn, long numBytes) throws IOException {
@@ -3138,7 +3212,7 @@ final class DocumentsWriter {
    * the other two.  This method just frees allocations from
    * the pools once we are over-budget, which balances the
    * pools to match the current docs. */
-  private synchronized void balanceRAM() {
+  synchronized void balanceRAM() {
 
     if (ramBufferSize == IndexWriter.DISABLE_AUTO_FLUSH || bufferIsFull)
       return;
src/test/org/apache/lucene/index/TestTermVectors.java
@@ -23,7 +23,7 @@ import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
 import org.apache.lucene.index.*;
 import org.apache.lucene.store.Directory;
-import org.apache.lucene.store.RAMDirectory;
+import org.apache.lucene.store.MockRAMDirectory;
 import org.apache.lucene.util.English;
 
 import java.io.IOException;
@@ -34,7 +34,7 @@ import java.util.SortedSet;
 
 public class TestTermVectors extends LuceneTestCase {
   private IndexSearcher searcher;
-  private RAMDirectory directory = new RAMDirectory();
+  private Directory directory = new MockRAMDirectory();
   public TestTermVectors(String s) {
     super(s);
   }
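MockRAMDirectory, from Lucene's test utilities, is a RAMDirectory subclass
that adds extra checks, such as tracking files that were opened but never
closed, so a resource leak fails the test. It is a drop-in replacement:

    Directory directory = new MockRAMDirectory();   // instead of new RAMDirectory()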
@@ -91,6 +91,37 @@ public class TestTermVectors extends LuceneTestCase {
     }
   }
 
+  public void testTermVectorsFieldOrder() throws IOException {
+    Directory dir = new MockRAMDirectory();
+    IndexWriter writer = new IndexWriter(dir, new SimpleAnalyzer(), true);
+    Document doc = new Document();
+    doc.add(new Field("c", "some content here", Field.Store.YES, Field.Index.TOKENIZED, Field.TermVector.WITH_POSITIONS_OFFSETS));
+    doc.add(new Field("a", "some content here", Field.Store.YES, Field.Index.TOKENIZED, Field.TermVector.WITH_POSITIONS_OFFSETS));
+    doc.add(new Field("b", "some content here", Field.Store.YES, Field.Index.TOKENIZED, Field.TermVector.WITH_POSITIONS_OFFSETS));
+    doc.add(new Field("x", "some content here", Field.Store.YES, Field.Index.TOKENIZED, Field.TermVector.WITH_POSITIONS_OFFSETS));
+    writer.addDocument(doc);
+    writer.close();
+    IndexReader reader = IndexReader.open(dir);
+    TermFreqVector[] v = reader.getTermFreqVectors(0);
+    assertEquals(4, v.length);
+    String[] expectedFields = new String[]{"a", "b", "c", "x"};
+    int[] expectedPositions = new int[]{1, 2, 0};
+    for(int i=0;i<v.length;i++) {
+      TermPositionVector posVec = (TermPositionVector) v[i];
+      assertEquals(expectedFields[i], posVec.getField());
+      String[] terms = posVec.getTerms();
+      assertEquals(3, terms.length);
+      assertEquals("content", terms[0]);
+      assertEquals("here", terms[1]);
+      assertEquals("some", terms[2]);
+      for(int j=0;j<3;j++) {
+        int[] positions = posVec.getTermPositions(j);
+        assertEquals(1, positions.length);
+        assertEquals(expectedPositions[j], positions[0]);
+      }
+    }
+  }
+
   public void testTermPositionVectors() {
     Query query = new TermQuery(new Term("field", "zero"));
     try {
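Why expectedPositions is {1, 2, 0} in the new test: SimpleAnalyzer tokenizes
"some content here" into positions 0 (some), 1 (content), 2 (here), but
getTerms() returns terms in sorted order, so the positions read back permuted:

    terms (sorted):  "content", "here", "some"
    positions:              1,      2,      0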
@@ -198,7 +229,7 @@ public class TestTermVectors extends LuceneTestCase {
     Document testDoc4 = new Document();
     setupDoc(testDoc4, test4);
 
-    Directory dir = new RAMDirectory();
+    Directory dir = new MockRAMDirectory();
 
     try {
       IndexWriter writer = new IndexWriter(dir, new SimpleAnalyzer(), true,
|
@ -310,10 +341,10 @@ public class TestTermVectors extends LuceneTestCase {
|
||||||
|
|
||||||
private void setupDoc(Document doc, String text)
|
private void setupDoc(Document doc, String text)
|
||||||
{
|
{
|
||||||
doc.add(new Field("field", text, Field.Store.YES,
|
|
||||||
Field.Index.TOKENIZED, Field.TermVector.YES));
|
|
||||||
doc.add(new Field("field2", text, Field.Store.YES,
|
doc.add(new Field("field2", text, Field.Store.YES,
|
||||||
Field.Index.TOKENIZED, Field.TermVector.WITH_POSITIONS_OFFSETS));
|
Field.Index.TOKENIZED, Field.TermVector.WITH_POSITIONS_OFFSETS));
|
||||||
|
doc.add(new Field("field", text, Field.Store.YES,
|
||||||
|
Field.Index.TOKENIZED, Field.TermVector.YES));
|
||||||
//System.out.println("Document: " + doc);
|
//System.out.println("Document: " + doc);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
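The swap in setupDoc looks deliberate: fields are now added with "field2"
before "field", i.e. out of name order, so every document built by these tests
exercises the writer's new field-sorting path before term vectors are written:

    // "field" sorts before "field2", yet is added second:
    assert "field".compareTo("field2") < 0;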