LUCENE-5611: indexing optimizations, dont compute CRC for internal-use of RAMOutputStream, dont do heavy per-term stuff in skipper until we actually must buffer skipdata

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene5611@1590858 13f79535-47bb-0310-9956-ffa450edef68
2014-04-29 01:49:24 +00:00 · 2014-04-29 01:49:24 +00:00 · c4b632ef7f
parent 48e9fa5e60
commit c4b632ef7f
6 changed files with 58 additions and 19 deletions
--- a/lucene/core/src/java/org/apache/lucene/codecs/lucene41/Lucene41SkipWriter.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene41/Lucene41SkipWriter.java
@ -88,27 +88,51 @@ final class Lucene41SkipWriter extends MultiLevelSkipListWriter {
    this.fieldHasOffsets = fieldHasOffsets;
    this.fieldHasPayloads = fieldHasPayloads;
  }
+  
+  // tricky: we only skip data for blocks (terms with more than 128 docs), but re-init'ing the skipper 
+  // is pretty slow for rare terms in large segments as we have to fill O(log #docs in segment) of junk.
+  // this is the vast majority of terms (worst case: ID field or similar).  so in resetSkip() we save 
+  // away the previous pointers, and lazy-init only if we need to buffer skip data for the term.
+  private boolean initialized;
+  long lastDocFP;
+  long lastPosFP;
+  long lastPayFP;

  @Override
  public void resetSkip() {
-    super.resetSkip();
-    Arrays.fill(lastSkipDoc, 0);
-    Arrays.fill(lastSkipDocPointer, docOut.getFilePointer());
+    lastDocFP = docOut.getFilePointer();
    if (fieldHasPositions) {
-      Arrays.fill(lastSkipPosPointer, posOut.getFilePointer());
-      if (fieldHasPayloads) {
-        Arrays.fill(lastPayloadByteUpto, 0);
-      }
+      lastPosFP = posOut.getFilePointer();
      if (fieldHasOffsets || fieldHasPayloads) {
-        Arrays.fill(lastSkipPayPointer, payOut.getFilePointer());
+        lastPayFP = payOut.getFilePointer();
      }
    }
+    initialized = false;
+  }
+  
+  public void initSkip() {
+    if (!initialized) {
+      super.resetSkip();
+      Arrays.fill(lastSkipDoc, 0);
+      Arrays.fill(lastSkipDocPointer, lastDocFP);
+      if (fieldHasPositions) {
+        Arrays.fill(lastSkipPosPointer, lastPosFP);
+        if (fieldHasPayloads) {
+          Arrays.fill(lastPayloadByteUpto, 0);
+        }
+        if (fieldHasOffsets || fieldHasPayloads) {
+          Arrays.fill(lastSkipPayPointer, lastPayFP);
+        }
+      }
+      initialized = true;
+    }
  }

  /**
   * Sets the values for the current skip data. 
   */
  public void bufferSkip(int doc, int numDocs, long posFP, long payFP, int posBufferUpto, int payloadByteUpto) throws IOException {
+    initSkip();
    this.curDoc = doc;
    this.curDocPointer = docOut.getFilePointer();
    this.curPosPointer = posFP;
--- a/lucene/core/src/java/org/apache/lucene/index/PrefixCodedTerms.java
+++ b/lucene/core/src/java/org/apache/lucene/index/PrefixCodedTerms.java
@ -97,7 +97,7 @@ class PrefixCodedTerms implements Iterable<Term> {
  /** Builds a PrefixCodedTerms: call add repeatedly, then finish. */
  public static class Builder {
    private RAMFile buffer = new RAMFile();
-    private RAMOutputStream output = new RAMOutputStream(buffer);
+    private RAMOutputStream output = new RAMOutputStream(buffer, false);
    private Term lastTerm = new Term("");

    /** add a term */
--- a/lucene/core/src/java/org/apache/lucene/store/RAMDirectory.java
+++ b/lucene/core/src/java/org/apache/lucene/store/RAMDirectory.java
@ -171,7 +171,7 @@ public class RAMDirectory extends BaseDirectory {
      existing.directory = null;
    }
    fileMap.put(name, file);
-    return new RAMOutputStream(file);
+    return new RAMOutputStream(file, true);
  }

  /**
--- a/lucene/core/src/java/org/apache/lucene/store/RAMOutputStream.java
+++ b/lucene/core/src/java/org/apache/lucene/store/RAMOutputStream.java
@ -38,20 +38,25 @@ public class RAMOutputStream extends IndexOutput {
  private long bufferStart;
  private int bufferLength;
  
-  private Checksum crc = new BufferedChecksum(new CRC32());
+  private final Checksum crc;

  /** Construct an empty output buffer. */
  public RAMOutputStream() {
-    this(new RAMFile());
+    this(new RAMFile(), false);
  }

-  public RAMOutputStream(RAMFile f) {
+  public RAMOutputStream(RAMFile f, boolean checksum) {
    file = f;

    // make sure that we switch to the
    // first needed buffer lazily
    currentBufferIndex = -1;
    currentBuffer = null;
+    if (checksum) {
+      crc = new BufferedChecksum(new CRC32());
+    } else {
+      crc = null;
+    }
  }

  /** Copy the current contents of this buffer to the named output. */
@ -99,7 +104,9 @@ public class RAMOutputStream extends IndexOutput {
    bufferStart = 0;
    bufferLength = 0;
    file.setLength(0);
-    crc.reset();
+    if (crc != null) {
+      crc.reset();
+    }
  }

  @Override
@ -113,14 +120,18 @@ public class RAMOutputStream extends IndexOutput {
      currentBufferIndex++;
      switchCurrentBuffer();
    }
-    crc.update(b);
+    if (crc != null) {
+      crc.update(b);
+    }
    currentBuffer[bufferPosition++] = b;
  }

  @Override
  public void writeBytes(byte[] b, int offset, int len) throws IOException {
    assert b != null;
-    crc.update(b, offset, len);
+    if (crc != null) {
+      crc.update(b, offset, len);
+    }
    while (len > 0) {
      if (bufferPosition ==  bufferLength) {
        currentBufferIndex++;
@ -171,6 +182,10 @@ public class RAMOutputStream extends IndexOutput {

  @Override
  public long getChecksum() throws IOException {
-    return crc.getValue();
+    if (crc == null) {
+      throw new IllegalStateException("internal RAMOutputStream created with checksum disabled");
+    } else {
+      return crc.getValue();
+    }
  }
 }
--- a/lucene/core/src/test/org/apache/lucene/store/TestHugeRamFile.java
+++ b/lucene/core/src/test/org/apache/lucene/store/TestHugeRamFile.java
@ -54,7 +54,7 @@ public class TestHugeRamFile extends LuceneTestCase {
  public void testHugeFile() throws IOException {
    DenseRAMFile f = new DenseRAMFile();
    // output part
-    RAMOutputStream out = new RAMOutputStream(f);
+    RAMOutputStream out = new RAMOutputStream(f, true);
    byte b1[] = new byte[RAMOutputStream.BUFFER_SIZE];
    byte b2[] = new byte[RAMOutputStream.BUFFER_SIZE / 3];
    for (int i = 0; i < b1.length; i++) {
--- a/lucene/misc/src/java/org/apache/lucene/index/sorter/SortingAtomicReader.java
+++ b/lucene/misc/src/java/org/apache/lucene/index/sorter/SortingAtomicReader.java
@ -582,7 +582,7 @@ public class SortingAtomicReader extends FilterAtomicReader {
        file = new RAMFile();
        sorter = new DocOffsetSorter(maxDoc);
      }
-      final IndexOutput out = new RAMOutputStream(file);
+      final IndexOutput out = new RAMOutputStream(file, false);
      int doc;
      int i = 0;
      while ((doc = in.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {