From 00da65393cc56a933440796631686e5d113ca338 Mon Sep 17 00:00:00 2001
From: Michael McCandless
Date: Thu, 27 Oct 2011 20:49:13 +0000
Subject: [PATCH] LUCENE-2205: port to trunk in preflex codec; port
 TestPagedBytes and PagedBytes.DataInput/Output to trunk

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1190017 13f79535-47bb-0310-9956-ffa450edef68
---
 lucene/CHANGES.txt                             |   3 +
 .../index/codecs/preflex/TermInfosReader.java  |  76 ++----
 .../codecs/preflex/TermInfosReaderIndex.java   | 252 ++++++++++++++++++
 .../lucene/store/ByteArrayDataInput.java       |   2 +-
 .../org/apache/lucene/util/PagedBytes.java     | 166 +++++++++++-
 .../preflex/TestTermInfosReaderIndex.java      | 193 ++++++++++++++
 .../apache/lucene/util/TestPagedBytes.java     |  64 +++++
 7 files changed, 693 insertions(+), 63 deletions(-)
 create mode 100644 lucene/src/java/org/apache/lucene/index/codecs/preflex/TermInfosReaderIndex.java
 create mode 100644 lucene/src/test/org/apache/lucene/index/codecs/preflex/TestTermInfosReaderIndex.java
 create mode 100644 lucene/src/test/org/apache/lucene/util/TestPagedBytes.java

diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index f3421e54a28..a3ecc854e7b 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -723,6 +723,9 @@ Optimizations
   FilteredQuery/IndexSearcher added by LUCENE-1536 to Lucene 4.0.
   (Uwe Schindler)
 
+* LUCENE-2205: Very substantial (3-5X) reduction in RAM required to hold
+  the terms index on opening an IndexReader (Aaron McCurry via Mike McCandless)
+
 Test Cases
 
 * LUCENE-3420: Disable the finalness checks in TokenStream and Analyzer
diff --git a/lucene/src/java/org/apache/lucene/index/codecs/preflex/TermInfosReader.java b/lucene/src/java/org/apache/lucene/index/codecs/preflex/TermInfosReader.java
index 48352000236..cccfa88ab05 100644
--- a/lucene/src/java/org/apache/lucene/index/codecs/preflex/TermInfosReader.java
+++ b/lucene/src/java/org/apache/lucene/index/codecs/preflex/TermInfosReader.java
@@ -47,9 +47,8 @@ public final class TermInfosReader {
   private final SegmentTermEnum origEnum;
   private final long size;
 
-  private final Term[] indexTerms;
-  private final TermInfo[] indexInfos;
-  private final long[] indexPointers;
+  private final TermInfosReaderIndex index;
+  private final int indexLength;
 
   private final int totalIndexInterval;
 
@@ -118,37 +117,23 @@ public final class TermInfosReader {
       if (indexDivisor != -1) {
         // Load terms index
         totalIndexInterval = origEnum.indexInterval * indexDivisor;
-        final SegmentTermEnum indexEnum = new SegmentTermEnum(directory.openInput(IndexFileNames.segmentFileName(segment, "", PreFlexCodec.TERMS_INDEX_EXTENSION),
-                                                                                  context), fieldInfos, true);
+
+        final String indexFileName = IndexFileNames.segmentFileName(segment, "", PreFlexCodec.TERMS_INDEX_EXTENSION);
+        final SegmentTermEnum indexEnum = new SegmentTermEnum(directory.openInput(indexFileName,
+                                                                                  context), fieldInfos, true);
 
         try {
-          int indexSize = 1+((int)indexEnum.size-1)/indexDivisor;  // otherwise read index
-
-          indexTerms = new Term[indexSize];
-          indexInfos = new TermInfo[indexSize];
-          indexPointers = new long[indexSize];
-
-          for (int i=0;indexEnum.next(); i++) {
-            indexTerms[i] = indexEnum.term();
-            assert indexTerms[i] != null;
-            assert indexTerms[i].text() != null;
-            assert indexTerms[i].field() != null;
-            indexInfos[i] = indexEnum.termInfo();
-            indexPointers[i] = indexEnum.indexPointer;
-
-            for (int j = 1; j < indexDivisor; j++)
-              if (!indexEnum.next())
-                break;
-          }
+          // TermInfosReaderIndex takes the .tii file length as a long, so no cast is needed
+          index = new TermInfosReaderIndex(indexEnum, indexDivisor, dir.fileLength(indexFileName), totalIndexInterval);
+          indexLength = index.length();
         } finally {
           indexEnum.close();
         }
       } else {
         // Do not load terms index:
         totalIndexInterval = -1;
-        indexTerms = null;
-        indexInfos = null;
-        indexPointers = null;
+        index = null;
+        indexLength = -1;
       }
       success = true;
     } finally {
@@ -203,31 +188,6 @@ public final class TermInfosReader {
     }
   }
 
-  /** Returns the offset of the greatest index entry which is less than or equal to term.*/
-  private int getIndexOffset(Term term) {
-    int lo = 0;                                 // binary search indexTerms[]
-    int hi = indexTerms.length - 1;
-
-    while (hi >= lo) {
-      int mid = (lo + hi) >>> 1;
-      assert indexTerms[mid] != null : "indexTerms = " + indexTerms.length + " mid=" + mid;
-      int delta = compareAsUTF16(term, indexTerms[mid]);
-      if (delta < 0)
-        hi = mid - 1;
-      else if (delta > 0)
-        lo = mid + 1;
-      else
-        return mid;
-    }
-    return hi;
-  }
-
-  private void seekEnum(SegmentTermEnum enumerator, int indexOffset) throws IOException {
-    enumerator.seek(indexPointers[indexOffset],
-                    ((long) indexOffset * totalIndexInterval) - 1,
-                    indexTerms[indexOffset], indexInfos[indexOffset]);
-  }
-
   /** Returns the TermInfo for a Term in the set, or null. */
   TermInfo get(Term term) throws IOException {
     return get(term, false);
@@ -272,8 +232,8 @@ public final class TermInfosReader {
             && ((enumerator.prev() != null && compareAsUTF16(term, enumerator.prev())> 0)
                 || compareAsUTF16(term, enumerator.term()) >= 0)) {
           int enumOffset = (int)(enumerator.position/totalIndexInterval)+1;
-          if (indexTerms.length == enumOffset    // but before end of block
-              || compareAsUTF16(term, indexTerms[enumOffset]) < 0) {
+          if (indexLength == enumOffset    // but before end of block
+              || index.compareTo(term, enumOffset) < 0) {
            // no need to seek
 
             final TermInfo ti;
@@ -309,10 +269,10 @@ public final class TermInfosReader {
       indexPos = (int) (tiOrd.termOrd / totalIndexInterval);
     } else {
       // Must do binary search:
-      indexPos = getIndexOffset(term);
+      indexPos = index.getIndexOffset(term);
     }
 
-    seekEnum(enumerator, indexPos);
+    index.seekEnum(enumerator, indexPos);
     enumerator.scanTo(term);
 
     final TermInfo ti;
@@ -352,7 +312,7 @@ public final class TermInfosReader {
   }
 
   private void ensureIndexIsRead() {
-    if (indexTerms == null) {
+    if (index == null) {
       throw new IllegalStateException("terms index was not loaded when this reader was created");
     }
   }
@@ -362,10 +322,10 @@ public final class TermInfosReader {
     if (size == 0) return -1;
 
     ensureIndexIsRead();
-    int indexOffset = getIndexOffset(term);
+    int indexOffset = index.getIndexOffset(term);
 
     SegmentTermEnum enumerator = getThreadResources().termEnum;
-    seekEnum(enumerator, indexOffset);
+    index.seekEnum(enumerator, indexOffset);
 
     while(compareAsUTF16(term, enumerator.term()) > 0 && enumerator.next()) {}
 
diff --git a/lucene/src/java/org/apache/lucene/index/codecs/preflex/TermInfosReaderIndex.java b/lucene/src/java/org/apache/lucene/index/codecs/preflex/TermInfosReaderIndex.java
new file mode 100644
index 00000000000..d384ff9e0f1
--- /dev/null
+++ b/lucene/src/java/org/apache/lucene/index/codecs/preflex/TermInfosReaderIndex.java
@@ -0,0 +1,252 @@
+package org.apache.lucene.index.codecs.preflex;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Comparator;
+import java.util.List;
+
+import org.apache.lucene.index.Term;
+import org.apache.lucene.util.BitUtil;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.PagedBytes.PagedBytesDataInput;
+import org.apache.lucene.util.PagedBytes.PagedBytesDataOutput;
+import org.apache.lucene.util.PagedBytes;
+import org.apache.lucene.util.packed.GrowableWriter;
+import org.apache.lucene.util.packed.PackedInts;
+
+/**
+ * This stores a monotonically increasing set of <Term, TermInfo> pairs in an
+ * index segment. Pairs are accessed either by Term or by ordinal position in
+ * the set. The Terms and TermInfos are actually serialized and stored into a
+ * byte array, and pointers to the position of each are stored in an int array.
+ */
+class TermInfosReaderIndex {
+
+  private static final int MAX_PAGE_BITS = 18; // 256 KB block
+  private Term[] fields;
+  private int totalIndexInterval;
+  private Comparator<BytesRef> comparator = BytesRef.getUTF8SortedAsUTF16Comparator();
+  private final PagedBytesDataInput dataInput;
+  private final PackedInts.Reader indexToDataOffset;
+  private final int indexSize;
+  private final int skipInterval;
+
+  /**
+   * Loads the segment information at segment load time.
+   *
+   * @param indexEnum
+   *          the term enum.
+   * @param indexDivisor
+   *          the index divisor.
+   * @param tiiFileLength
+   *          the size of the tii file, used to approximate the size of the
+   *          buffer.
+   * @param totalIndexInterval
+   *          the total index interval.
+   */
+  TermInfosReaderIndex(SegmentTermEnum indexEnum, int indexDivisor, long tiiFileLength, int totalIndexInterval) throws IOException {
+    this.totalIndexInterval = totalIndexInterval;
+    indexSize = 1 + ((int) indexEnum.size - 1) / indexDivisor;
+    skipInterval = indexEnum.skipInterval;
+    // this is only an initial size, it will be GCed once the build is complete
+    long initialSize = (long) (tiiFileLength * 1.5) / indexDivisor;
+    PagedBytes dataPagedBytes = new PagedBytes(estimatePageBits(initialSize));
+    PagedBytesDataOutput dataOutput = dataPagedBytes.getDataOutput();
+
+    GrowableWriter indexToTerms = new GrowableWriter(4, indexSize, false);
+    String currentField = null;
+    List<String> fieldStrs = new ArrayList<String>();
+    int fieldCounter = -1;
+    for (int i = 0; indexEnum.next(); i++) {
+      Term term = indexEnum.term();
+      if (currentField == null || !currentField.equals(term.field())) {
+        currentField = term.field();
+        fieldStrs.add(currentField);
+        fieldCounter++;
+      }
+      TermInfo termInfo = indexEnum.termInfo();
+      indexToTerms.set(i, dataOutput.getPosition());
+      dataOutput.writeVInt(fieldCounter);
+      dataOutput.writeString(term.text());
+      dataOutput.writeVInt(termInfo.docFreq);
+      if (termInfo.docFreq >= skipInterval) {
+        dataOutput.writeVInt(termInfo.skipOffset);
+      }
+      dataOutput.writeVLong(termInfo.freqPointer);
+      dataOutput.writeVLong(termInfo.proxPointer);
+      dataOutput.writeVLong(indexEnum.indexPointer);
+      for (int j = 1; j < indexDivisor; j++) {
+        if (!indexEnum.next()) {
+          break;
+        }
+      }
+    }
+
+    fields = new Term[fieldStrs.size()];
+    for (int i = 0; i < fields.length; i++) {
+      fields[i] = new Term(fieldStrs.get(i));
+    }
+
+    dataPagedBytes.freeze(true);
+    dataInput = dataPagedBytes.getDataInput();
+    indexToDataOffset = indexToTerms.getMutable();
+  }
+
+  private static int estimatePageBits(long estSize) {
+    return Math.max(Math.min(64 - BitUtil.nlz(estSize), MAX_PAGE_BITS), 4);
+  }
+
+  void seekEnum(SegmentTermEnum enumerator, int indexOffset) throws IOException {
+    PagedBytesDataInput input = (PagedBytesDataInput) dataInput.clone();
+
+    input.setPosition(indexToDataOffset.get(indexOffset));
+
+    // read the term
+    int fieldId = input.readVInt();
+    Term field = fields[fieldId];
+    Term term = new Term(field.field(), input.readString());
+
+    // read the terminfo
+    TermInfo termInfo = new TermInfo();
+    termInfo.docFreq = input.readVInt();
+    if (termInfo.docFreq >= skipInterval) {
+      termInfo.skipOffset = input.readVInt();
+    } else {
+      termInfo.skipOffset = 0;
+    }
+    termInfo.freqPointer = input.readVLong();
+    termInfo.proxPointer = input.readVLong();
+
+    long pointer = input.readVLong();
+
+    // perform the seek
+    enumerator.seek(pointer, ((long) indexOffset * totalIndexInterval) - 1, term, termInfo);
+  }
+
+  /**
+   * Binary search for the given term.
+   *
+   * @param term
+   *          the term to locate.
+   * @throws IOException
+   */
+  int getIndexOffset(Term term) throws IOException {
+    int lo = 0;
+    int hi = indexSize - 1;
+    PagedBytesDataInput input = (PagedBytesDataInput) dataInput.clone();
+    BytesRef scratch = new BytesRef();
+    while (hi >= lo) {
+      int mid = (lo + hi) >>> 1;
+      int delta = compareTo(term, mid, input, scratch);
+      if (delta < 0)
+        hi = mid - 1;
+      else if (delta > 0)
+        lo = mid + 1;
+      else
+        return mid;
+    }
+    return hi;
+  }
+
+  /**
+   * Gets the term at the given position. For testing.
+   *
+   * @param termIndex
+   *          the position to read the term from the index.
+   * @return the term.
+   * @throws IOException
+   */
+  Term getTerm(int termIndex) throws IOException {
+    PagedBytesDataInput input = (PagedBytesDataInput) dataInput.clone();
+    input.setPosition(indexToDataOffset.get(termIndex));
+
+    // read the term
+    int fieldId = input.readVInt();
+    Term field = fields[fieldId];
+    return new Term(field.field(), input.readString());
+  }
+
+  /**
+   * Returns the number of terms.
+   *
+   * @return the number of terms in the index.
+   */
+  int length() {
+    return indexSize;
+  }
+
+  /**
+   * Compares the given term against the term in the index specified by the
+   * term index, i.e. it returns a negative value when the given term is less
+   * than the index term, zero when they are equal, and a positive value
+   * otherwise.
+   *
+   * @param term
+   *          the given term.
+   * @param termIndex
+   *          the index of the term to compare.
+   * @return the comparison result.
+   * @throws IOException
+   */
+  int compareTo(Term term, int termIndex) throws IOException {
+    return compareTo(term, termIndex, (PagedBytesDataInput) dataInput.clone(), new BytesRef());
+  }
+
+  /**
+   * Compares the fields of the terms first, and if they are not equal returns
+   * that result. If the fields are equal, compares the terms' string values.
+   *
+   * @param term
+   *          the term to compare.
+   * @param termIndex
+   *          the position of the term in the input to compare.
+   * @param input
+   *          the input buffer.
+   * @return the comparison result.
+   * @throws IOException
+   */
+  private int compareTo(Term term, int termIndex, PagedBytesDataInput input, BytesRef reuse) throws IOException {
+    // if term field does not equal mid's field index, then compare fields
+    // else if they are equal, compare term's string values...
+    int c = compareField(term, termIndex, input);
+    if (c == 0) {
+      reuse.length = input.readVInt();
+      reuse.grow(reuse.length);
+      input.readBytes(reuse.bytes, 0, reuse.length);
+      return comparator.compare(term.bytes(), reuse);
+    }
+    return c;
+  }
+
+  /**
+   * Compares the fields before checking the text of the terms.
+   *
+   * @param term
+   *          the given term.
+   * @param termIndex
+   *          the index of the term in the data block.
+   * @param input
+   *          the data block.
+   * @return the comparison result.
+   * @throws IOException
+   */
+  private int compareField(Term term, int termIndex, PagedBytesDataInput input) throws IOException {
+    input.setPosition(indexToDataOffset.get(termIndex));
+    return term.field().compareTo(fields[input.readVInt()].field());
+  }
+}
diff --git a/lucene/src/java/org/apache/lucene/store/ByteArrayDataInput.java b/lucene/src/java/org/apache/lucene/store/ByteArrayDataInput.java
index 0779a168f33..b968f4eb298 100644
--- a/lucene/src/java/org/apache/lucene/store/ByteArrayDataInput.java
+++ b/lucene/src/java/org/apache/lucene/store/ByteArrayDataInput.java
@@ -52,7 +52,7 @@ public final class ByteArrayDataInput extends DataInput {
   public int getPosition() {
     return pos;
   }
-
+  
   public void setPosition(int pos) {
     this.pos = pos;
   }
diff --git a/lucene/src/java/org/apache/lucene/util/PagedBytes.java b/lucene/src/java/org/apache/lucene/util/PagedBytes.java
index d53e9b3140d..2e29464b90d 100644
--- a/lucene/src/java/org/apache/lucene/util/PagedBytes.java
+++ b/lucene/src/java/org/apache/lucene/util/PagedBytes.java
@@ -17,12 +17,14 @@ package org.apache.lucene.util;
  * limitations under the License.
  */
 
-import org.apache.lucene.store.IndexInput;
-
-import java.util.List;
-import java.util.ArrayList;
 import java.io.Closeable;
 import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.lucene.store.DataInput;
+import org.apache.lucene.store.DataOutput;
+import org.apache.lucene.store.IndexInput;
 
 /** Represents a logical byte[] as a series of pages.  You
 *  can write-once into the logical byte[] (append only),
@@ -37,6 +39,8 @@ public final class PagedBytes {
   private final int blockSize;
   private final int blockBits;
   private final int blockMask;
+  private boolean didSkipBytes;
+  private boolean frozen;
   private int upto;
   private byte[] currentBlock;
 
@@ -320,6 +324,7 @@ public final class PagedBytes {
       if (currentBlock != null) {
         blocks.add(currentBlock);
         blockEnd.add(upto);
+        didSkipBytes = true;
       }
       currentBlock = new byte[blockSize];
       upto = 0;
@@ -338,6 +343,12 @@ public final class PagedBytes {
 
   /** Commits final byte[], trimming it if necessary and if trim=true */
   public Reader freeze(boolean trim) {
+    if (frozen) {
+      throw new IllegalStateException("already frozen");
+    }
+    if (didSkipBytes) {
+      throw new IllegalStateException("cannot freeze when copy(BytesRef, BytesRef) was used");
+    }
     if (trim && upto < blockSize) {
       final byte[] newBlock = new byte[upto];
       System.arraycopy(currentBlock, 0, newBlock, 0, upto);
@@ -348,6 +359,7 @@ public final class PagedBytes {
     }
     blocks.add(currentBlock);
     blockEnd.add(upto);
+    frozen = true;
     currentBlock = null;
     return new Reader(this);
   }
@@ -389,4 +401,150 @@ public final class PagedBytes {
 
     return pointer;
   }
+
+  public final class PagedBytesDataInput extends DataInput {
+    private int currentBlockIndex;
+    private int currentBlockUpto;
+    private byte[] currentBlock;
+
+    PagedBytesDataInput() {
+      currentBlock = blocks.get(0);
+    }
+
+    @Override
+    public Object clone() {
+      PagedBytesDataInput clone = getDataInput();
+      clone.setPosition(getPosition());
+      return clone;
+    }
+
+    /** Returns the current byte position. */
+    public long getPosition() {
+      return (long) currentBlockIndex * blockSize + currentBlockUpto;
+    }
+
+    /** Seek to a position previously obtained from
+     *  {@link #getPosition}.
+     */
+    public void setPosition(long pos) {
+      currentBlockIndex = (int) (pos >> blockBits);
+      currentBlock = blocks.get(currentBlockIndex);
+      currentBlockUpto = (int) (pos & blockMask);
+    }
+
+    @Override
+    public byte readByte() {
+      if (currentBlockUpto == blockSize) {
+        nextBlock();
+      }
+      return currentBlock[currentBlockUpto++];
+    }
+
+    @Override
+    public void readBytes(byte[] b, int offset, int len) {
+      final int offsetEnd = offset + len;
+      while (true) {
+        final int blockLeft = blockSize - currentBlockUpto;
+        final int left = offsetEnd - offset;
+        if (blockLeft < left) {
+          System.arraycopy(currentBlock, currentBlockUpto,
+                           b, offset,
+                           blockLeft);
+          nextBlock();
+          offset += blockLeft;
+        } else {
+          // Last block
+          System.arraycopy(currentBlock, currentBlockUpto,
+                           b, offset,
+                           left);
+          currentBlockUpto += left;
+          break;
+        }
+      }
+    }
+
+    private void nextBlock() {
+      currentBlockIndex++;
+      currentBlockUpto = 0;
+      currentBlock = blocks.get(currentBlockIndex);
+    }
+  }
+
+  public final class PagedBytesDataOutput extends DataOutput {
+    @Override
+    public void writeByte(byte b) {
+      if (upto == blockSize) {
+        if (currentBlock != null) {
+          blocks.add(currentBlock);
+          blockEnd.add(upto);
+        }
+        currentBlock = new byte[blockSize];
+        upto = 0;
+      }
+      currentBlock[upto++] = b;
+    }
+
+    @Override
+    public void writeBytes(byte[] b, int offset, int length) throws IOException {
+      if (length == 0) {
+        return;
+      }
+
+      if (upto == blockSize) {
+        if (currentBlock != null) {
+          blocks.add(currentBlock);
+          blockEnd.add(upto);
+        }
+        currentBlock = new byte[blockSize];
+        upto = 0;
+      }
+
+      final int offsetEnd = offset + length;
+      while(true) {
+        final int left = offsetEnd - offset;
+        final int blockLeft = blockSize - upto;
+        if (blockLeft < left) {
+          System.arraycopy(b, offset, currentBlock, upto, blockLeft);
+          blocks.add(currentBlock);
+          blockEnd.add(blockSize);
+          currentBlock = new byte[blockSize];
+          upto = 0;
+          offset += blockLeft;
+        } else {
+          // Last block
+          System.arraycopy(b, offset, currentBlock, upto, left);
+          upto += left;
+          break;
+        }
+      }
+    }
+
+    /** Return the current byte position. */
+    public long getPosition() {
+      if (currentBlock == null) {
+        return 0;
+      } else {
+        return (long) blocks.size() * blockSize + upto;
+      }
+    }
+  }
+
+  /** Returns a DataInput to read values from this
+   *  PagedBytes instance. */
+  public PagedBytesDataInput getDataInput() {
+    if (!frozen) {
+      throw new IllegalStateException("must call freeze() before getDataInput");
+    }
+    return new PagedBytesDataInput();
+  }
+
+  /** Returns a DataOutput that you may use to write into
+   *  this PagedBytes instance.  If you do this, you should
+   *  not call the other writing methods (eg, copy);
+   *  results are undefined. */
+  public PagedBytesDataOutput getDataOutput() {
+    if (frozen) {
+      throw new IllegalStateException("cannot get DataOutput after freeze()");
+    }
+    return new PagedBytesDataOutput();
+  }
 }
diff --git a/lucene/src/test/org/apache/lucene/index/codecs/preflex/TestTermInfosReaderIndex.java b/lucene/src/test/org/apache/lucene/index/codecs/preflex/TestTermInfosReaderIndex.java
new file mode 100644
index 00000000000..d42efa35281
--- /dev/null
+++ b/lucene/src/test/org/apache/lucene/index/codecs/preflex/TestTermInfosReaderIndex.java
@@ -0,0 +1,193 @@
+package org.apache.lucene.index.codecs.preflex;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+import java.util.Random;
+
+import org.apache.lucene.analysis.MockAnalyzer;
+import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.StringField;
+import org.apache.lucene.index.CorruptIndexException;
+import org.apache.lucene.index.FieldInfos;
+import org.apache.lucene.index.FieldsEnum;
+import org.apache.lucene.index.IndexFileNames;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.IndexWriterConfig;
+import org.apache.lucene.index.LogMergePolicy;
+import org.apache.lucene.index.MultiFields;
+import org.apache.lucene.index.RandomIndexWriter;
+import org.apache.lucene.index.SegmentReader;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.index.TermsEnum;
+import org.apache.lucene.index.codecs.Codec;
+import org.apache.lucene.index.codecs.CodecProvider;
+import org.apache.lucene.index.codecs.CoreCodecProvider;
+import org.apache.lucene.index.codecs.preflexrw.PreFlexRWCodec;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.TermQuery;
+import org.apache.lucene.search.TopDocs;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.IndexInput;
+import org.apache.lucene.store.LockObtainFailedException;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.LuceneTestCase;
+import org.apache.lucene.util._TestUtil;
+
+public class TestTermInfosReaderIndex extends LuceneTestCase {
+
+  private static final int NUMBER_OF_DOCUMENTS = 1000;
+  private static final int NUMBER_OF_FIELDS = 100;
+  private TermInfosReaderIndex index;
+  private Directory directory;
+  private SegmentTermEnum termEnum;
+  private int indexDivisor;
+  private int termIndexInterval;
+  private IndexReader reader;
+  private List<Term> sampleTerms;
+
+  @Override
+  public void setUp() throws Exception {
+    super.setUp();
+    indexDivisor = _TestUtil.nextInt(random, 1, 10);
+    directory = newDirectory();
+    termIndexInterval = populate(directory);
+
+    IndexReader r0 = IndexReader.open(directory);
+    SegmentReader r = (SegmentReader) r0.getSequentialSubReaders()[0];
+    String segment = r.getSegmentName();
+    r.close();
+
+    FieldInfos fieldInfos = new FieldInfos(directory, IndexFileNames.segmentFileName(segment, "", IndexFileNames.FIELD_INFOS_EXTENSION));
+    String segmentFileName = IndexFileNames.segmentFileName(segment, "", PreFlexCodec.TERMS_INDEX_EXTENSION);
+    long tiiFileLength = directory.fileLength(segmentFileName);
+    IndexInput input = directory.openInput(segmentFileName, newIOContext(random));
+    termEnum = new SegmentTermEnum(directory.openInput(IndexFileNames.segmentFileName(segment, "", PreFlexCodec.TERMS_EXTENSION), newIOContext(random)), fieldInfos, false);
+    int totalIndexInterval = termEnum.indexInterval * indexDivisor;
+
+    SegmentTermEnum indexEnum = new SegmentTermEnum(input, fieldInfos, true);
+    index = new TermInfosReaderIndex(indexEnum, indexDivisor, tiiFileLength, totalIndexInterval);
+    indexEnum.close();
+    input.close();
+
+    reader = IndexReader.open(directory);
+    sampleTerms = sample(reader, 1000);
+  }
+
+  @Override
+  public void tearDown() throws Exception {
+    termEnum.close();
+    reader.close();
+    directory.close();
+    super.tearDown();
+  }
+
+  public void testSeekEnum() throws CorruptIndexException, IOException {
+    int indexPosition = 3;
+    SegmentTermEnum clone = (SegmentTermEnum) termEnum.clone();
+    Term term = findTermThatWouldBeAtIndex(clone, indexPosition);
+    SegmentTermEnum enumerator = clone;
+    index.seekEnum(enumerator, indexPosition);
+    assertEquals(term, enumerator.term());
+    clone.close();
+  }
+
+  public void testCompareTo() throws IOException {
+    Term term = new Term("field" + random.nextInt(NUMBER_OF_FIELDS), getText());
+    for (int i = 0; i < index.length(); i++) {
+      Term t = index.getTerm(i);
+      int compareTo = term.compareTo(t);
+      assertEquals(compareTo, index.compareTo(term, i));
+    }
+  }
+
+  public void testRandomSearchPerformance() throws CorruptIndexException, IOException {
+    IndexSearcher searcher = new IndexSearcher(reader);
+    for (Term t : sampleTerms) {
+      TermQuery query = new TermQuery(t);
+      TopDocs topDocs = searcher.search(query, 10);
+      assertTrue(topDocs.totalHits > 0);
+    }
+    searcher.close();
+  }
+
+  private List<Term> sample(IndexReader reader, int size) throws IOException {
+    List<Term> sample = new ArrayList<Term>();
+    // use the inherited LuceneTestCase random so the sample is reproducible
+    FieldsEnum fieldsEnum = MultiFields.getFields(reader).iterator();
+    String field;
+    while((field = fieldsEnum.next()) != null) {
+      TermsEnum terms = fieldsEnum.terms();
+      while (terms.next() != null) {
+        if (sample.size() >= size) {
+          int pos = random.nextInt(size);
+          sample.set(pos, new Term(field, terms.term()));
+        } else {
+          sample.add(new Term(field, terms.term()));
+        }
+      }
+    }
+    Collections.shuffle(sample, random);
+    return sample;
+  }
+
+  private Term findTermThatWouldBeAtIndex(SegmentTermEnum termEnum, int index) throws IOException {
+    int termPosition = index * termIndexInterval * indexDivisor;
+    for (int i = 0; i < termPosition; i++) {
+      if (!termEnum.next()) {
+        fail("Should not have run out of terms.");
+      }
+    }
+    return termEnum.term();
+  }
+
+  private int populate(Directory directory) throws CorruptIndexException, LockObtainFailedException, IOException {
+    IndexWriterConfig config = newIndexWriterConfig(TEST_VERSION_CURRENT,
+        new MockAnalyzer(random, MockTokenizer.KEYWORD, false));
+    CoreCodecProvider cp = new CoreCodecProvider();
+    cp.unregister(cp.lookup("PreFlex"));
+    cp.register(new PreFlexRWCodec());
+    cp.setDefaultFieldCodec("PreFlex");
+    config.setCodecProvider(cp);
+    // turn off compound file, this test will open some index files directly.
+    LogMergePolicy mp = newLogMergePolicy();
+    mp.setUseCompoundFile(false);
+    config.setMergePolicy(mp);
+
+    RandomIndexWriter writer = new RandomIndexWriter(random, directory, config);
+    for (int i = 0; i < NUMBER_OF_DOCUMENTS; i++) {
+      Document document = new Document();
+      for (int f = 0; f < NUMBER_OF_FIELDS; f++) {
+        document.add(newField("field" + f, getText(), StringField.TYPE_UNSTORED));
+      }
+      writer.addDocument(document);
+    }
+    writer.optimize();
+    writer.close();
+    return config.getTermIndexInterval();
+  }
+
+  private String getText() {
+    return Long.toString(random.nextLong(), Character.MAX_RADIX);
+  }
+}
diff --git a/lucene/src/test/org/apache/lucene/util/TestPagedBytes.java b/lucene/src/test/org/apache/lucene/util/TestPagedBytes.java
new file mode 100644
index 00000000000..5205300b027
--- /dev/null
+++ b/lucene/src/test/org/apache/lucene/util/TestPagedBytes.java
@@ -0,0 +1,64 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.util;
+
+import java.util.Arrays;
+
+import org.apache.lucene.store.DataInput;
+import org.apache.lucene.store.DataOutput;
+
+public class TestPagedBytes extends LuceneTestCase {
+
+  public void testDataInputOutput() throws Exception {
+    for(int iter=0;iter<5*RANDOM_MULTIPLIER;iter++) {
+      final PagedBytes p = new PagedBytes(_TestUtil.nextInt(random, 1, 20));
+      final DataOutput out = p.getDataOutput();
+      final int numBytes = random.nextInt(10000000);
+
+      final byte[] answer = new byte[numBytes];
+      random.nextBytes(answer);
+      int written = 0;
+      while(written < numBytes) {
+        if (random.nextInt(10) == 7) {
+          out.writeByte(answer[written++]);
+        } else {
+          int chunk = Math.min(random.nextInt(1000), numBytes - written);
+          out.writeBytes(answer, written, chunk);
+          written += chunk;
+        }
+      }
+
+      p.freeze(random.nextBoolean());
+
+      final DataInput in = p.getDataInput();
+
+      final byte[] verify = new byte[numBytes];
+      int read = 0;
+      while(read < numBytes) {
+        if (random.nextInt(10) == 7) {
+          verify[read++] = in.readByte();
+        } else {
+          int chunk = Math.min(random.nextInt(1000), numBytes - read);
+          in.readBytes(verify, read, chunk);
+          read += chunk;
+        }
+      }
+      assertTrue(Arrays.equals(answer, verify));
+    }
+  }
+}
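
A few reviewer sketches against the classes added above follow; they are illustrative only and not part of the patch.

Each entry that the TermInfosReaderIndex constructor appends to the PagedBytes buffer is a variable-length record, with its start offset recorded in indexToDataOffset; seekEnum(), getTerm(), and compareTo() decode lazily from that offset. A minimal sketch of one record's wire layout, using the same PagedBytes API (the class name, field ordinal, and numeric values here are made up for illustration):

  import org.apache.lucene.util.PagedBytes;
  import org.apache.lucene.util.PagedBytes.PagedBytesDataInput;
  import org.apache.lucene.util.PagedBytes.PagedBytesDataOutput;

  // Hypothetical demo class, not part of the patch.
  public class TermsIndexEntryLayoutDemo {
    public static void main(String[] args) throws Exception {
      PagedBytes pages = new PagedBytes(15);            // 32 KB pages
      PagedBytesDataOutput out = pages.getDataOutput();

      long start = out.getPosition(); // what indexToDataOffset records for this entry
      out.writeVInt(3);               // ordinal of the term's field in the fields[] table
      out.writeString("lucene");      // term text
      out.writeVInt(128);             // docFreq
      out.writeVInt(17);              // skipOffset: written only when docFreq >= skipInterval
      out.writeVLong(1024L);          // freqPointer
      out.writeVLong(2048L);          // proxPointer
      out.writeVLong(4096L);          // indexPointer into the terms dictionary

      pages.freeze(true);

      PagedBytesDataInput in = pages.getDataInput();
      in.setPosition(start);          // random access, as in seekEnum()/getTerm()
      System.out.println(in.readVInt() + " " + in.readString()); // prints: 3 lucene
    }
  }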
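
On the comparator choice: the preflex terms dictionary is sorted in UTF-16 code unit order, but the index now stores term text as UTF-8 bytes, and raw byte order disagrees with UTF-16 order for supplementary characters. That is why TermInfosReaderIndex compares through BytesRef.getUTF8SortedAsUTF16Comparator() rather than comparing bytes directly. A small sketch of the discrepancy (hypothetical demo class, not part of the patch):

  import java.util.Comparator;
  import org.apache.lucene.util.BytesRef;

  // Hypothetical demo class, not part of the patch.
  public class Utf16OrderDemo {
    public static void main(String[] args) {
      String supp = "\uD800\uDC00";  // U+10000, a supplementary character
      String pua  = "\uE000";        // a BMP private-use character

      // UTF-16 code unit order, the order of the preflex terms dictionary:
      System.out.println(supp.compareTo(pua));      // negative: supp sorts first

      BytesRef a = new BytesRef(supp);  // UTF-8 bytes F0 90 80 80
      BytesRef b = new BytesRef(pua);   // UTF-8 bytes EE 80 80

      // raw UTF-8 byte order disagrees:
      Comparator<BytesRef> unicode = BytesRef.getUTF8SortedAsUnicodeComparator();
      System.out.println(unicode.compare(a, b));    // positive: supp sorts last

      // the comparator used by TermInfosReaderIndex restores UTF-16 order:
      Comparator<BytesRef> utf16 = BytesRef.getUTF8SortedAsUTF16Comparator();
      System.out.println(utf16.compare(a, b));      // negative again
    }
  }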
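
estimatePageBits() derives the page size from the estimated total data size: 64 - nlz(estSize) is the bit width of estSize (roughly its ceil(log2)), clamped between 4 and MAX_PAGE_BITS = 18, so pages range from 16 bytes to 256 KB. A worked example, with the JDK's Long.numberOfLeadingZeros standing in for BitUtil.nlz (hypothetical demo class, not part of the patch):

  // Hypothetical demo class, not part of the patch.
  public class PageBitsDemo {
    static int estimatePageBits(long estSize) {
      // same formula as TermInfosReaderIndex.estimatePageBits
      return Math.max(Math.min(64 - Long.numberOfLeadingZeros(estSize), 18), 4);
    }

    public static void main(String[] args) {
      // a 1 MB .tii file with indexDivisor=1 estimates 1.5 MB of data:
      // 64 - nlz(1572864) = 21, clamped to 18 -> 2^18 = 256 KB pages
      System.out.println(estimatePageBits(1572864L));  // prints: 18
      // small estimates get small pages, floored at 2^4 = 16 bytes:
      System.out.println(estimatePageBits(100L));      // prints: 7 (128-byte pages)
    }
  }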
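
Finally, the freeze() lifecycle added to PagedBytes is strictly write-once: getDataOutput() is only legal before freeze(), freeze() may be called exactly once, and getDataInput() is only legal afterwards, with IllegalStateException thrown otherwise. A sketch of those state checks (hypothetical demo class, not part of the patch; assumes the patched PagedBytes above):

  import org.apache.lucene.store.DataInput;
  import org.apache.lucene.store.DataOutput;
  import org.apache.lucene.util.PagedBytes;

  // Hypothetical demo class, not part of the patch.
  public class PagedBytesLifecycleDemo {
    public static void main(String[] args) throws Exception {
      PagedBytes p = new PagedBytes(10);   // 1 KB pages
      DataOutput out = p.getDataOutput();  // legal: not yet frozen
      out.writeVInt(42);
      out.writeString("hello");

      p.freeze(true);                      // write-once: exactly one freeze

      try {
        p.getDataOutput();                 // too late: already frozen
      } catch (IllegalStateException expected) {
        System.out.println(expected.getMessage());
      }
      try {
        p.freeze(true);                    // a second freeze is also rejected
      } catch (IllegalStateException expected) {
        System.out.println(expected.getMessage());
      }

      DataInput in = p.getDataInput();     // legal only after freeze()
      System.out.println(in.readVInt() + " " + in.readString()); // prints: 42 hello
    }
  }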