use less ram for SimpleText stored fields and vectors

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1395736 13f79535-47bb-0310-9956-ffa450edef68
2012-10-08 19:31:33 +00:00 · 2012-10-08 19:31:33 +00:00 · 42ddef3b9e
parent 0933faf06b
commit 42ddef3b9e
2 changed files with 21 additions and 17 deletions
--- a/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextStoredFieldsReader.java
+++ b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextStoredFieldsReader.java
@ -18,7 +18,6 @@ package org.apache.lucene.codecs.simpletext;
 */

 import java.io.IOException;
-import java.util.ArrayList;

 import org.apache.lucene.codecs.StoredFieldsReader;
 import org.apache.lucene.index.FieldInfo;
@ -46,7 +45,7 @@ import static org.apache.lucene.codecs.simpletext.SimpleTextStoredFieldsWriter.*
 * @lucene.experimental
 */
 public class SimpleTextStoredFieldsReader extends StoredFieldsReader {
-  private ArrayList<Long> offsets; /* docid -> offset in .fld file */
+  private long offsets[]; /* docid -> offset in .fld file */
  private IndexInput in;
  private BytesRef scratch = new BytesRef();
  private CharsRef scratchUTF16 = new CharsRef();
@ -65,11 +64,11 @@ public class SimpleTextStoredFieldsReader extends StoredFieldsReader {
        } catch (Throwable t) {} // ensure we throw our original exception
      }
    }
-    readIndex();
+    readIndex(si.getDocCount());
  }
  
  // used by clone
-  SimpleTextStoredFieldsReader(ArrayList<Long> offsets, IndexInput in, FieldInfos fieldInfos) {
+  SimpleTextStoredFieldsReader(long offsets[], IndexInput in, FieldInfos fieldInfos) {
    this.offsets = offsets;
    this.in = in;
    this.fieldInfos = fieldInfos;
@ -78,19 +77,22 @@ public class SimpleTextStoredFieldsReader extends StoredFieldsReader {
  // we don't actually write a .fdx-like index, instead we read the 
  // stored fields file in entirety up-front and save the offsets 
  // so we can seek to the documents later.
-  private void readIndex() throws IOException {
-    offsets = new ArrayList<Long>();
+  private void readIndex(int size) throws IOException {
+    offsets = new long[size];
+    int upto = 0;
    while (!scratch.equals(END)) {
      readLine();
      if (StringHelper.startsWith(scratch, DOC)) {
-        offsets.add(in.getFilePointer());
+        offsets[upto] = in.getFilePointer();
+        upto++;
      }
    }
+    assert upto == offsets.length;
  }
  
  @Override
  public void visitDocument(int n, StoredFieldVisitor visitor) throws IOException {
-    in.seek(offsets.get(n));
+    in.seek(offsets[n]);
    readLine();
    assert StringHelper.startsWith(scratch, NUM);
    int numFields = parseIntAt(NUM.length);
--- a/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextTermVectorsReader.java
+++ b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextTermVectorsReader.java
@ -18,7 +18,6 @@ package org.apache.lucene.codecs.simpletext;
 */

 import java.io.IOException;
-import java.util.ArrayList;
 import java.util.Collections;
 import java.util.Comparator;
 import java.util.Iterator;
@ -54,7 +53,7 @@ import static org.apache.lucene.codecs.simpletext.SimpleTextTermVectorsWriter.*;
 * @lucene.experimental
 */
 public class SimpleTextTermVectorsReader extends TermVectorsReader {
-  private ArrayList<Long> offsets; /* docid -> offset in .vec file */
+  private long offsets[]; /* docid -> offset in .vec file */
  private IndexInput in;
  private BytesRef scratch = new BytesRef();
  private CharsRef scratchUTF16 = new CharsRef();
@ -71,11 +70,11 @@ public class SimpleTextTermVectorsReader extends TermVectorsReader {
        } catch (Throwable t) {} // ensure we throw our original exception
      }
    }
-    readIndex();
+    readIndex(si.getDocCount());
  }
  
  // used by clone
-  SimpleTextTermVectorsReader(ArrayList<Long> offsets, IndexInput in) {
+  SimpleTextTermVectorsReader(long offsets[], IndexInput in) {
    this.offsets = offsets;
    this.in = in;
  }
@ -83,26 +82,29 @@ public class SimpleTextTermVectorsReader extends TermVectorsReader {
  // we don't actually write a .tvx-like index, instead we read the 
  // vectors file in entirety up-front and save the offsets 
  // so we can seek to the data later.
-  private void readIndex() throws IOException {
-    offsets = new ArrayList<Long>();
+  private void readIndex(int maxDoc) throws IOException {
+    offsets = new long[maxDoc];
+    int upto = 0;
    while (!scratch.equals(END)) {
      readLine();
      if (StringHelper.startsWith(scratch, DOC)) {
-        offsets.add(in.getFilePointer());
+        offsets[upto] = in.getFilePointer();
+        upto++;
      }
    }
+    assert upto == offsets.length;
  }
  
  @Override
  public Fields get(int doc) throws IOException {
    // TestTV tests for this in testBadParams... but is this
    // really guaranteed by the API?
-    if (doc < 0 || doc >= offsets.size()) {
+    if (doc < 0 || doc >= offsets.length) {
      throw new IllegalArgumentException("doc id out of range");
    }

    SortedMap<String,SimpleTVTerms> fields = new TreeMap<String,SimpleTVTerms>();
-    in.seek(offsets.get(doc));
+    in.seek(offsets[doc]);
    readLine();
    assert StringHelper.startsWith(scratch, NUMFIELDS);
    int numFields = parseIntAt(NUMFIELDS.length);