better compressed input offset data structure

Shay Banon 2012-06-20 16:01:09 +02:00
parent b009c9c652
commit 9e6cfa77a5
3 changed files with 85 additions and 8 deletions

View File

@@ -20,6 +20,7 @@
 package org.elasticsearch.common.compress;
 
 import org.apache.lucene.store.IndexInput;
+import org.elasticsearch.common.util.BigLongArray;
 
 import java.io.EOFException;
 import java.io.IOException;
@@ -32,7 +33,7 @@ public abstract class CompressedIndexInput extends IndexInput {
     private int version;
     private long uncompressedLength;
 
-    private long[] offsets;
+    private BigLongArray offsets;
 
     private boolean closed;
@@ -55,9 +56,9 @@ public abstract class CompressedIndexInput extends IndexInput {
         in.seek(metaDataPosition);
         this.uncompressedLength = in.readVLong();
         int size = in.readVInt();
-        offsets = new long[size];
-        for (int i = 0; i < offsets.length; i++) {
-            offsets[i] = in.readVLong();
+        offsets = new BigLongArray(size);
+        for (int i = 0; i < size; i++) {
+            offsets.set(i, in.readVLong());
         }
         this.currentOffsetIdx = -1;
         this.currentOffset = 0;
@@ -137,7 +138,7 @@ public abstract class CompressedIndexInput extends IndexInput {
     @Override
     public void seek(long pos) throws IOException {
         int idx = (int) (pos / uncompressed.length);
-        if (idx >= offsets.length) {
+        if (idx >= offsets.size) {
            // set the next "readyBuffer" to EOF
            currentOffsetIdx = idx;
            position = 0;
@@ -146,7 +147,7 @@ public abstract class CompressedIndexInput extends IndexInput {
         }
         // TODO: optimize so we won't have to readyBuffer on seek, can keep the position around, and set it on readyBuffer in this case
-        long pointer = offsets[idx];
+        long pointer = offsets.get(idx);
         if (pointer != currentOffset) {
             in.seek(pointer);
             position = 0;
@@ -182,7 +183,7 @@ public abstract class CompressedIndexInput extends IndexInput {
             return false;
         }
         // we reached the end...
-        if (currentOffsetIdx + 1 >= offsets.length) {
+        if (currentOffsetIdx + 1 >= offsets.size) {
             return false;
         }
         valid = uncompress(in, uncompressed);
@@ -190,7 +191,7 @@ public abstract class CompressedIndexInput extends IndexInput {
             return false;
         }
         currentOffsetIdx++;
-        currentOffset = offsets[currentOffsetIdx];
+        currentOffset = offsets.get(currentOffsetIdx);
         currentOffsetFilePointer = currentOffset - headerLength;
         position = 0;
         return (position < valid);
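
For orientation, the seek path above is unchanged apart from how the offsets table is indexed: each entry still records the compressed file pointer at which one fixed-size uncompressed block starts, so an uncompressed position maps to a block by integer division. A minimal sketch of that arithmetic, using made-up numbers and a blockSize variable standing in for uncompressed.length (not part of the commit):

import org.elasticsearch.common.util.BigLongArray;

// Sketch only: the block lookup that seek(pos) performs above, with made-up values.
public class SeekArithmeticSketch {
    public static void main(String[] args) {
        int blockSize = 8192;                 // stands in for uncompressed.length
        BigLongArray offsets = new BigLongArray(3);
        offsets.set(0, 20L);                  // compressed file pointer of block 0
        offsets.set(1, 4150L);                // block 1
        offsets.set(2, 8420L);                // block 2

        long pos = 17000;                     // uncompressed position to seek to
        int idx = (int) (pos / blockSize);    // -> block 2
        long pointer = offsets.get(idx);      // -> 8420, where block 2 starts in the compressed file
        long inBlock = pos % blockSize;       // -> 616, position inside the decompressed block

        System.out.println("block " + idx + " @ " + pointer + " + " + inBlock);
    }
}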

View File

@@ -40,6 +40,7 @@ public abstract class CompressedIndexOutput extends IndexOutput {
     private boolean closed;
 
     private final long metaDataPointer;
+    // need to have a growing segment long array list here...
     private TLongArrayList offsets = new TLongArrayList();
 
     public CompressedIndexOutput(IndexOutput out) throws IOException {
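
The comment added above flags follow-up work: on the write side the offsets are still collected in a Trove TLongArrayList, which keeps one contiguous long[] and reallocates it as it grows. A rough sketch of the kind of growing, page-based list the comment asks for (hypothetical GrowingLongPages class, not part of this commit):

package org.elasticsearch.common.util;

import java.util.ArrayList;
import java.util.List;

// Hypothetical sketch of a "growing segment long array list": appends land in
// fixed-size pages, so no large contiguous array is ever allocated or copied.
public class GrowingLongPages {

    private static final int PAGE_SIZE = 4096;

    private final List<long[]> pages = new ArrayList<long[]>();
    private int size;

    public void add(long value) {
        int page = size / PAGE_SIZE;
        if (page == pages.size()) {
            pages.add(new long[PAGE_SIZE]); // grow by one fixed-size page
        }
        pages.get(page)[size % PAGE_SIZE] = value;
        size++;
    }

    public long get(int idx) {
        if (idx < 0 || idx >= size)
            throw new IndexOutOfBoundsException(idx + " is not within [0, " + size + ")");
        return pages.get(idx / PAGE_SIZE)[idx % PAGE_SIZE];
    }

    public int size() {
        return size;
    }
}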

View File

@@ -0,0 +1,75 @@
/*
* Licensed to ElasticSearch and Shay Banon under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. ElasticSearch licenses this
* file to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.common.util;

/**
 * A GC friendly long[].
 * Allocating large arrays (that are not short-lived) generates fragmentation
 * in the old-gen space. This class breaks such a large long array into
 * fixed-size pages to avoid that problem.
 */
public class BigLongArray {

    private static final int DEFAULT_PAGE_SIZE = 4096;

    private final long[][] pages;
    public final int size;

    private final int pageSize;
    private final int pageCount;

    public BigLongArray(int size) {
        this(size, DEFAULT_PAGE_SIZE);
    }

    public BigLongArray(int size, int pageSize) {
        this.size = size;
        this.pageSize = pageSize;

        // the last page may be partial if size is not a multiple of pageSize
        int lastPageSize = size % pageSize;
        int fullPageCount = size / pageSize;
        pageCount = fullPageCount + (lastPageSize == 0 ? 0 : 1);
        pages = new long[pageCount][];

        for (int i = 0; i < fullPageCount; ++i)
            pages[i] = new long[pageSize];

        if (lastPageSize != 0)
            pages[pages.length - 1] = new long[lastPageSize];
    }

    public void set(int idx, long value) {
        if (idx < 0 || idx >= size)
            throw new IndexOutOfBoundsException(String.format("%d is not within [0, %d)", idx, size));

        int page = idx / pageSize;
        int pageIdx = idx % pageSize;
        pages[page][pageIdx] = value;
    }

    public long get(int idx) {
        if (idx < 0 || idx >= size)
            throw new IndexOutOfBoundsException(String.format("%d is not within [0, %d)", idx, size));

        int page = idx / pageSize;
        int pageIdx = idx % pageSize;
        return pages[page][pageIdx];
    }
}
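
A quick usage illustration, not part of the commit, showing how the paged table is filled and read the way CompressedIndexInput now handles its offsets (all values made up):

import org.elasticsearch.common.util.BigLongArray;

// Illustration only: fill and read a BigLongArray of block offsets.
public class BigLongArrayExample {
    public static void main(String[] args) {
        // 100000 entries -> about 25 pages of 4096 longs instead of one ~800KB array
        BigLongArray offsets = new BigLongArray(100000);

        for (int i = 0; i < offsets.size; i++) {
            offsets.set(i, i * 8192L); // pretend each compressed block starts 8KB apart
        }

        // constant-time lookup: idx 42000 -> page 10, slot 1040
        System.out.println(offsets.get(42000));
    }
}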