better compressed input offset data structure

Shay Banon 2012-06-20 16:01:09 +02:00
parent b009c9c652
commit 9e6cfa77a5
3 changed files with 85 additions and 8 deletions

src/main/java/org/elasticsearch/common/compress/CompressedIndexInput.java

@@ -20,6 +20,7 @@
package org.elasticsearch.common.compress;

import org.apache.lucene.store.IndexInput;
+import org.elasticsearch.common.util.BigLongArray;

import java.io.EOFException;
import java.io.IOException;
@@ -32,7 +33,7 @@ public abstract class CompressedIndexInput extends IndexInput {
    private int version;
    private long uncompressedLength;

-    private long[] offsets;
+    private BigLongArray offsets;

    private boolean closed;
@@ -55,9 +56,9 @@ public abstract class CompressedIndexInput extends IndexInput {
        in.seek(metaDataPosition);
        this.uncompressedLength = in.readVLong();
        int size = in.readVInt();
-        offsets = new long[size];
-        for (int i = 0; i < offsets.length; i++) {
-            offsets[i] = in.readVLong();
+        offsets = new BigLongArray(size);
+        for (int i = 0; i < size; i++) {
+            offsets.set(i, in.readVLong());
        }
        this.currentOffsetIdx = -1;
        this.currentOffset = 0;
@@ -137,7 +138,7 @@ public abstract class CompressedIndexInput extends IndexInput {
    @Override
    public void seek(long pos) throws IOException {
        int idx = (int) (pos / uncompressed.length);
-        if (idx >= offsets.length) {
+        if (idx >= offsets.size) {
            // set the next "readyBuffer" to EOF
            currentOffsetIdx = idx;
            position = 0;
@@ -146,7 +147,7 @@ public abstract class CompressedIndexInput extends IndexInput {
        }

        // TODO: optimize so we won't have to readyBuffer on seek, can keep the position around, and set it on readyBuffer in this case
-        long pointer = offsets[idx];
+        long pointer = offsets.get(idx);
        if (pointer != currentOffset) {
            in.seek(pointer);
            position = 0;
@@ -182,7 +183,7 @@ public abstract class CompressedIndexInput extends IndexInput {
            return false;
        }
        // we reached the end...
-        if (currentOffsetIdx + 1 >= offsets.length) {
+        if (currentOffsetIdx + 1 >= offsets.size) {
            return false;
        }
        valid = uncompress(in, uncompressed);
@@ -190,7 +191,7 @@ public abstract class CompressedIndexInput extends IndexInput {
            return false;
        }
        currentOffsetIdx++;
-        currentOffset = offsets[currentOffsetIdx];
+        currentOffset = offsets.get(currentOffsetIdx);
        currentOffsetFilePointer = currentOffset - headerLength;
        position = 0;
        return (position < valid);
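
For context, the lookup pattern these hunks switch over: seek() divides the absolute uncompressed position by the fixed uncompressed block size to get a block index, then fetches that block's compressed-file pointer from the offsets structure. A minimal sketch of that math, assuming a fixed block size (the blockPointer helper and blockSize parameter are illustrative, not part of the commit):

    // Hedged sketch of the seek() math above; blockSize stands in for
    // uncompressed.length, and offsets is the new BigLongArray.
    static long blockPointer(BigLongArray offsets, int blockSize, long pos) {
        int idx = (int) (pos / blockSize);   // index of the block containing pos
        return offsets.get(idx);             // compressed-file pointer of that block
    }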

src/main/java/org/elasticsearch/common/compress/CompressedIndexOutput.java

@@ -40,6 +40,7 @@ public abstract class CompressedIndexOutput extends IndexOutput {
    private boolean closed;

    private final long metaDataPointer;
+    // need to have a growing segment long array list here...
    private TLongArrayList offsets = new TLongArrayList();

    public CompressedIndexOutput(IndexOutput out) throws IOException {
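
The write side cannot size a BigLongArray up front because the number of compressed blocks is unknown until the stream is closed, hence the growing Trove TLongArrayList. A hedged sketch of how such offsets would be persisted so the read side above can load them (mirroring the readVInt/readVLong loop in CompressedIndexInput; out is a Lucene IndexOutput and the pointer values are made up):

    // Sketch: one file pointer per flushed block, then count + VLong-encoded entries.
    TLongArrayList offsets = new TLongArrayList();
    offsets.add(0L);        // block 0 starts at the beginning of the compressed data
    offsets.add(16384L);    // block 1 (made-up pointer)
    out.writeVInt(offsets.size());
    for (int i = 0; i < offsets.size(); i++) {
        out.writeVLong(offsets.get(i));
    }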

src/main/java/org/elasticsearch/common/util/BigLongArray.java

@@ -0,0 +1,75 @@
/*
* Licensed to ElasticSearch and Shay Banon under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. ElasticSearch licenses this
* file to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.common.util;
/**
 * A GC-friendly long[].
 * Allocating large arrays (that are not short-lived) generates fragmentation
 * in the old-gen space. This class breaks such a large long array into
 * fixed-size pages to avoid that problem.
 */
public class BigLongArray {

    private static final int DEFAULT_PAGE_SIZE = 4096;

    private final long[][] pages;
    public final int size;

    private final int pageSize;
    private final int pageCount;

    public BigLongArray(int size) {
        this(size, DEFAULT_PAGE_SIZE);
    }

    public BigLongArray(int size, int pageSize) {
        this.size = size;
        this.pageSize = pageSize;

        int lastPageSize = size % pageSize;
        int fullPageCount = size / pageSize;
        pageCount = fullPageCount + (lastPageSize == 0 ? 0 : 1);
        pages = new long[pageCount][];

        for (int i = 0; i < fullPageCount; ++i)
            pages[i] = new long[pageSize];

        if (lastPageSize != 0)
            pages[pages.length - 1] = new long[lastPageSize];
    }

    public void set(int idx, long value) {
        if (idx < 0 || idx >= size)
            throw new IndexOutOfBoundsException(String.format("%d is not within [0, %d)", idx, size));

        int page = idx / pageSize;
        int pageIdx = idx % pageSize;
        pages[page][pageIdx] = value;
    }

    public long get(int idx) {
        if (idx < 0 || idx >= size)
            throw new IndexOutOfBoundsException(String.format("%d is not within [0, %d)", idx, size));

        int page = idx / pageSize;
        int pageIdx = idx % pageSize;
        return pages[page][pageIdx];
    }
}
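
A minimal usage sketch (not part of the commit): BigLongArray behaves like a fixed-size long[], but its storage is split across 4096-entry pages, so no single backing allocation grows with the logical size:

    BigLongArray offsets = new BigLongArray(10000);   // 3 pages: 4096 + 4096 + 1808
    for (int i = 0; i < offsets.size; i++) {
        offsets.set(i, i * 512L);                     // e.g. one offset per compressed block
    }
    long pointer = offsets.get(9999);                 // reads back 9999 * 512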