From 9e6cfa77a5ea70307fe019da461287b03f0b1994 Mon Sep 17 00:00:00 2001 From: Shay Banon Date: Wed, 20 Jun 2012 16:01:09 +0200 Subject: [PATCH] better compressed input offset data structure --- .../common/compress/CompressedIndexInput.java | 17 +++-- .../compress/CompressedIndexOutput.java | 1 + .../common/util/BigLongArray.java | 75 +++++++++++++++++++ 3 files changed, 85 insertions(+), 8 deletions(-) create mode 100644 src/main/java/org/elasticsearch/common/util/BigLongArray.java diff --git a/src/main/java/org/elasticsearch/common/compress/CompressedIndexInput.java b/src/main/java/org/elasticsearch/common/compress/CompressedIndexInput.java index 9fe1505ed96..726dca7a10a 100644 --- a/src/main/java/org/elasticsearch/common/compress/CompressedIndexInput.java +++ b/src/main/java/org/elasticsearch/common/compress/CompressedIndexInput.java @@ -20,6 +20,7 @@ package org.elasticsearch.common.compress; import org.apache.lucene.store.IndexInput; +import org.elasticsearch.common.util.BigLongArray; import java.io.EOFException; import java.io.IOException; @@ -32,7 +33,7 @@ public abstract class CompressedIndexInput extends IndexInput { private int version; private long uncompressedLength; - private long[] offsets; + private BigLongArray offsets; private boolean closed; @@ -55,9 +56,9 @@ public abstract class CompressedIndexInput extends IndexInput { in.seek(metaDataPosition); this.uncompressedLength = in.readVLong(); int size = in.readVInt(); - offsets = new long[size]; - for (int i = 0; i < offsets.length; i++) { - offsets[i] = in.readVLong(); + offsets = new BigLongArray(size); + for (int i = 0; i < size; i++) { + offsets.set(i, in.readVLong()); } this.currentOffsetIdx = -1; this.currentOffset = 0; @@ -137,7 +138,7 @@ public abstract class CompressedIndexInput extends IndexInput { @Override public void seek(long pos) throws IOException { int idx = (int) (pos / uncompressed.length); - if (idx >= offsets.length) { + if (idx >= offsets.size) { // set the next "readyBuffer" to EOF currentOffsetIdx = idx; position = 0; @@ -146,7 +147,7 @@ public abstract class CompressedIndexInput extends IndexInput { } // TODO: optimize so we won't have to readyBuffer on seek, can keep the position around, and set it on readyBuffer in this case - long pointer = offsets[idx]; + long pointer = offsets.get(idx); if (pointer != currentOffset) { in.seek(pointer); position = 0; @@ -182,7 +183,7 @@ public abstract class CompressedIndexInput extends IndexInput { return false; } // we reached the end... - if (currentOffsetIdx + 1 >= offsets.length) { + if (currentOffsetIdx + 1 >= offsets.size) { return false; } valid = uncompress(in, uncompressed); @@ -190,7 +191,7 @@ public abstract class CompressedIndexInput extends IndexInput { return false; } currentOffsetIdx++; - currentOffset = offsets[currentOffsetIdx]; + currentOffset = offsets.get(currentOffsetIdx); currentOffsetFilePointer = currentOffset - headerLength; position = 0; return (position < valid); diff --git a/src/main/java/org/elasticsearch/common/compress/CompressedIndexOutput.java b/src/main/java/org/elasticsearch/common/compress/CompressedIndexOutput.java index 01e47d0854a..5ae4c83d2b4 100644 --- a/src/main/java/org/elasticsearch/common/compress/CompressedIndexOutput.java +++ b/src/main/java/org/elasticsearch/common/compress/CompressedIndexOutput.java @@ -40,6 +40,7 @@ public abstract class CompressedIndexOutput extends IndexOutput { private boolean closed; private final long metaDataPointer; + // need to have a growing segment long array list here... private TLongArrayList offsets = new TLongArrayList(); public CompressedIndexOutput(IndexOutput out) throws IOException { diff --git a/src/main/java/org/elasticsearch/common/util/BigLongArray.java b/src/main/java/org/elasticsearch/common/util/BigLongArray.java new file mode 100644 index 00000000000..b4d1a8f42cf --- /dev/null +++ b/src/main/java/org/elasticsearch/common/util/BigLongArray.java @@ -0,0 +1,75 @@ +/* + * Licensed to ElasticSearch and Shay Banon under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. ElasticSearch licenses this + * file to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.elasticsearch.common.util; + +/** + * A GC friendly long[]. + * Allocating large arrays (that are not short-lived) generate fragmentation + * in old-gen space. This breaks such large long array into fixed size pages + * to avoid that problem. + */ +public class BigLongArray { + + private static final int DEFAULT_PAGE_SIZE = 4096; + + private final long[][] pages; + public final int size; + + private final int pageSize; + private final int pageCount; + + public BigLongArray(int size) { + this(size, DEFAULT_PAGE_SIZE); + } + + public BigLongArray(int size, int pageSize) { + this.size = size; + this.pageSize = pageSize; + + int lastPageSize = size % pageSize; + int fullPageCount = size / pageSize; + pageCount = fullPageCount + (lastPageSize == 0 ? 0 : 1); + pages = new long[pageCount][]; + + for (int i = 0; i < fullPageCount; ++i) + pages[i] = new long[pageSize]; + + if (lastPageSize != 0) + pages[pages.length - 1] = new long[lastPageSize]; + } + + public void set(int idx, long value) { + if (idx < 0 || idx > size) + throw new IndexOutOfBoundsException(String.format("%d is not whithin [0, %d)", idx, size)); + + int page = idx / pageSize; + int pageIdx = idx % pageSize; + pages[page][pageIdx] = value; + } + + public long get(int idx) { + if (idx < 0 || idx > size) + throw new IndexOutOfBoundsException(String.format("%d is not whithin [0, %d)", idx, size)); + + int page = idx / pageSize; + int pageIdx = idx % pageSize; + return pages[page][pageIdx]; + } +}