better compressed input offset data structure

Shay Banon 2012-06-20 16:01:09 +02:00
parent b009c9c652
commit 9e6cfa77a5
3 changed files with 85 additions and 8 deletions

View File

@@ -20,6 +20,7 @@
 package org.elasticsearch.common.compress;
 
 import org.apache.lucene.store.IndexInput;
+import org.elasticsearch.common.util.BigLongArray;
 
 import java.io.EOFException;
 import java.io.IOException;
@@ -32,7 +33,7 @@ public abstract class CompressedIndexInput extends IndexInput {
     private int version;
     private long uncompressedLength;
 
-    private long[] offsets;
+    private BigLongArray offsets;
 
     private boolean closed;
@@ -55,9 +56,9 @@ public abstract class CompressedIndexInput extends IndexInput {
         in.seek(metaDataPosition);
         this.uncompressedLength = in.readVLong();
         int size = in.readVInt();
-        offsets = new long[size];
-        for (int i = 0; i < offsets.length; i++) {
-            offsets[i] = in.readVLong();
+        offsets = new BigLongArray(size);
+        for (int i = 0; i < size; i++) {
+            offsets.set(i, in.readVLong());
         }
         this.currentOffsetIdx = -1;
         this.currentOffset = 0;
@@ -137,7 +138,7 @@ public abstract class CompressedIndexInput extends IndexInput {
     @Override
     public void seek(long pos) throws IOException {
         int idx = (int) (pos / uncompressed.length);
-        if (idx >= offsets.length) {
+        if (idx >= offsets.size) {
            // set the next "readyBuffer" to EOF
            currentOffsetIdx = idx;
            position = 0;
@@ -146,7 +147,7 @@ public abstract class CompressedIndexInput extends IndexInput {
         }
         // TODO: optimize so we won't have to readyBuffer on seek, can keep the position around, and set it on readyBuffer in this case
-        long pointer = offsets[idx];
+        long pointer = offsets.get(idx);
         if (pointer != currentOffset) {
             in.seek(pointer);
             position = 0;
@@ -182,7 +183,7 @@ public abstract class CompressedIndexInput extends IndexInput {
             return false;
         }
         // we reached the end...
-        if (currentOffsetIdx + 1 >= offsets.length) {
+        if (currentOffsetIdx + 1 >= offsets.size) {
             return false;
         }
         valid = uncompress(in, uncompressed);
@@ -190,7 +191,7 @@ public abstract class CompressedIndexInput extends IndexInput {
             return false;
         }
         currentOffsetIdx++;
-        currentOffset = offsets[currentOffsetIdx];
+        currentOffset = offsets.get(currentOffsetIdx);
         currentOffsetFilePointer = currentOffset - headerLength;
         position = 0;
         return (position < valid);
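
For orientation, the seek path above is unchanged apart from how the offsets table is indexed: each entry still records the compressed file pointer at which one fixed-size uncompressed block starts, so an uncompressed position maps to a block by integer division. A minimal sketch of that arithmetic, using made-up numbers and a blockSize variable standing in for uncompressed.length (not part of the commit):

import org.elasticsearch.common.util.BigLongArray;

// Sketch only: the block lookup that seek(pos) performs above, with made-up values.
public class SeekArithmeticSketch {
    public static void main(String[] args) {
        int blockSize = 8192;                 // stands in for uncompressed.length
        BigLongArray offsets = new BigLongArray(3);
        offsets.set(0, 20L);                  // compressed file pointer of block 0
        offsets.set(1, 4150L);                // block 1
        offsets.set(2, 8420L);                // block 2

        long pos = 17000;                     // uncompressed position to seek to
        int idx = (int) (pos / blockSize);    // -> block 2
        long pointer = offsets.get(idx);      // -> 8420, where block 2 starts in the compressed file
        long inBlock = pos % blockSize;       // -> 616, position inside the decompressed block

        System.out.println("block " + idx + " @ " + pointer + " + " + inBlock);
    }
}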

View File

@@ -40,6 +40,7 @@ public abstract class CompressedIndexOutput extends IndexOutput {
     private boolean closed;
 
     private final long metaDataPointer;
+    // need to have a growing segment long array list here...
     private TLongArrayList offsets = new TLongArrayList();
 
     public CompressedIndexOutput(IndexOutput out) throws IOException {
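
The comment added above flags follow-up work: on the write side the offsets are still collected in a Trove TLongArrayList, which keeps one contiguous long[] and reallocates it as it grows. A rough sketch of the kind of growing, page-based list the comment asks for (hypothetical GrowingLongPages class, not part of this commit):

package org.elasticsearch.common.util;

import java.util.ArrayList;
import java.util.List;

// Hypothetical sketch of a "growing segment long array list": appends land in
// fixed-size pages, so no large contiguous array is ever allocated or copied.
public class GrowingLongPages {

    private static final int PAGE_SIZE = 4096;

    private final List<long[]> pages = new ArrayList<long[]>();
    private int size;

    public void add(long value) {
        int page = size / PAGE_SIZE;
        if (page == pages.size()) {
            pages.add(new long[PAGE_SIZE]); // grow by one fixed-size page
        }
        pages.get(page)[size % PAGE_SIZE] = value;
        size++;
    }

    public long get(int idx) {
        if (idx < 0 || idx >= size)
            throw new IndexOutOfBoundsException(idx + " is not within [0, " + size + ")");
        return pages.get(idx / PAGE_SIZE)[idx % PAGE_SIZE];
    }

    public int size() {
        return size;
    }
}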

View File

@@ -0,0 +1,75 @@
/*
* Licensed to ElasticSearch and Shay Banon under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. ElasticSearch licenses this
* file to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.common.util;

/**
 * A GC friendly long[].
 * Allocating large arrays (that are not short-lived) generates fragmentation
 * in the old-gen space. This class breaks such a large long array into
 * fixed-size pages to avoid that problem.
 */
public class BigLongArray {

    private static final int DEFAULT_PAGE_SIZE = 4096;

    private final long[][] pages;
    public final int size;

    private final int pageSize;
    private final int pageCount;

    public BigLongArray(int size) {
        this(size, DEFAULT_PAGE_SIZE);
    }

    public BigLongArray(int size, int pageSize) {
        this.size = size;
        this.pageSize = pageSize;

        // the last page may be partial if size is not a multiple of pageSize
        int lastPageSize = size % pageSize;
        int fullPageCount = size / pageSize;
        pageCount = fullPageCount + (lastPageSize == 0 ? 0 : 1);
        pages = new long[pageCount][];

        for (int i = 0; i < fullPageCount; ++i)
            pages[i] = new long[pageSize];

        if (lastPageSize != 0)
            pages[pages.length - 1] = new long[lastPageSize];
    }

    public void set(int idx, long value) {
        if (idx < 0 || idx >= size)
            throw new IndexOutOfBoundsException(String.format("%d is not within [0, %d)", idx, size));

        int page = idx / pageSize;
        int pageIdx = idx % pageSize;
        pages[page][pageIdx] = value;
    }

    public long get(int idx) {
        if (idx < 0 || idx >= size)
            throw new IndexOutOfBoundsException(String.format("%d is not within [0, %d)", idx, size));

        int page = idx / pageSize;
        int pageIdx = idx % pageSize;
        return pages[page][pageIdx];
    }
}
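
A quick usage illustration, not part of the commit, showing how the paged table is filled and read the way CompressedIndexInput now handles its offsets (all values made up):

import org.elasticsearch.common.util.BigLongArray;

// Illustration only: fill and read a BigLongArray of block offsets.
public class BigLongArrayExample {
    public static void main(String[] args) {
        // 100000 entries -> about 25 pages of 4096 longs instead of one ~800KB array
        BigLongArray offsets = new BigLongArray(100000);

        for (int i = 0; i < offsets.size; i++) {
            offsets.set(i, i * 8192L); // pretend each compressed block starts 8KB apart
        }

        // constant-time lookup: idx 42000 -> page 10, slot 1040
        System.out.println(offsets.get(42000));
    }
}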