From 8eaed3c8659e4e31eefa8a5e7bf46e62246443b8 Mon Sep 17 00:00:00 2001 From: Adrien Grand Date: Wed, 8 Aug 2012 14:15:22 +0000 Subject: [PATCH] LUCENE-3892: Ability to select the right format based on an `acceptableOverheadRatio`. The `acceptableOverheadRatio` is currently configurable through BlockPackedPostingsWriter's constructor and defaults to PackedInts.DEFAULT. git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/pforcodec_3892@1370781 13f79535-47bb-0310-9956-ffa450edef68 --- .../BlockPackedPostingsReader.java | 66 ++++---- .../BlockPackedPostingsWriter.java | 40 +++-- .../lucene/codecs/blockpacked/ForUtil.java | 143 ++++++++++++++---- .../lucene/util/packed/BulkOperation.java | 86 +++++------ .../lucene/util/packed/gen_BulkOperation.py | 6 +- .../codecs/blockpacked/TestForUtil.java | 94 ++++++++++++ .../lucene/util/packed/TestPackedInts.java | 8 +- 7 files changed, 323 insertions(+), 120 deletions(-) create mode 100644 lucene/core/src/test/org/apache/lucene/codecs/blockpacked/TestForUtil.java diff --git a/lucene/core/src/java/org/apache/lucene/codecs/blockpacked/BlockPackedPostingsReader.java b/lucene/core/src/java/org/apache/lucene/codecs/blockpacked/BlockPackedPostingsReader.java index 30cc056eb17..2abf33ce601 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/blockpacked/BlockPackedPostingsReader.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/blockpacked/BlockPackedPostingsReader.java @@ -18,6 +18,8 @@ package org.apache.lucene.codecs.blockpacked; */ import static org.apache.lucene.codecs.blockpacked.BlockPackedPostingsFormat.BLOCK_SIZE; +import static org.apache.lucene.codecs.blockpacked.ForUtil.MIN_DATA_SIZE; +import static org.apache.lucene.codecs.blockpacked.ForUtil.MIN_ENCODED_SIZE; import java.io.IOException; import java.util.Arrays; @@ -25,7 +27,6 @@ import java.util.Arrays; import org.apache.lucene.codecs.BlockTermState; import org.apache.lucene.codecs.CodecUtil; import org.apache.lucene.codecs.PostingsReaderBase; 
-import org.apache.lucene.codecs.blockpacked.BlockPackedSkipReader; import org.apache.lucene.index.DocsAndPositionsEnum; import org.apache.lucene.index.DocsEnum; import org.apache.lucene.index.FieldInfo; @@ -57,6 +58,8 @@ public final class BlockPackedPostingsReader extends PostingsReaderBase { private final IndexInput posIn; private final IndexInput payIn; + private final ForUtil forUtil; + public static boolean DEBUG = false; // nocommit @@ -75,6 +78,7 @@ public final class BlockPackedPostingsReader extends PostingsReaderBase { BlockPackedPostingsWriter.DOC_CODEC, BlockPackedPostingsWriter.VERSION_START, BlockPackedPostingsWriter.VERSION_START); + forUtil = new ForUtil(docIn); if (fieldInfos.hasProx()) { posIn = dir.openInput(IndexFileNames.segmentFileName(segmentInfo.name, segmentSuffix, BlockPackedPostingsFormat.POS_EXTENSION), @@ -294,8 +298,8 @@ public final class BlockPackedPostingsReader extends PostingsReaderBase { final class BlockDocsEnum extends DocsEnum { private final byte[] encoded; - private final long[] docDeltaBuffer = new long[BLOCK_SIZE]; - private final long[] freqBuffer = new long[BLOCK_SIZE]; + private final long[] docDeltaBuffer = new long[MIN_DATA_SIZE]; + private final long[] freqBuffer = new long[MIN_DATA_SIZE]; private int docBufferUpto; @@ -333,7 +337,7 @@ public final class BlockPackedPostingsReader extends PostingsReaderBase { indexHasPos = fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0; indexHasOffsets = fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0; indexHasPayloads = fieldInfo.hasPayloads(); - encoded = new byte[BLOCK_SIZE * 4]; + encoded = new byte[MIN_ENCODED_SIZE]; } public boolean canReuse(IndexInput docIn, FieldInfo fieldInfo) { @@ -383,13 +387,13 @@ public final class BlockPackedPostingsReader extends PostingsReaderBase { if (DEBUG) { System.out.println(" fill doc block from fp=" + docIn.getFilePointer()); } - ForUtil.readBlock(docIn, 
encoded, docDeltaBuffer); + forUtil.readBlock(docIn, encoded, docDeltaBuffer); if (indexHasFreq) { if (DEBUG) { System.out.println(" fill freq block from fp=" + docIn.getFilePointer()); } - ForUtil.readBlock(docIn, encoded, freqBuffer); + forUtil.readBlock(docIn, encoded, freqBuffer); } } else { // Read vInts: @@ -540,9 +544,9 @@ public final class BlockPackedPostingsReader extends PostingsReaderBase { private final byte[] encoded; - private final long[] docDeltaBuffer = new long[BLOCK_SIZE]; - private final long[] freqBuffer = new long[BLOCK_SIZE]; - private final long[] posDeltaBuffer = new long[BLOCK_SIZE]; + private final long[] docDeltaBuffer = new long[MIN_DATA_SIZE]; + private final long[] freqBuffer = new long[MIN_DATA_SIZE]; + private final long[] posDeltaBuffer = new long[MIN_DATA_SIZE]; private int docBufferUpto; private int posBufferUpto; @@ -599,7 +603,7 @@ public final class BlockPackedPostingsReader extends PostingsReaderBase { this.startDocIn = BlockPackedPostingsReader.this.docIn; this.docIn = (IndexInput) startDocIn.clone(); this.posIn = (IndexInput) BlockPackedPostingsReader.this.posIn.clone(); - encoded = new byte[BLOCK_SIZE*4]; + encoded = new byte[MIN_ENCODED_SIZE]; indexHasOffsets = fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0; indexHasPayloads = fieldInfo.hasPayloads(); } @@ -657,11 +661,11 @@ public final class BlockPackedPostingsReader extends PostingsReaderBase { if (DEBUG) { System.out.println(" fill doc block from fp=" + docIn.getFilePointer()); } - ForUtil.readBlock(docIn, encoded, docDeltaBuffer); + forUtil.readBlock(docIn, encoded, docDeltaBuffer); if (DEBUG) { System.out.println(" fill freq block from fp=" + docIn.getFilePointer()); } - ForUtil.readBlock(docIn, encoded, freqBuffer); + forUtil.readBlock(docIn, encoded, freqBuffer); } else { // Read vInts: if (DEBUG) { @@ -704,7 +708,7 @@ public final class BlockPackedPostingsReader extends PostingsReaderBase { if (DEBUG) { 
System.out.println(" bulk pos block @ fp=" + posIn.getFilePointer()); } - ForUtil.readBlock(posIn, encoded, posDeltaBuffer); + forUtil.readBlock(posIn, encoded, posDeltaBuffer); } } @@ -873,7 +877,7 @@ public final class BlockPackedPostingsReader extends PostingsReaderBase { System.out.println(" skip whole block @ fp=" + posIn.getFilePointer()); } assert posIn.getFilePointer() != lastPosBlockFP; - ForUtil.skipBlock(posIn); + forUtil.skipBlock(posIn); toSkip -= BLOCK_SIZE; } refillPositions(); @@ -945,9 +949,9 @@ public final class BlockPackedPostingsReader extends PostingsReaderBase { private final byte[] encoded; - private final long[] docDeltaBuffer = new long[BLOCK_SIZE]; - private final long[] freqBuffer = new long[BLOCK_SIZE]; - private final long[] posDeltaBuffer = new long[BLOCK_SIZE]; + private final long[] docDeltaBuffer = new long[MIN_DATA_SIZE]; + private final long[] freqBuffer = new long[MIN_DATA_SIZE]; + private final long[] posDeltaBuffer = new long[MIN_DATA_SIZE]; private final long[] payloadLengthBuffer; private final long[] offsetStartDeltaBuffer; @@ -1023,11 +1027,11 @@ public final class BlockPackedPostingsReader extends PostingsReaderBase { this.docIn = (IndexInput) startDocIn.clone(); this.posIn = (IndexInput) BlockPackedPostingsReader.this.posIn.clone(); this.payIn = (IndexInput) BlockPackedPostingsReader.this.payIn.clone(); - encoded = new byte[BLOCK_SIZE*4]; + encoded = new byte[MIN_ENCODED_SIZE]; indexHasOffsets = fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0; if (indexHasOffsets) { - offsetStartDeltaBuffer = new long[BLOCK_SIZE]; - offsetLengthBuffer = new long[BLOCK_SIZE]; + offsetStartDeltaBuffer = new long[MIN_DATA_SIZE]; + offsetLengthBuffer = new long[MIN_DATA_SIZE]; } else { offsetStartDeltaBuffer = null; offsetLengthBuffer = null; @@ -1037,7 +1041,7 @@ public final class BlockPackedPostingsReader extends PostingsReaderBase { indexHasPayloads = fieldInfo.hasPayloads(); if 
(indexHasPayloads) { - payloadLengthBuffer = new long[BLOCK_SIZE]; + payloadLengthBuffer = new long[MIN_DATA_SIZE]; payloadBytes = new byte[128]; payload = new BytesRef(); } else { @@ -1102,11 +1106,11 @@ public final class BlockPackedPostingsReader extends PostingsReaderBase { if (DEBUG) { System.out.println(" fill doc block from fp=" + docIn.getFilePointer()); } - ForUtil.readBlock(docIn, encoded, docDeltaBuffer); + forUtil.readBlock(docIn, encoded, docDeltaBuffer); if (DEBUG) { System.out.println(" fill freq block from fp=" + docIn.getFilePointer()); } - ForUtil.readBlock(docIn, encoded, freqBuffer); + forUtil.readBlock(docIn, encoded, freqBuffer); } else { if (DEBUG) { System.out.println(" fill last vInt doc block from fp=" + docIn.getFilePointer()); @@ -1166,13 +1170,13 @@ public final class BlockPackedPostingsReader extends PostingsReaderBase { if (DEBUG) { System.out.println(" bulk pos block @ fp=" + posIn.getFilePointer()); } - ForUtil.readBlock(posIn, encoded, posDeltaBuffer); + forUtil.readBlock(posIn, encoded, posDeltaBuffer); if (indexHasPayloads) { if (DEBUG) { System.out.println(" bulk payload block @ pay.fp=" + payIn.getFilePointer()); } - ForUtil.readBlock(payIn, encoded, payloadLengthBuffer); + forUtil.readBlock(payIn, encoded, payloadLengthBuffer); int numBytes = payIn.readVInt(); if (DEBUG) { System.out.println(" " + numBytes + " payload bytes @ pay.fp=" + payIn.getFilePointer()); @@ -1188,8 +1192,8 @@ public final class BlockPackedPostingsReader extends PostingsReaderBase { if (DEBUG) { System.out.println(" bulk offset block @ pay.fp=" + payIn.getFilePointer()); } - ForUtil.readBlock(payIn, encoded, offsetStartDeltaBuffer); - ForUtil.readBlock(payIn, encoded, offsetLengthBuffer); + forUtil.readBlock(payIn, encoded, offsetStartDeltaBuffer); + forUtil.readBlock(payIn, encoded, offsetLengthBuffer); } } } @@ -1355,11 +1359,11 @@ public final class BlockPackedPostingsReader extends PostingsReaderBase { System.out.println(" skip whole block @ fp=" + 
posIn.getFilePointer()); } assert posIn.getFilePointer() != lastPosBlockFP; - ForUtil.skipBlock(posIn); + forUtil.skipBlock(posIn); if (indexHasPayloads) { // Skip payloadLength block: - ForUtil.skipBlock(payIn); + forUtil.skipBlock(payIn); // Skip payloadBytes block: int numBytes = payIn.readVInt(); @@ -1369,8 +1373,8 @@ public final class BlockPackedPostingsReader extends PostingsReaderBase { if (indexHasOffsets) { // Must load offset blocks merely to sum // up into lastStartOffset: - ForUtil.readBlock(payIn, encoded, offsetStartDeltaBuffer); - ForUtil.readBlock(payIn, encoded, offsetLengthBuffer); + forUtil.readBlock(payIn, encoded, offsetStartDeltaBuffer); + forUtil.readBlock(payIn, encoded, offsetLengthBuffer); for(int i=0;i>> %d)" %(byte_start, bit_start)) for b in xrange(byte_start + 1, byte_end): - f.write(" | (byte%d << %d)" %(b, 8 * (b - byte_start))) + f.write(" | (byte%d << %d)" %(b, 8 * (b - byte_start) - bit_start)) if bit_end == 7: - f.write(" | (byte%d << %d)" %(byte_end, 8 * (byte_end - byte_start))) + f.write(" | (byte%d << %d)" %(byte_end, 8 * (byte_end - byte_start) - bit_start)) else: - f.write(" | ((byte%d & %d) << %d)" %(byte_end, 2 ** (bit_end + 1) - 1, 8 * (byte_end - byte_start) + bpv - bit_end - 1)) + f.write(" | ((byte%d & %d) << %d)" %(byte_end, 2 ** (bit_end + 1) - 1, 8 * (byte_end - byte_start) - bit_start)) f.write(";\n") f.write(" }\n") f.write(" }\n\n") diff --git a/lucene/core/src/test/org/apache/lucene/codecs/blockpacked/TestForUtil.java b/lucene/core/src/test/org/apache/lucene/codecs/blockpacked/TestForUtil.java new file mode 100644 index 00000000000..a4cde9fc458 --- /dev/null +++ b/lucene/core/src/test/org/apache/lucene/codecs/blockpacked/TestForUtil.java @@ -0,0 +1,94 @@ +package org.apache.lucene.codecs.blockpacked; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import static org.apache.lucene.codecs.blockpacked.BlockPackedPostingsFormat.BLOCK_SIZE; +import static org.apache.lucene.codecs.blockpacked.ForUtil.MIN_DATA_SIZE; +import static org.apache.lucene.codecs.blockpacked.ForUtil.MIN_ENCODED_SIZE; + +import java.io.IOException; +import java.util.Arrays; + +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.IOContext; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.store.RAMDirectory; +import org.apache.lucene.util.LuceneTestCase; +import org.apache.lucene.util.packed.PackedInts; + +import com.carrotsearch.randomizedtesting.generators.RandomInts; + +public class TestForUtil extends LuceneTestCase { + + public void testEncodeDecode() throws IOException { + final int iterations = RandomInts.randomIntBetween(random(), 1, 1000); + final float acceptableOverheadRatio = random().nextFloat(); + final long[] values = new long[iterations * BLOCK_SIZE + ForUtil.MIN_DATA_SIZE]; + for (int i = 0; i < iterations; ++i) { + final int bpv = random().nextInt(32); + if (bpv == 0) { + final int value = RandomInts.randomIntBetween(random(), 0, Integer.MAX_VALUE); + for (int j = 0; j < BLOCK_SIZE; ++j) { + values[i * BLOCK_SIZE + j] = value; + } + } else { + for 
(int j = 0; j < BLOCK_SIZE; ++j) { + values[i * BLOCK_SIZE + j] = RandomInts.randomIntBetween(random(), + 0, (int) PackedInts.maxValue(bpv)); + } + } + } + + final Directory d = new RAMDirectory(); + final long endPointer; + + { + // encode + IndexOutput out = d.createOutput("test.bin", IOContext.DEFAULT); + final ForUtil forUtil = new ForUtil(acceptableOverheadRatio, out); + + for (int i = 0; i < iterations; ++i) { + forUtil.writeBlock( + Arrays.copyOfRange(values, iterations * BLOCK_SIZE, values.length), + new byte[MIN_ENCODED_SIZE], out); + } + endPointer = out.getFilePointer(); + out.close(); + } + + { + // decode + IndexInput in = d.openInput("test.bin", IOContext.READONCE); + final ForUtil forUtil = new ForUtil(in); + for (int i = 0; i < iterations; ++i) { + if (random().nextBoolean()) { + forUtil.skipBlock(in); + continue; + } + final long[] restored = new long[MIN_DATA_SIZE]; + forUtil.readBlock(in, new byte[MIN_ENCODED_SIZE], restored); + assertArrayEquals(Arrays.copyOfRange(values, iterations * BLOCK_SIZE, (iterations + 1) * BLOCK_SIZE), + Arrays.copyOf(restored, BLOCK_SIZE)); + } + assertEquals(endPointer, in.getFilePointer()); + in.close(); + } + } + +} diff --git a/lucene/core/src/test/org/apache/lucene/util/packed/TestPackedInts.java b/lucene/core/src/test/org/apache/lucene/util/packed/TestPackedInts.java index ee1d24667f3..78cbcc15e4c 100644 --- a/lucene/core/src/test/org/apache/lucene/util/packed/TestPackedInts.java +++ b/lucene/core/src/test/org/apache/lucene/util/packed/TestPackedInts.java @@ -656,7 +656,7 @@ public class TestPackedInts extends LuceneTestCase { blocks[i] = random().nextLong(); if (format == PackedInts.Format.PACKED_SINGLE_BLOCK && 64 % bpv != 0) { // clear highest bits for packed - final int toClear = 64 - 64 % bpv; + final int toClear = 64 % bpv; blocks[i] = (blocks[i] << toClear) >>> toClear; } } @@ -664,6 +664,9 @@ public class TestPackedInts extends LuceneTestCase { // 2. 
decode final long[] values = new long[valuesOffset + iterations * valueCount]; decoder.decode(blocks, blocksOffset, values, valuesOffset, iterations); + for (long value : values) { + assertTrue(value <= PackedInts.maxValue(bpv)); + } // 3. re-encode final long[] blocks2 = new long[blocksOffset2 + blocksLen]; @@ -676,6 +679,9 @@ public class TestPackedInts extends LuceneTestCase { ByteBuffer.wrap(byteBlocks).asLongBuffer().put(blocks); final long[] values2 = new long[valuesOffset + iterations * valueCount]; decoder.decode(byteBlocks, blocksOffset * 8, values2, valuesOffset, iterations); + for (long value : values2) { + assertTrue(msg, value <= PackedInts.maxValue(bpv)); + } assertArrayEquals(msg, values, values2); // 5. byte[] encoding