mirror of https://github.com/apache/lucene.git
LUCENE-4583: IndexWriter no longer places a limit on length of DV binary fields (individual codecs still have their limits, including the default codec)
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1514669 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
fd9ae25cd1
commit
f7d2ac0b0d
|
@ -122,6 +122,11 @@ Bug Fixes
|
|||
boundary, made it into the top-N and went to the formatter.
|
||||
(Manuel Amoabeng, Michael McCandless, Robert Muir)
|
||||
|
||||
* LUCENE-4583: Indexing core no longer enforces a limit on maximum
|
||||
length binary doc values fields, but individual codecs (including
|
||||
the default one) have their own limits (David Smiley, Robert Muir,
|
||||
Mike McCandless)
|
||||
|
||||
API Changes
|
||||
|
||||
* LUCENE-5094: Add ramBytesUsed() to MultiDocValues.OrdinalMap.
|
||||
|
|
|
@ -118,6 +118,11 @@ import org.apache.lucene.util.packed.PackedInts;
|
|||
* {@code BYTES_VAR_DEREF BYTES_VAR_DEREF} it doesn't apply deduplication of the document values.
|
||||
* </li>
|
||||
* </ul>
|
||||
* <p>
|
||||
* Limitations:
|
||||
* <ul>
|
||||
* <li> Binary doc values can be at most {@link #MAX_BINARY_FIELD_LENGTH} in length.
|
||||
* </ul>
|
||||
* @deprecated Only for reading old 4.0 and 4.1 segments
|
||||
*/
|
||||
@Deprecated
|
||||
|
@ -125,6 +130,9 @@ import org.apache.lucene.util.packed.PackedInts;
|
|||
// for back compat only!
|
||||
public class Lucene40DocValuesFormat extends DocValuesFormat {
|
||||
|
||||
/** Maximum length for each binary doc values field. */
|
||||
public static final int MAX_BINARY_FIELD_LENGTH = (1 << 15) - 2;
|
||||
|
||||
/** Sole constructor. */
|
||||
public Lucene40DocValuesFormat() {
|
||||
super("Lucene40");
|
||||
|
|
|
@ -36,14 +36,14 @@ import org.apache.lucene.util.IOUtils;
|
|||
import org.apache.lucene.util.IntsRef;
|
||||
import org.apache.lucene.util.MathUtil;
|
||||
import org.apache.lucene.util.fst.Builder;
|
||||
import org.apache.lucene.util.fst.FST;
|
||||
import org.apache.lucene.util.fst.FST.INPUT_TYPE;
|
||||
import org.apache.lucene.util.fst.FST;
|
||||
import org.apache.lucene.util.fst.PositiveIntOutputs;
|
||||
import org.apache.lucene.util.fst.Util;
|
||||
import org.apache.lucene.util.packed.BlockPackedWriter;
|
||||
import org.apache.lucene.util.packed.MonotonicBlockPackedWriter;
|
||||
import org.apache.lucene.util.packed.PackedInts;
|
||||
import org.apache.lucene.util.packed.PackedInts.FormatAndBits;
|
||||
import org.apache.lucene.util.packed.PackedInts;
|
||||
|
||||
/**
|
||||
* Writer for {@link Lucene42DocValuesFormat}
|
||||
|
@ -216,6 +216,9 @@ class Lucene42DocValuesConsumer extends DocValuesConsumer {
|
|||
int maxLength = Integer.MIN_VALUE;
|
||||
final long startFP = data.getFilePointer();
|
||||
for(BytesRef v : values) {
|
||||
if (v.length > Lucene42DocValuesFormat.MAX_BINARY_FIELD_LENGTH) {
|
||||
throw new IllegalArgumentException("DocValuesField \"" + field.name + "\" is too large, must be <= " + Lucene42DocValuesFormat.MAX_BINARY_FIELD_LENGTH);
|
||||
}
|
||||
minLength = Math.min(minLength, v.length);
|
||||
maxLength = Math.max(maxLength, v.length);
|
||||
data.writeBytes(v.bytes, v.offset, v.length);
|
||||
|
|
|
@ -118,8 +118,17 @@ import org.apache.lucene.util.packed.BlockPackedWriter;
|
|||
* <p>SortedSet entries store the list of ordinals in their BinaryData as a
|
||||
* sequences of increasing {@link DataOutput#writeVLong vLong}s, delta-encoded.</p>
|
||||
* </ol>
|
||||
* <p>
|
||||
* Limitations:
|
||||
* <ul>
|
||||
* <li> Binary doc values can be at most {@link #MAX_BINARY_FIELD_LENGTH} in length.
|
||||
* </ul>
|
||||
*/
|
||||
public final class Lucene42DocValuesFormat extends DocValuesFormat {
|
||||
|
||||
/** Maximum length for each binary doc values field. */
|
||||
public static final int MAX_BINARY_FIELD_LENGTH = (1 << 15) - 2;
|
||||
|
||||
final float acceptableOverheadRatio;
|
||||
|
||||
/**
|
||||
|
|
|
@ -22,29 +22,43 @@ import java.util.Iterator;
|
|||
import java.util.NoSuchElementException;
|
||||
|
||||
import org.apache.lucene.codecs.DocValuesConsumer;
|
||||
import org.apache.lucene.util.ByteBlockPool.DirectTrackingAllocator;
|
||||
import org.apache.lucene.util.ByteBlockPool;
|
||||
import org.apache.lucene.store.DataInput;
|
||||
import org.apache.lucene.store.DataOutput;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.Counter;
|
||||
import org.apache.lucene.util.PagedBytes;
|
||||
import org.apache.lucene.util.packed.AppendingDeltaPackedLongBuffer;
|
||||
import org.apache.lucene.util.packed.PackedInts;
|
||||
|
||||
import static org.apache.lucene.util.ByteBlockPool.BYTE_BLOCK_SIZE;
|
||||
|
||||
|
||||
/** Buffers up pending byte[] per doc, then flushes when
|
||||
* segment flushes. */
|
||||
class BinaryDocValuesWriter extends DocValuesWriter {
|
||||
|
||||
private final ByteBlockPool pool;
|
||||
/** Maximum length for a binary field; we set this to "a
|
||||
* bit" below Integer.MAX_VALUE because the exact max
|
||||
* allowed byte[] is JVM dependent, so we want to avoid
|
||||
* a case where a large value worked in one JVM but
|
||||
* failed later at search time with a different JVM. */
|
||||
private static final int MAX_LENGTH = Integer.MAX_VALUE-256;
|
||||
|
||||
// 32 KB block sizes for PagedBytes storage:
|
||||
private final static int BLOCK_BITS = 15;
|
||||
|
||||
private final PagedBytes bytes;
|
||||
private final DataOutput bytesOut;
|
||||
|
||||
private final Counter iwBytesUsed;
|
||||
private final AppendingDeltaPackedLongBuffer lengths;
|
||||
private final FieldInfo fieldInfo;
|
||||
private int addedValues = 0;
|
||||
private int addedValues;
|
||||
private long bytesUsed;
|
||||
|
||||
public BinaryDocValuesWriter(FieldInfo fieldInfo, Counter iwBytesUsed) {
|
||||
this.fieldInfo = fieldInfo;
|
||||
this.pool = new ByteBlockPool(new DirectTrackingAllocator(iwBytesUsed));
|
||||
this.bytes = new PagedBytes(BLOCK_BITS);
|
||||
this.bytesOut = bytes.getDataOutput();
|
||||
this.lengths = new AppendingDeltaPackedLongBuffer(PackedInts.COMPACT);
|
||||
this.iwBytesUsed = iwBytesUsed;
|
||||
}
|
||||
|
||||
public void addValue(int docID, BytesRef value) {
|
||||
|
@ -54,8 +68,8 @@ class BinaryDocValuesWriter extends DocValuesWriter {
|
|||
if (value == null) {
|
||||
throw new IllegalArgumentException("field=\"" + fieldInfo.name + "\": null value not allowed");
|
||||
}
|
||||
if (value.length > (BYTE_BLOCK_SIZE - 2)) {
|
||||
throw new IllegalArgumentException("DocValuesField \"" + fieldInfo.name + "\" is too large, must be <= " + (BYTE_BLOCK_SIZE - 2));
|
||||
if (value.length > MAX_LENGTH) {
|
||||
throw new IllegalArgumentException("DocValuesField \"" + fieldInfo.name + "\" is too large, must be <= " + MAX_LENGTH);
|
||||
}
|
||||
|
||||
// Fill in any holes:
|
||||
|
@ -65,7 +79,19 @@ class BinaryDocValuesWriter extends DocValuesWriter {
|
|||
}
|
||||
addedValues++;
|
||||
lengths.add(value.length);
|
||||
pool.append(value);
|
||||
try {
|
||||
bytesOut.writeBytes(value.bytes, value.offset, value.length);
|
||||
} catch (IOException ioe) {
|
||||
// Should never happen!
|
||||
throw new RuntimeException(ioe);
|
||||
}
|
||||
updateBytesUsed();
|
||||
}
|
||||
|
||||
private void updateBytesUsed() {
|
||||
final long newBytesUsed = lengths.ramBytesUsed() + bytes.ramBytesUsed();
|
||||
iwBytesUsed.addAndGet(newBytesUsed - bytesUsed);
|
||||
bytesUsed = newBytesUsed;
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -75,6 +101,7 @@ class BinaryDocValuesWriter extends DocValuesWriter {
|
|||
@Override
|
||||
public void flush(SegmentWriteState state, DocValuesConsumer dvConsumer) throws IOException {
|
||||
final int maxDoc = state.segmentInfo.getDocCount();
|
||||
bytes.freeze(false);
|
||||
dvConsumer.addBinaryField(fieldInfo,
|
||||
new Iterable<BytesRef>() {
|
||||
@Override
|
||||
|
@ -92,10 +119,10 @@ class BinaryDocValuesWriter extends DocValuesWriter {
|
|||
private class BytesIterator implements Iterator<BytesRef> {
|
||||
final BytesRef value = new BytesRef();
|
||||
final AppendingDeltaPackedLongBuffer.Iterator lengthsIterator = lengths.iterator();
|
||||
final DataInput bytesIterator = bytes.getDataInput();
|
||||
final int size = (int) lengths.size();
|
||||
final int maxDoc;
|
||||
int upto;
|
||||
long byteOffset;
|
||||
|
||||
BytesIterator(int maxDoc) {
|
||||
this.maxDoc = maxDoc;
|
||||
|
@ -115,8 +142,12 @@ class BinaryDocValuesWriter extends DocValuesWriter {
|
|||
int length = (int) lengthsIterator.next();
|
||||
value.grow(length);
|
||||
value.length = length;
|
||||
pool.readBytes(byteOffset, value.bytes, value.offset, value.length);
|
||||
byteOffset += length;
|
||||
try {
|
||||
bytesIterator.readBytes(value.bytes, value.offset, value.length);
|
||||
} catch (IOException ioe) {
|
||||
// Should never happen!
|
||||
throw new RuntimeException(ioe);
|
||||
}
|
||||
} else {
|
||||
// This is to handle last N documents not having
|
||||
// this DV field in the end of the segment:
|
||||
|
|
|
@ -92,21 +92,22 @@ public final class FieldInfo {
|
|||
*/
|
||||
NUMERIC,
|
||||
/**
|
||||
* A per-document byte[].
|
||||
* A per-document byte[]. Values may be larger than
|
||||
* 32766 bytes, but different codecs may enforce their own limits.
|
||||
*/
|
||||
BINARY,
|
||||
/**
|
||||
* A pre-sorted byte[]. Fields with this type only store distinct byte values
|
||||
* and store an additional offset pointer per document to dereference the shared
|
||||
* byte[]. The stored byte[] is presorted and allows access via document id,
|
||||
* ordinal and by-value.
|
||||
* ordinal and by-value. Values must be <= 32766 bytes.
|
||||
*/
|
||||
SORTED,
|
||||
/**
|
||||
* A pre-sorted Set<byte[]>. Fields with this type only store distinct byte values
|
||||
* and store additional offset pointers per document to dereference the shared
|
||||
* byte[]s. The stored byte[] is presorted and allows access via document id,
|
||||
* ordinal and by-value.
|
||||
* ordinal and by-value. Values must be <= 32766 bytes.
|
||||
*/
|
||||
SORTED_SET
|
||||
};
|
||||
|
|
|
@ -21,6 +21,8 @@ import java.io.IOException;
|
|||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.lucene.store.DataInput;
|
||||
import org.apache.lucene.store.DataOutput;
|
||||
import org.apache.lucene.store.IndexInput;
|
||||
|
||||
/** Represents a logical byte[] as a series of pages. You
|
||||
|
@ -34,6 +36,7 @@ import org.apache.lucene.store.IndexInput;
|
|||
// other "shift/mask big arrays". there are too many of these classes!
|
||||
public final class PagedBytes {
|
||||
private final List<byte[]> blocks = new ArrayList<byte[]>();
|
||||
// TODO: these are unused?
|
||||
private final List<Integer> blockEnd = new ArrayList<Integer>();
|
||||
private final int blockSize;
|
||||
private final int blockBits;
|
||||
|
@ -42,6 +45,7 @@ public final class PagedBytes {
|
|||
private boolean frozen;
|
||||
private int upto;
|
||||
private byte[] currentBlock;
|
||||
private final long bytesUsedPerBlock;
|
||||
|
||||
private static final byte[] EMPTY_BYTES = new byte[0];
|
||||
|
||||
|
@ -75,13 +79,13 @@ public final class PagedBytes {
|
|||
* given length. Iff the slice spans across a block border this method will
|
||||
* allocate sufficient resources and copy the paged data.
|
||||
* <p>
|
||||
* Slices spanning more than one block are not supported.
|
||||
* Slices spanning more than two blocks are not supported.
|
||||
* </p>
|
||||
* @lucene.internal
|
||||
**/
|
||||
public void fillSlice(BytesRef b, long start, int length) {
|
||||
assert length >= 0: "length=" + length;
|
||||
assert length <= blockSize+1;
|
||||
assert length <= blockSize+1: "length=" + length;
|
||||
final int index = (int) (start >> blockBits);
|
||||
final int offset = (int) (start & blockMask);
|
||||
b.length = length;
|
||||
|
@ -132,6 +136,7 @@ public final class PagedBytes {
|
|||
this.blockBits = blockBits;
|
||||
blockMask = blockSize-1;
|
||||
upto = blockSize;
|
||||
bytesUsedPerBlock = blockSize + RamUsageEstimator.NUM_BYTES_ARRAY_HEADER + RamUsageEstimator.NUM_BYTES_OBJECT_REF;
|
||||
}
|
||||
|
||||
/** Read this many bytes from in */
|
||||
|
@ -216,6 +221,11 @@ public final class PagedBytes {
|
|||
}
|
||||
}
|
||||
|
||||
/** Return approx RAM usage in bytes. */
|
||||
public long ramBytesUsed() {
|
||||
return (blocks.size() + (currentBlock != null ? 1 : 0)) * bytesUsedPerBlock;
|
||||
}
|
||||
|
||||
/** Copy bytes in, writing the length as a 1 or 2 byte
|
||||
* vInt prefix. */
|
||||
// TODO: this really needs to be refactored into fieldcacheimpl!
|
||||
|
@ -249,4 +259,148 @@ public final class PagedBytes {
|
|||
|
||||
return pointer;
|
||||
}
|
||||
|
||||
public final class PagedBytesDataInput extends DataInput {
|
||||
private int currentBlockIndex;
|
||||
private int currentBlockUpto;
|
||||
private byte[] currentBlock;
|
||||
|
||||
PagedBytesDataInput() {
|
||||
currentBlock = blocks.get(0);
|
||||
}
|
||||
|
||||
@Override
|
||||
public PagedBytesDataInput clone() {
|
||||
PagedBytesDataInput clone = getDataInput();
|
||||
clone.setPosition(getPosition());
|
||||
return clone;
|
||||
}
|
||||
|
||||
/** Returns the current byte position. */
|
||||
public long getPosition() {
|
||||
return (long) currentBlockIndex * blockSize + currentBlockUpto;
|
||||
}
|
||||
|
||||
/** Seek to a position previously obtained from
|
||||
* {@link #getPosition}. */
|
||||
public void setPosition(long pos) {
|
||||
currentBlockIndex = (int) (pos >> blockBits);
|
||||
currentBlock = blocks.get(currentBlockIndex);
|
||||
currentBlockUpto = (int) (pos & blockMask);
|
||||
}
|
||||
|
||||
@Override
|
||||
public byte readByte() {
|
||||
if (currentBlockUpto == blockSize) {
|
||||
nextBlock();
|
||||
}
|
||||
return currentBlock[currentBlockUpto++];
|
||||
}
|
||||
|
||||
@Override
|
||||
public void readBytes(byte[] b, int offset, int len) {
|
||||
assert b.length >= offset + len;
|
||||
final int offsetEnd = offset + len;
|
||||
while (true) {
|
||||
final int blockLeft = blockSize - currentBlockUpto;
|
||||
final int left = offsetEnd - offset;
|
||||
if (blockLeft < left) {
|
||||
System.arraycopy(currentBlock, currentBlockUpto,
|
||||
b, offset,
|
||||
blockLeft);
|
||||
nextBlock();
|
||||
offset += blockLeft;
|
||||
} else {
|
||||
// Last block
|
||||
System.arraycopy(currentBlock, currentBlockUpto,
|
||||
b, offset,
|
||||
left);
|
||||
currentBlockUpto += left;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private void nextBlock() {
|
||||
currentBlockIndex++;
|
||||
currentBlockUpto = 0;
|
||||
currentBlock = blocks.get(currentBlockIndex);
|
||||
}
|
||||
}
|
||||
|
||||
public final class PagedBytesDataOutput extends DataOutput {
|
||||
@Override
|
||||
public void writeByte(byte b) {
|
||||
if (upto == blockSize) {
|
||||
if (currentBlock != null) {
|
||||
blocks.add(currentBlock);
|
||||
blockEnd.add(upto);
|
||||
}
|
||||
currentBlock = new byte[blockSize];
|
||||
upto = 0;
|
||||
}
|
||||
currentBlock[upto++] = b;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void writeBytes(byte[] b, int offset, int length) {
|
||||
assert b.length >= offset + length;
|
||||
if (length == 0) {
|
||||
return;
|
||||
}
|
||||
|
||||
if (upto == blockSize) {
|
||||
if (currentBlock != null) {
|
||||
blocks.add(currentBlock);
|
||||
blockEnd.add(upto);
|
||||
}
|
||||
currentBlock = new byte[blockSize];
|
||||
upto = 0;
|
||||
}
|
||||
|
||||
final int offsetEnd = offset + length;
|
||||
while(true) {
|
||||
final int left = offsetEnd - offset;
|
||||
final int blockLeft = blockSize - upto;
|
||||
if (blockLeft < left) {
|
||||
System.arraycopy(b, offset, currentBlock, upto, blockLeft);
|
||||
blocks.add(currentBlock);
|
||||
blockEnd.add(blockSize);
|
||||
currentBlock = new byte[blockSize];
|
||||
upto = 0;
|
||||
offset += blockLeft;
|
||||
} else {
|
||||
// Last block
|
||||
System.arraycopy(b, offset, currentBlock, upto, left);
|
||||
upto += left;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/** Return the current byte position. */
|
||||
public long getPosition() {
|
||||
return getPointer();
|
||||
}
|
||||
}
|
||||
|
||||
/** Returns a DataInput to read values from this
|
||||
* PagedBytes instance. */
|
||||
public PagedBytesDataInput getDataInput() {
|
||||
if (!frozen) {
|
||||
throw new IllegalStateException("must call freeze() before getDataInput");
|
||||
}
|
||||
return new PagedBytesDataInput();
|
||||
}
|
||||
|
||||
/** Returns a DataOutput that you may use to write into
|
||||
* this PagedBytes instance. If you do this, you should
|
||||
* not call the other writing methods (eg, copy);
|
||||
* results are undefined. */
|
||||
public PagedBytesDataOutput getDataOutput() {
|
||||
if (frozen) {
|
||||
throw new IllegalStateException("cannot get DataOutput after freeze()");
|
||||
}
|
||||
return new PagedBytesDataOutput();
|
||||
}
|
||||
}
|
||||
|
|
|
@ -31,4 +31,10 @@ public class TestLucene40DocValuesFormat extends BaseDocValuesFormatTestCase {
|
|||
return codec;
|
||||
}
|
||||
|
||||
// LUCENE-4583: This codec should throw IAE on huge binary values:
|
||||
@Override
|
||||
protected boolean codecAcceptsHugeBinaryValues(String field) {
|
||||
return false;
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -30,4 +30,9 @@ public class TestLucene42DocValuesFormat extends BaseCompressingDocValuesFormatT
|
|||
protected Codec getCodec() {
|
||||
return codec;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected boolean codecAcceptsHugeBinaryValues(String field) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -46,6 +46,7 @@ import org.apache.lucene.search.TermQuery;
|
|||
import org.apache.lucene.search.TopDocs;
|
||||
import org.apache.lucene.store.Directory;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util._TestUtil;
|
||||
|
||||
/**
|
||||
* Basic tests of PerFieldDocValuesFormat
|
||||
|
@ -64,6 +65,11 @@ public class TestPerFieldDocValuesFormat extends BaseDocValuesFormatTestCase {
|
|||
return codec;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected boolean codecAcceptsHugeBinaryValues(String field) {
|
||||
return _TestUtil.fieldSupportsHugeBinaryDocValues(field);
|
||||
}
|
||||
|
||||
// just a simple trivial test
|
||||
// TODO: we should come up with a test that somehow checks that segment suffix
|
||||
// is respected by all codec apis (not just docvalues and postings)
|
||||
|
|
|
@ -18,6 +18,7 @@ package org.apache.lucene.index;
|
|||
*/
|
||||
|
||||
import org.apache.lucene.codecs.Codec;
|
||||
import org.apache.lucene.util._TestUtil;
|
||||
|
||||
/** Tests the codec configuration defined by LuceneTestCase randomly
|
||||
* (typically a mix across different fields).
|
||||
|
@ -28,4 +29,9 @@ public class TestDocValuesFormat extends BaseDocValuesFormatTestCase {
|
|||
protected Codec getCodec() {
|
||||
return Codec.getDefault();
|
||||
}
|
||||
|
||||
@Override
|
||||
protected boolean codecAcceptsHugeBinaryValues(String field) {
|
||||
return _TestUtil.fieldSupportsHugeBinaryDocValues(field);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -327,30 +327,6 @@ public class TestDocValuesIndexing extends LuceneTestCase {
|
|||
directory.close();
|
||||
}
|
||||
|
||||
public void testTooLargeBytes() throws IOException {
|
||||
Analyzer analyzer = new MockAnalyzer(random());
|
||||
|
||||
Directory directory = newDirectory();
|
||||
// we don't use RandomIndexWriter because it might add more docvalues than we expect !!!!1
|
||||
IndexWriterConfig iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, analyzer);
|
||||
iwc.setMergePolicy(newLogMergePolicy());
|
||||
IndexWriter iwriter = new IndexWriter(directory, iwc);
|
||||
Document doc = new Document();
|
||||
byte bytes[] = new byte[100000];
|
||||
BytesRef b = new BytesRef(bytes);
|
||||
random().nextBytes(bytes);
|
||||
doc.add(new BinaryDocValuesField("dv", b));
|
||||
try {
|
||||
iwriter.addDocument(doc);
|
||||
fail("did not get expected exception");
|
||||
} catch (IllegalArgumentException expected) {
|
||||
// expected
|
||||
}
|
||||
iwriter.close();
|
||||
|
||||
directory.close();
|
||||
}
|
||||
|
||||
public void testTooLargeSortedBytes() throws IOException {
|
||||
Analyzer analyzer = new MockAnalyzer(random());
|
||||
|
||||
|
|
|
@ -22,6 +22,7 @@ import java.util.*;
|
|||
|
||||
import org.apache.lucene.store.BaseDirectoryWrapper;
|
||||
import org.apache.lucene.store.DataInput;
|
||||
import org.apache.lucene.store.DataOutput;
|
||||
import org.apache.lucene.store.IOContext;
|
||||
import org.apache.lucene.store.IndexInput;
|
||||
import org.apache.lucene.store.IndexOutput;
|
||||
|
@ -30,6 +31,9 @@ import org.junit.Ignore;
|
|||
|
||||
public class TestPagedBytes extends LuceneTestCase {
|
||||
|
||||
// Writes random byte/s to "normal" file in dir, then
|
||||
// copies into PagedBytes and verifies with
|
||||
// PagedBytes.Reader:
|
||||
public void testDataInputOutput() throws Exception {
|
||||
Random random = random();
|
||||
for(int iter=0;iter<5*RANDOM_MULTIPLIER;iter++) {
|
||||
|
@ -90,6 +94,60 @@ public class TestPagedBytes extends LuceneTestCase {
|
|||
}
|
||||
}
|
||||
|
||||
// Writes random byte/s into PagedBytes via
|
||||
// .getDataOutput(), then verifies with
|
||||
// PagedBytes.getDataInput():
|
||||
public void testDataInputOutput2() throws Exception {
|
||||
Random random = random();
|
||||
for(int iter=0;iter<5*RANDOM_MULTIPLIER;iter++) {
|
||||
final int blockBits = _TestUtil.nextInt(random, 1, 20);
|
||||
final int blockSize = 1 << blockBits;
|
||||
final PagedBytes p = new PagedBytes(blockBits);
|
||||
final DataOutput out = p.getDataOutput();
|
||||
final int numBytes = random().nextInt(10000000);
|
||||
|
||||
final byte[] answer = new byte[numBytes];
|
||||
random().nextBytes(answer);
|
||||
int written = 0;
|
||||
while(written < numBytes) {
|
||||
if (random().nextInt(10) == 7) {
|
||||
out.writeByte(answer[written++]);
|
||||
} else {
|
||||
int chunk = Math.min(random().nextInt(1000), numBytes - written);
|
||||
out.writeBytes(answer, written, chunk);
|
||||
written += chunk;
|
||||
}
|
||||
}
|
||||
|
||||
final PagedBytes.Reader reader = p.freeze(random.nextBoolean());
|
||||
|
||||
final DataInput in = p.getDataInput();
|
||||
|
||||
final byte[] verify = new byte[numBytes];
|
||||
int read = 0;
|
||||
while(read < numBytes) {
|
||||
if (random().nextInt(10) == 7) {
|
||||
verify[read++] = in.readByte();
|
||||
} else {
|
||||
int chunk = Math.min(random().nextInt(1000), numBytes - read);
|
||||
in.readBytes(verify, read, chunk);
|
||||
read += chunk;
|
||||
}
|
||||
}
|
||||
assertTrue(Arrays.equals(answer, verify));
|
||||
|
||||
final BytesRef slice = new BytesRef();
|
||||
for(int iter2=0;iter2<100;iter2++) {
|
||||
final int pos = random.nextInt(numBytes-1);
|
||||
final int len = random.nextInt(Math.min(blockSize+1, numBytes - pos));
|
||||
reader.fillSlice(slice, pos, len);
|
||||
for(int byteUpto=0;byteUpto<len;byteUpto++) {
|
||||
assertEquals(answer[pos + byteUpto], slice.bytes[slice.offset + byteUpto]);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@Ignore // memory hole
|
||||
public void testOverflow() throws IOException {
|
||||
BaseDirectoryWrapper dir = newFSDirectory(_TestUtil.getTempDir("testOverflow"));
|
||||
|
@ -126,4 +184,5 @@ public class TestPagedBytes extends LuceneTestCase {
|
|||
in.close();
|
||||
dir.close();
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -22,13 +22,16 @@ import java.io.IOException;
|
|||
import java.io.PrintStream;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
|
||||
import org.apache.lucene.analysis.MockAnalyzer;
|
||||
import org.apache.lucene.document.Document;
|
||||
import org.apache.lucene.document.Field;
|
||||
import org.apache.lucene.facet.FacetTestCase;
|
||||
import org.apache.lucene.facet.FacetTestUtils;
|
||||
import org.apache.lucene.facet.codecs.facet42.Facet42Codec;
|
||||
import org.apache.lucene.facet.index.FacetFields;
|
||||
import org.apache.lucene.facet.params.CategoryListParams;
|
||||
import org.apache.lucene.facet.params.FacetIndexingParams;
|
||||
|
@ -48,6 +51,8 @@ import org.apache.lucene.search.similarities.PerFieldSimilarityWrapper;
|
|||
import org.apache.lucene.search.similarities.Similarity;
|
||||
import org.apache.lucene.store.Directory;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.IOUtils;
|
||||
import org.apache.lucene.util._TestUtil;
|
||||
|
||||
public class TestDemoFacets extends FacetTestCase {
|
||||
|
||||
|
@ -248,4 +253,60 @@ public class TestDemoFacets extends FacetTestCase {
|
|||
dir.close();
|
||||
taxoDir.close();
|
||||
}
|
||||
|
||||
// LUCENE-4583: make sure if we require > 32 KB for one
|
||||
// document, we don't hit exc when using Facet42DocValuesFormat
|
||||
public void testManyFacetsInOneDocument() throws Exception {
|
||||
Directory dir = newDirectory();
|
||||
Directory taxoDir = newDirectory();
|
||||
IndexWriterConfig iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random()));
|
||||
iwc.setCodec(new Facet42Codec());
|
||||
RandomIndexWriter writer = new RandomIndexWriter(random(), dir, iwc);
|
||||
DirectoryTaxonomyWriter taxoWriter = new DirectoryTaxonomyWriter(taxoDir, IndexWriterConfig.OpenMode.CREATE);
|
||||
|
||||
FacetFields facetFields = new FacetFields(taxoWriter);
|
||||
|
||||
int numLabels = _TestUtil.nextInt(random(), 40000, 100000);
|
||||
|
||||
Document doc = new Document();
|
||||
doc.add(newTextField("field", "text", Field.Store.NO));
|
||||
List<CategoryPath> paths = new ArrayList<CategoryPath>();
|
||||
for(int i=0;i<numLabels;i++) {
|
||||
paths.add(new CategoryPath("dim", "" + i));
|
||||
}
|
||||
facetFields.addFields(doc, paths);
|
||||
writer.addDocument(doc);
|
||||
|
||||
// NRT open
|
||||
IndexSearcher searcher = newSearcher(writer.getReader());
|
||||
writer.close();
|
||||
|
||||
// NRT open
|
||||
TaxonomyReader taxoReader = new DirectoryTaxonomyReader(taxoWriter);
|
||||
taxoWriter.close();
|
||||
|
||||
FacetSearchParams fsp = new FacetSearchParams(new CountFacetRequest(new CategoryPath("dim"), Integer.MAX_VALUE));
|
||||
|
||||
// Aggregate the facet counts:
|
||||
FacetsCollector c = FacetsCollector.create(fsp, searcher.getIndexReader(), taxoReader);
|
||||
|
||||
// MatchAllDocsQuery is for "browsing" (counts facets
|
||||
// for all non-deleted docs in the index); normally
|
||||
// you'd use a "normal" query, and use MultiCollector to
|
||||
// wrap collecting the "normal" hits and also facets:
|
||||
searcher.search(new MatchAllDocsQuery(), c);
|
||||
List<FacetResult> results = c.getFacetResults();
|
||||
assertEquals(1, results.size());
|
||||
FacetResultNode root = results.get(0).getFacetResultNode();
|
||||
assertEquals(numLabels, root.subResults.size());
|
||||
Set<String> allLabels = new HashSet<String>();
|
||||
for(FacetResultNode childNode : root.subResults) {
|
||||
assertEquals(2, childNode.label.length);
|
||||
allLabels.add(childNode.label.components[1]);
|
||||
assertEquals(1, (int) childNode.value);
|
||||
}
|
||||
assertEquals(numLabels, allLabels.size());
|
||||
|
||||
IOUtils.close(searcher.getIndexReader(), taxoReader, dir, taxoDir);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -156,6 +156,9 @@ class Lucene40DocValuesWriter extends DocValuesConsumer {
|
|||
int minLength = Integer.MAX_VALUE;
|
||||
int maxLength = Integer.MIN_VALUE;
|
||||
for (BytesRef b : values) {
|
||||
if (b.length > Lucene40DocValuesFormat.MAX_BINARY_FIELD_LENGTH) {
|
||||
throw new IllegalArgumentException("DocValuesField \"" + field.name + "\" is too large, must be <= " + Lucene40DocValuesFormat.MAX_BINARY_FIELD_LENGTH);
|
||||
}
|
||||
minLength = Math.min(minLength, b.length);
|
||||
maxLength = Math.max(maxLength, b.length);
|
||||
if (uniqueValues != null) {
|
||||
|
|
|
@ -17,21 +17,20 @@ package org.apache.lucene.index;
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import static org.apache.lucene.index.SortedSetDocValues.NO_MORE_ORDS;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Map.Entry;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
import java.util.TreeSet;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.MockAnalyzer;
|
||||
import org.apache.lucene.codecs.Codec;
|
||||
import org.apache.lucene.codecs.lucene42.Lucene42DocValuesFormat;
|
||||
import org.apache.lucene.document.BinaryDocValuesField;
|
||||
import org.apache.lucene.document.Document;
|
||||
import org.apache.lucene.document.Field;
|
||||
|
@ -57,6 +56,8 @@ import org.apache.lucene.util.BytesRefHash;
|
|||
import org.apache.lucene.util.LuceneTestCase;
|
||||
import org.apache.lucene.util._TestUtil;
|
||||
|
||||
import static org.apache.lucene.index.SortedSetDocValues.NO_MORE_ORDS;
|
||||
|
||||
/**
|
||||
* Abstract class to do basic tests for a docvalues format.
|
||||
* NOTE: This test focuses on the docvalues impl, nothing else.
|
||||
|
@ -2198,4 +2199,172 @@ public abstract class BaseDocValuesFormatTestCase extends LuceneTestCase {
|
|||
}
|
||||
}
|
||||
|
||||
// LUCENE-4853
|
||||
public void testHugeBinaryValues() throws Exception {
|
||||
Analyzer analyzer = new MockAnalyzer(random());
|
||||
// FSDirectory because SimpleText will consume gobbs of
|
||||
// space when storing big binary values:
|
||||
Directory d = newFSDirectory(_TestUtil.getTempDir("hugeBinaryValues"));
|
||||
boolean doFixed = random().nextBoolean();
|
||||
int numDocs;
|
||||
int fixedLength = 0;
|
||||
if (doFixed) {
|
||||
// Sometimes make all values fixed length since some
|
||||
// codecs have different code paths for this:
|
||||
numDocs = _TestUtil.nextInt(random(), 10, 20);
|
||||
fixedLength = _TestUtil.nextInt(random(), 65537, 256*1024);
|
||||
} else {
|
||||
numDocs = _TestUtil.nextInt(random(), 100, 200);
|
||||
}
|
||||
IndexWriter w = new IndexWriter(d, newIndexWriterConfig(TEST_VERSION_CURRENT, analyzer));
|
||||
List<byte[]> docBytes = new ArrayList<byte[]>();
|
||||
long totalBytes = 0;
|
||||
for(int docID=0;docID<numDocs;docID++) {
|
||||
// we don't use RandomIndexWriter because it might add
|
||||
// more docvalues than we expect !!!!
|
||||
|
||||
// Must be > 64KB in size to ensure more than 2 pages in
|
||||
// PagedBytes would be needed:
|
||||
int numBytes;
|
||||
if (doFixed) {
|
||||
numBytes = fixedLength;
|
||||
} else if (docID == 0 || random().nextInt(5) == 3) {
|
||||
numBytes = _TestUtil.nextInt(random(), 65537, 3*1024*1024);
|
||||
} else {
|
||||
numBytes = _TestUtil.nextInt(random(), 1, 1024*1024);
|
||||
}
|
||||
totalBytes += numBytes;
|
||||
if (totalBytes > 5 * 1024*1024) {
|
||||
break;
|
||||
}
|
||||
byte[] bytes = new byte[numBytes];
|
||||
random().nextBytes(bytes);
|
||||
docBytes.add(bytes);
|
||||
Document doc = new Document();
|
||||
BytesRef b = new BytesRef(bytes);
|
||||
b.length = bytes.length;
|
||||
doc.add(new BinaryDocValuesField("field", b));
|
||||
doc.add(new StringField("id", ""+docID, Field.Store.YES));
|
||||
try {
|
||||
w.addDocument(doc);
|
||||
} catch (IllegalArgumentException iae) {
|
||||
if (iae.getMessage().indexOf("is too large") == -1) {
|
||||
throw iae;
|
||||
} else {
|
||||
// OK: some codecs can't handle binary DV > 32K
|
||||
assertFalse(codecAcceptsHugeBinaryValues("field"));
|
||||
w.rollback();
|
||||
d.close();
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
DirectoryReader r;
|
||||
try {
|
||||
r = w.getReader();
|
||||
} catch (IllegalArgumentException iae) {
|
||||
if (iae.getMessage().indexOf("is too large") == -1) {
|
||||
throw iae;
|
||||
} else {
|
||||
assertFalse(codecAcceptsHugeBinaryValues("field"));
|
||||
|
||||
// OK: some codecs can't handle binary DV > 32K
|
||||
w.rollback();
|
||||
d.close();
|
||||
return;
|
||||
}
|
||||
}
|
||||
w.close();
|
||||
|
||||
AtomicReader ar = SlowCompositeReaderWrapper.wrap(r);
|
||||
|
||||
BinaryDocValues s = FieldCache.DEFAULT.getTerms(ar, "field");
|
||||
for(int docID=0;docID<docBytes.size();docID++) {
|
||||
StoredDocument doc = ar.document(docID);
|
||||
BytesRef bytes = new BytesRef();
|
||||
s.get(docID, bytes);
|
||||
byte[] expected = docBytes.get(Integer.parseInt(doc.get("id")));
|
||||
assertEquals(expected.length, bytes.length);
|
||||
assertEquals(new BytesRef(expected), bytes);
|
||||
}
|
||||
|
||||
assertTrue(codecAcceptsHugeBinaryValues("field"));
|
||||
|
||||
ar.close();
|
||||
d.close();
|
||||
}
|
||||
|
||||
public void testHugeBinaryValueLimit() throws Exception {
|
||||
// We only test DVFormats that have a limit
|
||||
assumeFalse("test requires codec with limits on max binary field length", codecAcceptsHugeBinaryValues("field"));
|
||||
Analyzer analyzer = new MockAnalyzer(random());
|
||||
// FSDirectory because SimpleText will consume gobbs of
|
||||
// space when storing big binary values:
|
||||
Directory d = newFSDirectory(_TestUtil.getTempDir("hugeBinaryValues"));
|
||||
boolean doFixed = random().nextBoolean();
|
||||
int numDocs;
|
||||
int fixedLength = 0;
|
||||
if (doFixed) {
|
||||
// Sometimes make all values fixed length since some
|
||||
// codecs have different code paths for this:
|
||||
numDocs = _TestUtil.nextInt(random(), 10, 20);
|
||||
fixedLength = Lucene42DocValuesFormat.MAX_BINARY_FIELD_LENGTH;
|
||||
} else {
|
||||
numDocs = _TestUtil.nextInt(random(), 100, 200);
|
||||
}
|
||||
IndexWriter w = new IndexWriter(d, newIndexWriterConfig(TEST_VERSION_CURRENT, analyzer));
|
||||
List<byte[]> docBytes = new ArrayList<byte[]>();
|
||||
long totalBytes = 0;
|
||||
for(int docID=0;docID<numDocs;docID++) {
|
||||
// we don't use RandomIndexWriter because it might add
|
||||
// more docvalues than we expect !!!!
|
||||
|
||||
// Must be > 64KB in size to ensure more than 2 pages in
|
||||
// PagedBytes would be needed:
|
||||
int numBytes;
|
||||
if (doFixed) {
|
||||
numBytes = fixedLength;
|
||||
} else if (docID == 0 || random().nextInt(5) == 3) {
|
||||
numBytes = Lucene42DocValuesFormat.MAX_BINARY_FIELD_LENGTH;
|
||||
} else {
|
||||
numBytes = _TestUtil.nextInt(random(), 1, Lucene42DocValuesFormat.MAX_BINARY_FIELD_LENGTH);
|
||||
}
|
||||
totalBytes += numBytes;
|
||||
if (totalBytes > 5 * 1024*1024) {
|
||||
break;
|
||||
}
|
||||
byte[] bytes = new byte[numBytes];
|
||||
random().nextBytes(bytes);
|
||||
docBytes.add(bytes);
|
||||
Document doc = new Document();
|
||||
BytesRef b = new BytesRef(bytes);
|
||||
b.length = bytes.length;
|
||||
doc.add(new BinaryDocValuesField("field", b));
|
||||
doc.add(new StringField("id", ""+docID, Field.Store.YES));
|
||||
w.addDocument(doc);
|
||||
}
|
||||
|
||||
DirectoryReader r = w.getReader();
|
||||
w.close();
|
||||
|
||||
AtomicReader ar = SlowCompositeReaderWrapper.wrap(r);
|
||||
|
||||
BinaryDocValues s = FieldCache.DEFAULT.getTerms(ar, "field");
|
||||
for(int docID=0;docID<docBytes.size();docID++) {
|
||||
StoredDocument doc = ar.document(docID);
|
||||
BytesRef bytes = new BytesRef();
|
||||
s.get(docID, bytes);
|
||||
byte[] expected = docBytes.get(Integer.parseInt(doc.get("id")));
|
||||
assertEquals(expected.length, bytes.length);
|
||||
assertEquals(new BytesRef(expected), bytes);
|
||||
}
|
||||
|
||||
ar.close();
|
||||
d.close();
|
||||
}
|
||||
|
||||
protected boolean codecAcceptsHugeBinaryValues(String field) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -45,6 +45,7 @@ import org.apache.lucene.codecs.Codec;
|
|||
import org.apache.lucene.codecs.DocValuesFormat;
|
||||
import org.apache.lucene.codecs.PostingsFormat;
|
||||
import org.apache.lucene.codecs.lucene42.Lucene42Codec;
|
||||
import org.apache.lucene.codecs.perfield.PerFieldDocValuesFormat;
|
||||
import org.apache.lucene.codecs.perfield.PerFieldPostingsFormat;
|
||||
import org.apache.lucene.document.BinaryDocValuesField;
|
||||
import org.apache.lucene.document.Document;
|
||||
|
@ -58,12 +59,12 @@ import org.apache.lucene.document.NumericDocValuesField;
|
|||
import org.apache.lucene.document.SortedDocValuesField;
|
||||
import org.apache.lucene.index.AtomicReader;
|
||||
import org.apache.lucene.index.AtomicReaderContext;
|
||||
import org.apache.lucene.index.CheckIndex;
|
||||
import org.apache.lucene.index.CheckIndex.Status.DocValuesStatus;
|
||||
import org.apache.lucene.index.CheckIndex.Status.FieldNormStatus;
|
||||
import org.apache.lucene.index.CheckIndex.Status.StoredFieldStatus;
|
||||
import org.apache.lucene.index.CheckIndex.Status.TermIndexStatus;
|
||||
import org.apache.lucene.index.CheckIndex.Status.TermVectorStatus;
|
||||
import org.apache.lucene.index.CheckIndex;
|
||||
import org.apache.lucene.index.ConcurrentMergeScheduler;
|
||||
import org.apache.lucene.index.DocsAndPositionsEnum;
|
||||
import org.apache.lucene.index.DocsEnum;
|
||||
|
@ -742,6 +743,25 @@ public class _TestUtil {
|
|||
return p.getName();
|
||||
}
|
||||
}
|
||||
public static String getDocValuesFormat(String field) {
|
||||
return getDocValuesFormat(Codec.getDefault(), field);
|
||||
}
|
||||
|
||||
public static String getDocValuesFormat(Codec codec, String field) {
|
||||
DocValuesFormat f = codec.docValuesFormat();
|
||||
if (f instanceof PerFieldDocValuesFormat) {
|
||||
return ((PerFieldDocValuesFormat) f).getDocValuesFormatForField(field).getName();
|
||||
} else {
|
||||
return f.getName();
|
||||
}
|
||||
}
|
||||
|
||||
public static boolean fieldSupportsHugeBinaryDocValues(String field) {
|
||||
String dvFormat = getDocValuesFormat(field);
|
||||
return dvFormat.equals("CheapBastard") ||
|
||||
dvFormat.equals("Disk") ||
|
||||
dvFormat.equals("SimpleText");
|
||||
}
|
||||
|
||||
public static boolean anyFilesExceptWriteLock(Directory dir) throws IOException {
|
||||
String[] files = dir.listAll();
|
||||
|
|
Loading…
Reference in New Issue