LUCENE-4583: IndexWriter no longer places a limit on length of DV binary fields (individual codecs still have their limits, including the default codec)

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1514669 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Michael McCandless 2013-08-16 12:04:58 +00:00
parent fd9ae25cd1
commit f7d2ac0b0d
17 changed files with 573 additions and 51 deletions

View File

@ -122,6 +122,11 @@ Bug Fixes
boundary, made it into the top-N and went to the formatter.
(Manuel Amoabeng, Michael McCandless, Robert Muir)
* LUCENE-4583: Indexing core no longer enforces a limit on the maximum
length of binary doc values fields, but individual codecs (including
the default one) have their own limits (David Smiley, Robert Muir,
Mike McCandless)
API Changes
* LUCENE-5094: Add ramBytesUsed() to MultiDocValues.OrdinalMap.

View File

@ -118,6 +118,11 @@ import org.apache.lucene.util.packed.PackedInts;
* {@code BYTES_VAR_DEREF BYTES_VAR_DEREF} it doesn't apply deduplication of the document values.
* </li>
* </ul>
* <p>
* Limitations:
* <ul>
* <li> Binary doc values can be at most {@link #MAX_BINARY_FIELD_LENGTH} in length.
* </ul>
* @deprecated Only for reading old 4.0 and 4.1 segments
*/
@Deprecated
@ -125,6 +130,9 @@ import org.apache.lucene.util.packed.PackedInts;
// for back compat only!
public class Lucene40DocValuesFormat extends DocValuesFormat {
/** Maximum length for each binary doc values field. */
public static final int MAX_BINARY_FIELD_LENGTH = (1 << 15) - 2;
/** Sole constructor. */
public Lucene40DocValuesFormat() {
super("Lucene40");

View File

@ -36,14 +36,14 @@ import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.MathUtil;
import org.apache.lucene.util.fst.Builder;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.FST.INPUT_TYPE;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.PositiveIntOutputs;
import org.apache.lucene.util.fst.Util;
import org.apache.lucene.util.packed.BlockPackedWriter;
import org.apache.lucene.util.packed.MonotonicBlockPackedWriter;
import org.apache.lucene.util.packed.PackedInts;
import org.apache.lucene.util.packed.PackedInts.FormatAndBits;
import org.apache.lucene.util.packed.PackedInts;
/**
* Writer for {@link Lucene42DocValuesFormat}
@ -216,6 +216,9 @@ class Lucene42DocValuesConsumer extends DocValuesConsumer {
int maxLength = Integer.MIN_VALUE;
final long startFP = data.getFilePointer();
for(BytesRef v : values) {
if (v.length > Lucene42DocValuesFormat.MAX_BINARY_FIELD_LENGTH) {
throw new IllegalArgumentException("DocValuesField \"" + field.name + "\" is too large, must be <= " + Lucene42DocValuesFormat.MAX_BINARY_FIELD_LENGTH);
}
minLength = Math.min(minLength, v.length);
maxLength = Math.max(maxLength, v.length);
data.writeBytes(v.bytes, v.offset, v.length);

View File

@ -118,8 +118,17 @@ import org.apache.lucene.util.packed.BlockPackedWriter;
* <p>SortedSet entries store the list of ordinals in their BinaryData as a
* sequences of increasing {@link DataOutput#writeVLong vLong}s, delta-encoded.</p>
* </ol>
* <p>
* Limitations:
* <ul>
* <li> Binary doc values can be at most {@link #MAX_BINARY_FIELD_LENGTH} in length.
* </ul>
*/
public final class Lucene42DocValuesFormat extends DocValuesFormat {
/** Maximum length for each binary doc values field. */
public static final int MAX_BINARY_FIELD_LENGTH = (1 << 15) - 2;
final float acceptableOverheadRatio;
/**

View File

@ -22,29 +22,43 @@ import java.util.Iterator;
import java.util.NoSuchElementException;
import org.apache.lucene.codecs.DocValuesConsumer;
import org.apache.lucene.util.ByteBlockPool.DirectTrackingAllocator;
import org.apache.lucene.util.ByteBlockPool;
import org.apache.lucene.store.DataInput;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.Counter;
import org.apache.lucene.util.PagedBytes;
import org.apache.lucene.util.packed.AppendingDeltaPackedLongBuffer;
import org.apache.lucene.util.packed.PackedInts;
import static org.apache.lucene.util.ByteBlockPool.BYTE_BLOCK_SIZE;
/** Buffers up pending byte[] per doc, then flushes when
* segment flushes. */
class BinaryDocValuesWriter extends DocValuesWriter {
private final ByteBlockPool pool;
/** Maximum length for a binary field; we set this to "a
* bit" below Integer.MAX_VALUE because the exact max
* allowed byte[] is JVM dependent, so we want to avoid
* a case where a large value worked in one JVM but
* failed later at search time with a different JVM. */
private static final int MAX_LENGTH = Integer.MAX_VALUE-256;
// 32 KB block sizes for PagedBytes storage:
private final static int BLOCK_BITS = 15;
private final PagedBytes bytes;
private final DataOutput bytesOut;
private final Counter iwBytesUsed;
private final AppendingDeltaPackedLongBuffer lengths;
private final FieldInfo fieldInfo;
private int addedValues = 0;
private int addedValues;
private long bytesUsed;
public BinaryDocValuesWriter(FieldInfo fieldInfo, Counter iwBytesUsed) {
this.fieldInfo = fieldInfo;
this.pool = new ByteBlockPool(new DirectTrackingAllocator(iwBytesUsed));
this.bytes = new PagedBytes(BLOCK_BITS);
this.bytesOut = bytes.getDataOutput();
this.lengths = new AppendingDeltaPackedLongBuffer(PackedInts.COMPACT);
this.iwBytesUsed = iwBytesUsed;
}
public void addValue(int docID, BytesRef value) {
@ -54,8 +68,8 @@ class BinaryDocValuesWriter extends DocValuesWriter {
if (value == null) {
throw new IllegalArgumentException("field=\"" + fieldInfo.name + "\": null value not allowed");
}
if (value.length > (BYTE_BLOCK_SIZE - 2)) {
throw new IllegalArgumentException("DocValuesField \"" + fieldInfo.name + "\" is too large, must be <= " + (BYTE_BLOCK_SIZE - 2));
if (value.length > MAX_LENGTH) {
throw new IllegalArgumentException("DocValuesField \"" + fieldInfo.name + "\" is too large, must be <= " + MAX_LENGTH);
}
// Fill in any holes:
@ -65,7 +79,19 @@ class BinaryDocValuesWriter extends DocValuesWriter {
}
addedValues++;
lengths.add(value.length);
pool.append(value);
try {
bytesOut.writeBytes(value.bytes, value.offset, value.length);
} catch (IOException ioe) {
// Should never happen!
throw new RuntimeException(ioe);
}
updateBytesUsed();
}
/** Recomputes the RAM reported by the buffered lengths and bytes and
 *  adds the change since the previous call to the iwBytesUsed counter. */
private void updateBytesUsed() {
  final long currentBytesUsed = lengths.ramBytesUsed() + bytes.ramBytesUsed();
  final long delta = currentBytesUsed - bytesUsed;
  bytesUsed = currentBytesUsed;
  iwBytesUsed.addAndGet(delta);
}
@Override
@ -75,6 +101,7 @@ class BinaryDocValuesWriter extends DocValuesWriter {
@Override
public void flush(SegmentWriteState state, DocValuesConsumer dvConsumer) throws IOException {
final int maxDoc = state.segmentInfo.getDocCount();
bytes.freeze(false);
dvConsumer.addBinaryField(fieldInfo,
new Iterable<BytesRef>() {
@Override
@ -92,10 +119,10 @@ class BinaryDocValuesWriter extends DocValuesWriter {
private class BytesIterator implements Iterator<BytesRef> {
final BytesRef value = new BytesRef();
final AppendingDeltaPackedLongBuffer.Iterator lengthsIterator = lengths.iterator();
final DataInput bytesIterator = bytes.getDataInput();
final int size = (int) lengths.size();
final int maxDoc;
int upto;
long byteOffset;
BytesIterator(int maxDoc) {
this.maxDoc = maxDoc;
@ -115,8 +142,12 @@ class BinaryDocValuesWriter extends DocValuesWriter {
int length = (int) lengthsIterator.next();
value.grow(length);
value.length = length;
pool.readBytes(byteOffset, value.bytes, value.offset, value.length);
byteOffset += length;
try {
bytesIterator.readBytes(value.bytes, value.offset, value.length);
} catch (IOException ioe) {
// Should never happen!
throw new RuntimeException(ioe);
}
} else {
// This is to handle last N documents not having
// this DV field in the end of the segment:

View File

@ -92,21 +92,22 @@ public final class FieldInfo {
*/
NUMERIC,
/**
* A per-document byte[].
* A per-document byte[]. Values may be larger than
* 32766 bytes, but different codecs may enforce their own limits.
*/
BINARY,
/**
* A pre-sorted byte[]. Fields with this type only store distinct byte values
* and store an additional offset pointer per document to dereference the shared
* byte[]. The stored byte[] is presorted and allows access via document id,
* ordinal and by-value.
* ordinal and by-value. Values must be &lt;= 32766 bytes.
*/
SORTED,
/**
* A pre-sorted Set&lt;byte[]&gt;. Fields with this type only store distinct byte values
* and store additional offset pointers per document to dereference the shared
* byte[]s. The stored byte[] is presorted and allows access via document id,
* ordinal and by-value.
* ordinal and by-value. Values must be &lt;= 32766 bytes.
*/
SORTED_SET
};

View File

@ -21,6 +21,8 @@ import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.lucene.store.DataInput;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.store.IndexInput;
/** Represents a logical byte[] as a series of pages. You
@ -34,6 +36,7 @@ import org.apache.lucene.store.IndexInput;
// other "shift/mask big arrays". there are too many of these classes!
public final class PagedBytes {
private final List<byte[]> blocks = new ArrayList<byte[]>();
// TODO: these are unused?
private final List<Integer> blockEnd = new ArrayList<Integer>();
private final int blockSize;
private final int blockBits;
@ -42,6 +45,7 @@ public final class PagedBytes {
private boolean frozen;
private int upto;
private byte[] currentBlock;
private final long bytesUsedPerBlock;
private static final byte[] EMPTY_BYTES = new byte[0];
@ -75,13 +79,13 @@ public final class PagedBytes {
* given length. Iff the slice spans across a block border this method will
* allocate sufficient resources and copy the paged data.
* <p>
* Slices spanning more than one block are not supported.
* Slices spanning more than two blocks are not supported.
* </p>
* @lucene.internal
**/
public void fillSlice(BytesRef b, long start, int length) {
assert length >= 0: "length=" + length;
assert length <= blockSize+1;
assert length <= blockSize+1: "length=" + length;
final int index = (int) (start >> blockBits);
final int offset = (int) (start & blockMask);
b.length = length;
@ -132,6 +136,7 @@ public final class PagedBytes {
this.blockBits = blockBits;
blockMask = blockSize-1;
upto = blockSize;
bytesUsedPerBlock = blockSize + RamUsageEstimator.NUM_BYTES_ARRAY_HEADER + RamUsageEstimator.NUM_BYTES_OBJECT_REF;
}
/** Read this many bytes from in */
@ -216,6 +221,11 @@ public final class PagedBytes {
}
}
/** Returns the approximate RAM usage in bytes: a fixed per-block cost
 *  for every entry in the blocks list, plus one more for the block
 *  currently being written, if there is one. */
public long ramBytesUsed() {
  int blockCount = blocks.size();
  if (currentBlock != null) {
    blockCount++;
  }
  return blockCount * bytesUsedPerBlock;
}
/** Copy bytes in, writing the length as a 1 or 2 byte
* vInt prefix. */
// TODO: this really needs to be refactored into fieldcacheimpl!
@ -249,4 +259,148 @@ public final class PagedBytes {
return pointer;
}
/**
 * A {@link DataInput} that reads sequentially across the blocks of the
 * enclosing PagedBytes. Instances are created by getDataInput().
 */
public final class PagedBytesDataInput extends DataInput {
  // Index into blocks of the block currently being read:
  private int currentBlockIndex;
  // Offset of the next byte to read within currentBlock:
  private int currentBlockUpto;
  private byte[] currentBlock;

  PagedBytesDataInput() {
    currentBlock = blocks.get(0);
  }

  /** Returns a new input over the same PagedBytes, positioned where
   *  this one currently is. */
  @Override
  public PagedBytesDataInput clone() {
    PagedBytesDataInput clone = getDataInput();
    clone.setPosition(getPosition());
    return clone;
  }

  /** Returns the current byte position. */
  public long getPosition() {
    return (long) currentBlockIndex * blockSize + currentBlockUpto;
  }

  /** Seek to a position previously obtained from
   *  {@link #getPosition}. */
  public void setPosition(long pos) {
    int blockIndex = (int) (pos >> blockBits);
    int blockUpto = (int) (pos & blockMask);
    if (blockUpto == 0
        && blockIndex > 0
        && blockIndex == blocks.size()
        && blocks.get(blockIndex - 1).length == blockSize) {
      // getPosition() reports the end of a completely filled last
      // block as the start of a block that does not exist.  Map it
      // back onto the end of the last block so that such a position
      // (eg from clone() after everything was read) round-trips
      // instead of failing in blocks.get:
      blockIndex--;
      blockUpto = blockSize;
    }
    currentBlockIndex = blockIndex;
    currentBlock = blocks.get(blockIndex);
    currentBlockUpto = blockUpto;
  }

  @Override
  public byte readByte() {
    // Advance lazily: only move to the next block once a byte
    // beyond the current block is actually requested.
    if (currentBlockUpto == blockSize) {
      nextBlock();
    }
    return currentBlock[currentBlockUpto++];
  }

  @Override
  public void readBytes(byte[] b, int offset, int len) {
    assert b.length >= offset + len;
    final int offsetEnd = offset + len;
    while (true) {
      final int blockLeft = blockSize - currentBlockUpto;
      final int left = offsetEnd - offset;
      if (blockLeft < left) {
        // Request runs past this block: drain it and continue
        // with the next one.
        System.arraycopy(currentBlock, currentBlockUpto,
                         b, offset,
                         blockLeft);
        nextBlock();
        offset += blockLeft;
      } else {
        // Last block
        System.arraycopy(currentBlock, currentBlockUpto,
                         b, offset,
                         left);
        currentBlockUpto += left;
        break;
      }
    }
  }

  /** Moves to the start of the following block. */
  private void nextBlock() {
    currentBlockIndex++;
    currentBlockUpto = 0;
    currentBlock = blocks.get(currentBlockIndex);
  }
}
/** A {@link DataOutput} that appends to the enclosing PagedBytes,
 *  starting a fresh block whenever the current one is full. */
public final class PagedBytesDataOutput extends DataOutput {

  @Override
  public void writeByte(byte b) {
    rollBlockIfFull();
    currentBlock[upto++] = b;
  }

  @Override
  public void writeBytes(byte[] b, int offset, int length) {
    assert b.length >= offset + length;
    if (length == 0) {
      return;
    }

    rollBlockIfFull();

    final int end = offset + length;
    int src = offset;
    for (;;) {
      final int remaining = end - src;
      final int room = blockSize - upto;
      if (room >= remaining) {
        // Everything left fits into the current block:
        System.arraycopy(b, src, currentBlock, upto, remaining);
        upto += remaining;
        return;
      }
      // Fill the current block completely, record it, and carry on
      // with a fresh one:
      System.arraycopy(b, src, currentBlock, upto, room);
      blocks.add(currentBlock);
      blockEnd.add(blockSize);
      currentBlock = new byte[blockSize];
      upto = 0;
      src += room;
    }
  }

  /** Return the current byte position. */
  public long getPosition() {
    return getPointer();
  }

  /** If the write offset has reached the block size, records the
   *  current block (when there is one) and allocates a new one. */
  private void rollBlockIfFull() {
    if (upto != blockSize) {
      return;
    }
    if (currentBlock != null) {
      blocks.add(currentBlock);
      blockEnd.add(upto);
    }
    currentBlock = new byte[blockSize];
    upto = 0;
  }
}
/** Returns a DataInput that reads values from this PagedBytes
 *  instance.
 *  @throws IllegalStateException if this instance is not frozen yet */
public PagedBytesDataInput getDataInput() {
  if (frozen) {
    return new PagedBytesDataInput();
  }
  throw new IllegalStateException("must call freeze() before getDataInput");
}
/** Returns a DataOutput for writing into this PagedBytes instance.
 *  Once you write through it, do not also use the other writing
 *  methods (eg, copy); results are undefined.
 *  @throws IllegalStateException if this instance is already frozen */
public PagedBytesDataOutput getDataOutput() {
  if (!frozen) {
    return new PagedBytesDataOutput();
  }
  throw new IllegalStateException("cannot get DataOutput after freeze()");
}
}

View File

@ -31,4 +31,10 @@ public class TestLucene40DocValuesFormat extends BaseDocValuesFormatTestCase {
return codec;
}
// LUCENE-4583: This codec should throw IllegalArgumentException on
// huge binary values (its writer rejects anything longer than
// Lucene40DocValuesFormat.MAX_BINARY_FIELD_LENGTH), so tell the
// base test case not to expect them to be accepted:
@Override
protected boolean codecAcceptsHugeBinaryValues(String field) {
return false;
}
}

View File

@ -30,4 +30,9 @@ public class TestLucene42DocValuesFormat extends BaseCompressingDocValuesFormatT
protected Codec getCodec() {
return codec;
}
// LUCENE-4583: This codec should throw IllegalArgumentException on
// huge binary values (its consumer rejects anything longer than
// Lucene42DocValuesFormat.MAX_BINARY_FIELD_LENGTH), so tell the
// base test case not to expect them to be accepted:
@Override
protected boolean codecAcceptsHugeBinaryValues(String field) {
return false;
}
}

View File

@ -46,6 +46,7 @@ import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util._TestUtil;
/**
* Basic tests of PerFieldDocValuesFormat
@ -64,6 +65,11 @@ public class TestPerFieldDocValuesFormat extends BaseDocValuesFormatTestCase {
return codec;
}
// LUCENE-4583: whether huge binary values are accepted depends on
// which doc values format is in effect for the given field, so
// defer to _TestUtil:
@Override
protected boolean codecAcceptsHugeBinaryValues(String field) {
return _TestUtil.fieldSupportsHugeBinaryDocValues(field);
}
// just a simple trivial test
// TODO: we should come up with a test that somehow checks that segment suffix
// is respected by all codec apis (not just docvalues and postings)

View File

@ -18,6 +18,7 @@ package org.apache.lucene.index;
*/
import org.apache.lucene.codecs.Codec;
import org.apache.lucene.util._TestUtil;
/** Tests the codec configuration defined by LuceneTestCase randomly
* (typically a mix across different fields).
@ -28,4 +29,9 @@ public class TestDocValuesFormat extends BaseDocValuesFormatTestCase {
protected Codec getCodec() {
return Codec.getDefault();
}
// LUCENE-4583: whether huge binary values are accepted depends on
// which doc values format is in effect for the given field, so
// defer to _TestUtil:
@Override
protected boolean codecAcceptsHugeBinaryValues(String field) {
return _TestUtil.fieldSupportsHugeBinaryDocValues(field);
}
}

View File

@ -327,30 +327,6 @@ public class TestDocValuesIndexing extends LuceneTestCase {
directory.close();
}
// Adds a single document with a 100000-byte binary doc values field
// and expects addDocument to fail with IllegalArgumentException.
public void testTooLargeBytes() throws IOException {
Analyzer analyzer = new MockAnalyzer(random());
Directory directory = newDirectory();
// We don't use RandomIndexWriter because it might add more
// docvalues than we expect.
IndexWriterConfig iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, analyzer);
iwc.setMergePolicy(newLogMergePolicy());
IndexWriter iwriter = new IndexWriter(directory, iwc);
Document doc = new Document();
byte bytes[] = new byte[100000];
BytesRef b = new BytesRef(bytes);
// b wraps the array, so filling it afterwards still changes the value:
random().nextBytes(bytes);
doc.add(new BinaryDocValuesField("dv", b));
try {
iwriter.addDocument(doc);
fail("did not get expected exception");
} catch (IllegalArgumentException expected) {
// expected: the oversized value was rejected
}
iwriter.close();
directory.close();
}
public void testTooLargeSortedBytes() throws IOException {
Analyzer analyzer = new MockAnalyzer(random());

View File

@ -22,6 +22,7 @@ import java.util.*;
import org.apache.lucene.store.BaseDirectoryWrapper;
import org.apache.lucene.store.DataInput;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.store.IndexOutput;
@ -30,6 +31,9 @@ import org.junit.Ignore;
public class TestPagedBytes extends LuceneTestCase {
// Writes random byte/s to "normal" file in dir, then
// copies into PagedBytes and verifies with
// PagedBytes.Reader:
public void testDataInputOutput() throws Exception {
Random random = random();
for(int iter=0;iter<5*RANDOM_MULTIPLIER;iter++) {
@ -90,6 +94,60 @@ public class TestPagedBytes extends LuceneTestCase {
}
}
// Writes random byte/s into PagedBytes via
// .getDataOutput(), then verifies with
// PagedBytes.getDataInput() and with random slices
// from the PagedBytes.Reader returned by freeze():
public void testDataInputOutput2() throws Exception {
  Random random = random();
  for(int iter=0;iter<5*RANDOM_MULTIPLIER;iter++) {
    final int blockBits = _TestUtil.nextInt(random, 1, 20);
    final int blockSize = 1 << blockBits;
    final PagedBytes p = new PagedBytes(blockBits);
    final DataOutput out = p.getDataOutput();
    // At least 2 bytes: the slice check below calls
    // random.nextInt(numBytes-1), which throws
    // IllegalArgumentException for a non-positive bound:
    final int numBytes = _TestUtil.nextInt(random, 2, 10000000);

    final byte[] answer = new byte[numBytes];
    random().nextBytes(answer);

    // Write everything, randomly mixing single-byte and bulk writes:
    int written = 0;
    while(written < numBytes) {
      if (random().nextInt(10) == 7) {
        out.writeByte(answer[written++]);
      } else {
        int chunk = Math.min(random().nextInt(1000), numBytes - written);
        out.writeBytes(answer, written, chunk);
        written += chunk;
      }
    }

    final PagedBytes.Reader reader = p.freeze(random.nextBoolean());

    final DataInput in = p.getDataInput();

    // Read everything back, again mixing single-byte and bulk reads:
    final byte[] verify = new byte[numBytes];
    int read = 0;
    while(read < numBytes) {
      if (random().nextInt(10) == 7) {
        verify[read++] = in.readByte();
      } else {
        int chunk = Math.min(random().nextInt(1000), numBytes - read);
        in.readBytes(verify, read, chunk);
        read += chunk;
      }
    }
    assertTrue(Arrays.equals(answer, verify));

    // Spot-check random slices; len stays within what fillSlice
    // asserts (at most blockSize+1) and within the written bytes:
    final BytesRef slice = new BytesRef();
    for(int iter2=0;iter2<100;iter2++) {
      final int pos = random.nextInt(numBytes-1);
      final int len = random.nextInt(Math.min(blockSize+1, numBytes - pos));
      reader.fillSlice(slice, pos, len);
      for(int byteUpto=0;byteUpto<len;byteUpto++) {
        assertEquals(answer[pos + byteUpto], slice.bytes[slice.offset + byteUpto]);
      }
    }
  }
}
@Ignore // memory hole
public void testOverflow() throws IOException {
BaseDirectoryWrapper dir = newFSDirectory(_TestUtil.getTempDir("testOverflow"));
@ -126,4 +184,5 @@ public class TestPagedBytes extends LuceneTestCase {
in.close();
dir.close();
}
}

View File

@ -22,13 +22,16 @@ import java.io.IOException;
import java.io.PrintStream;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.facet.FacetTestCase;
import org.apache.lucene.facet.FacetTestUtils;
import org.apache.lucene.facet.codecs.facet42.Facet42Codec;
import org.apache.lucene.facet.index.FacetFields;
import org.apache.lucene.facet.params.CategoryListParams;
import org.apache.lucene.facet.params.FacetIndexingParams;
@ -48,6 +51,8 @@ import org.apache.lucene.search.similarities.PerFieldSimilarityWrapper;
import org.apache.lucene.search.similarities.Similarity;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util._TestUtil;
public class TestDemoFacets extends FacetTestCase {
@ -248,4 +253,60 @@ public class TestDemoFacets extends FacetTestCase {
dir.close();
taxoDir.close();
}
// LUCENE-4583: make sure that if one document requires
// > 32 KB of binary doc values, we don't hit an exception
// when using Facet42Codec.  Indexes a single document with
// 40000-100000 categories under "dim" and checks that every
// label comes back exactly once with a count of 1.
public void testManyFacetsInOneDocument() throws Exception {
Directory dir = newDirectory();
Directory taxoDir = newDirectory();
IndexWriterConfig iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random()));
iwc.setCodec(new Facet42Codec());
RandomIndexWriter writer = new RandomIndexWriter(random(), dir, iwc);
DirectoryTaxonomyWriter taxoWriter = new DirectoryTaxonomyWriter(taxoDir, IndexWriterConfig.OpenMode.CREATE);
FacetFields facetFields = new FacetFields(taxoWriter);
int numLabels = _TestUtil.nextInt(random(), 40000, 100000);
Document doc = new Document();
doc.add(newTextField("field", "text", Field.Store.NO));
// One distinct category per label, all under the "dim" dimension:
List<CategoryPath> paths = new ArrayList<CategoryPath>();
for(int i=0;i<numLabels;i++) {
paths.add(new CategoryPath("dim", "" + i));
}
facetFields.addFields(doc, paths);
writer.addDocument(doc);
// NRT open of the index reader, before closing the writer:
IndexSearcher searcher = newSearcher(writer.getReader());
writer.close();
// NRT open of the taxonomy reader, before closing the taxonomy writer:
TaxonomyReader taxoReader = new DirectoryTaxonomyReader(taxoWriter);
taxoWriter.close();
// Request all children of "dim" (no top-N cutoff):
FacetSearchParams fsp = new FacetSearchParams(new CountFacetRequest(new CategoryPath("dim"), Integer.MAX_VALUE));
// Aggregate the facet counts:
FacetsCollector c = FacetsCollector.create(fsp, searcher.getIndexReader(), taxoReader);
// MatchAllDocsQuery is for "browsing" (counts facets
// for all non-deleted docs in the index); normally
// you'd use a "normal" query, and use MultiCollector to
// wrap collecting the "normal" hits and also facets:
searcher.search(new MatchAllDocsQuery(), c);
List<FacetResult> results = c.getFacetResults();
assertEquals(1, results.size());
FacetResultNode root = results.get(0).getFacetResultNode();
assertEquals(numLabels, root.subResults.size());
// Collect the labels into a set so duplicates would show up as a
// size mismatch below:
Set<String> allLabels = new HashSet<String>();
for(FacetResultNode childNode : root.subResults) {
assertEquals(2, childNode.label.length);
allLabels.add(childNode.label.components[1]);
assertEquals(1, (int) childNode.value);
}
assertEquals(numLabels, allLabels.size());
IOUtils.close(searcher.getIndexReader(), taxoReader, dir, taxoDir);
}
}

View File

@ -156,6 +156,9 @@ class Lucene40DocValuesWriter extends DocValuesConsumer {
int minLength = Integer.MAX_VALUE;
int maxLength = Integer.MIN_VALUE;
for (BytesRef b : values) {
if (b.length > Lucene40DocValuesFormat.MAX_BINARY_FIELD_LENGTH) {
throw new IllegalArgumentException("DocValuesField \"" + field.name + "\" is too large, must be <= " + Lucene40DocValuesFormat.MAX_BINARY_FIELD_LENGTH);
}
minLength = Math.min(minLength, b.length);
maxLength = Math.max(maxLength, b.length);
if (uniqueValues != null) {

View File

@ -17,21 +17,20 @@ package org.apache.lucene.index;
* limitations under the License.
*/
import static org.apache.lucene.index.SortedSetDocValues.NO_MORE_ORDS;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Map;
import java.util.Set;
import java.util.TreeSet;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.codecs.Codec;
import org.apache.lucene.codecs.lucene42.Lucene42DocValuesFormat;
import org.apache.lucene.document.BinaryDocValuesField;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
@ -57,6 +56,8 @@ import org.apache.lucene.util.BytesRefHash;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util._TestUtil;
import static org.apache.lucene.index.SortedSetDocValues.NO_MORE_ORDS;
/**
* Abstract class to do basic tests for a docvalues format.
* NOTE: This test focuses on the docvalues impl, nothing else.
@ -2198,4 +2199,172 @@ public abstract class BaseDocValuesFormatTestCase extends LuceneTestCase {
}
}
// LUCENE-4583: indexes binary doc values that can be far larger
// than 32 KB.  If the codec rejects them with a "is too large"
// IllegalArgumentException, codecAcceptsHugeBinaryValues must be
// false; otherwise the values must read back intact and
// codecAcceptsHugeBinaryValues must be true.
public void testHugeBinaryValues() throws Exception {
Analyzer analyzer = new MockAnalyzer(random());
// FSDirectory because SimpleText will consume gobs of
// space when storing big binary values:
Directory d = newFSDirectory(_TestUtil.getTempDir("hugeBinaryValues"));
boolean doFixed = random().nextBoolean();
int numDocs;
int fixedLength = 0;
if (doFixed) {
// Sometimes make all values fixed length since some
// codecs have different code paths for this:
numDocs = _TestUtil.nextInt(random(), 10, 20);
fixedLength = _TestUtil.nextInt(random(), 65537, 256*1024);
} else {
numDocs = _TestUtil.nextInt(random(), 100, 200);
}
// We don't use RandomIndexWriter because it might add
// more docvalues than we expect:
IndexWriter w = new IndexWriter(d, newIndexWriterConfig(TEST_VERSION_CURRENT, analyzer));
List<byte[]> docBytes = new ArrayList<byte[]>();
long totalBytes = 0;
for(int docID=0;docID<numDocs;docID++) {
// The first document (and all of them in the fixed-length
// case) must be > 64KB in size to ensure more than 2 pages
// in PagedBytes would be needed:
int numBytes;
if (doFixed) {
numBytes = fixedLength;
} else if (docID == 0 || random().nextInt(5) == 3) {
numBytes = _TestUtil.nextInt(random(), 65537, 3*1024*1024);
} else {
numBytes = _TestUtil.nextInt(random(), 1, 1024*1024);
}
totalBytes += numBytes;
// Stop adding documents once the total would exceed 5 MB:
if (totalBytes > 5 * 1024*1024) {
break;
}
byte[] bytes = new byte[numBytes];
random().nextBytes(bytes);
docBytes.add(bytes);
Document doc = new Document();
BytesRef b = new BytesRef(bytes);
b.length = bytes.length;
doc.add(new BinaryDocValuesField("field", b));
// Stored id lets the verification below map a document back to
// its expected bytes:
doc.add(new StringField("id", ""+docID, Field.Store.YES));
try {
w.addDocument(doc);
} catch (IllegalArgumentException iae) {
if (iae.getMessage().indexOf("is too large") == -1) {
throw iae;
} else {
// OK: some codecs can't handle binary DV > 32K
assertFalse(codecAcceptsHugeBinaryValues("field"));
w.rollback();
d.close();
return;
}
}
}
DirectoryReader r;
try {
r = w.getReader();
} catch (IllegalArgumentException iae) {
if (iae.getMessage().indexOf("is too large") == -1) {
throw iae;
} else {
assertFalse(codecAcceptsHugeBinaryValues("field"));
// OK: some codecs can't handle binary DV > 32K
w.rollback();
d.close();
return;
}
}
w.close();
AtomicReader ar = SlowCompositeReaderWrapper.wrap(r);
BinaryDocValues s = FieldCache.DEFAULT.getTerms(ar, "field");
for(int docID=0;docID<docBytes.size();docID++) {
StoredDocument doc = ar.document(docID);
BytesRef bytes = new BytesRef();
s.get(docID, bytes);
byte[] expected = docBytes.get(Integer.parseInt(doc.get("id")));
assertEquals(expected.length, bytes.length);
assertEquals(new BytesRef(expected), bytes);
}
// Everything was indexed and read back, so the codec must be one
// that claims to accept huge values:
assertTrue(codecAcceptsHugeBinaryValues("field"));
ar.close();
d.close();
}
// LUCENE-4583: for codecs that do limit the binary field length,
// values of up to Lucene42DocValuesFormat.MAX_BINARY_FIELD_LENGTH
// bytes must still be indexed and read back intact.
public void testHugeBinaryValueLimit() throws Exception {
// We only test DVFormats that have a limit
assumeFalse("test requires codec with limits on max binary field length", codecAcceptsHugeBinaryValues("field"));
Analyzer analyzer = new MockAnalyzer(random());
// FSDirectory because SimpleText will consume gobs of
// space when storing big binary values:
Directory d = newFSDirectory(_TestUtil.getTempDir("hugeBinaryValues"));
boolean doFixed = random().nextBoolean();
int numDocs;
int fixedLength = 0;
if (doFixed) {
// Sometimes make all values fixed length since some
// codecs have different code paths for this:
numDocs = _TestUtil.nextInt(random(), 10, 20);
fixedLength = Lucene42DocValuesFormat.MAX_BINARY_FIELD_LENGTH;
} else {
numDocs = _TestUtil.nextInt(random(), 100, 200);
}
// We don't use RandomIndexWriter because it might add
// more docvalues than we expect:
IndexWriter w = new IndexWriter(d, newIndexWriterConfig(TEST_VERSION_CURRENT, analyzer));
List<byte[]> docBytes = new ArrayList<byte[]>();
long totalBytes = 0;
for(int docID=0;docID<numDocs;docID++) {
// Values never exceed MAX_BINARY_FIELD_LENGTH; the first
// document (and all of them in the fixed-length case) is
// exactly at the limit:
int numBytes;
if (doFixed) {
numBytes = fixedLength;
} else if (docID == 0 || random().nextInt(5) == 3) {
numBytes = Lucene42DocValuesFormat.MAX_BINARY_FIELD_LENGTH;
} else {
numBytes = _TestUtil.nextInt(random(), 1, Lucene42DocValuesFormat.MAX_BINARY_FIELD_LENGTH);
}
totalBytes += numBytes;
// Stop adding documents once the total would exceed 5 MB:
if (totalBytes > 5 * 1024*1024) {
break;
}
byte[] bytes = new byte[numBytes];
random().nextBytes(bytes);
docBytes.add(bytes);
Document doc = new Document();
BytesRef b = new BytesRef(bytes);
b.length = bytes.length;
doc.add(new BinaryDocValuesField("field", b));
// Stored id lets the verification below map a document back to
// its expected bytes:
doc.add(new StringField("id", ""+docID, Field.Store.YES));
// No exception is expected here: every value is within the limit.
w.addDocument(doc);
}
DirectoryReader r = w.getReader();
w.close();
AtomicReader ar = SlowCompositeReaderWrapper.wrap(r);
BinaryDocValues s = FieldCache.DEFAULT.getTerms(ar, "field");
for(int docID=0;docID<docBytes.size();docID++) {
StoredDocument doc = ar.document(docID);
BytesRef bytes = new BytesRef();
s.get(docID, bytes);
byte[] expected = docBytes.get(Integer.parseInt(doc.get("id")));
assertEquals(expected.length, bytes.length);
assertEquals(new BytesRef(expected), bytes);
}
ar.close();
d.close();
}
/**
 * Whether the codec under test is expected to accept huge binary doc
 * values for the given field. testHugeBinaryValues checks the observed
 * behavior against this, and testHugeBinaryValueLimit only runs when it
 * returns false. Defaults to true; tests for codecs that enforce a
 * limit override it.
 */
protected boolean codecAcceptsHugeBinaryValues(String field) {
return true;
}
}

View File

@ -45,6 +45,7 @@ import org.apache.lucene.codecs.Codec;
import org.apache.lucene.codecs.DocValuesFormat;
import org.apache.lucene.codecs.PostingsFormat;
import org.apache.lucene.codecs.lucene42.Lucene42Codec;
import org.apache.lucene.codecs.perfield.PerFieldDocValuesFormat;
import org.apache.lucene.codecs.perfield.PerFieldPostingsFormat;
import org.apache.lucene.document.BinaryDocValuesField;
import org.apache.lucene.document.Document;
@ -58,12 +59,12 @@ import org.apache.lucene.document.NumericDocValuesField;
import org.apache.lucene.document.SortedDocValuesField;
import org.apache.lucene.index.AtomicReader;
import org.apache.lucene.index.AtomicReaderContext;
import org.apache.lucene.index.CheckIndex;
import org.apache.lucene.index.CheckIndex.Status.DocValuesStatus;
import org.apache.lucene.index.CheckIndex.Status.FieldNormStatus;
import org.apache.lucene.index.CheckIndex.Status.StoredFieldStatus;
import org.apache.lucene.index.CheckIndex.Status.TermIndexStatus;
import org.apache.lucene.index.CheckIndex.Status.TermVectorStatus;
import org.apache.lucene.index.CheckIndex;
import org.apache.lucene.index.ConcurrentMergeScheduler;
import org.apache.lucene.index.DocsAndPositionsEnum;
import org.apache.lucene.index.DocsEnum;
@ -742,6 +743,25 @@ public class _TestUtil {
return p.getName();
}
}
/** Returns the name of the doc values format that the default codec
 *  uses for the given field. */
public static String getDocValuesFormat(String field) {
  final Codec defaultCodec = Codec.getDefault();
  return getDocValuesFormat(defaultCodec, field);
}
/** Returns the name of the doc values format that the given codec
 *  uses for the given field, resolving a per-field format to the
 *  format it selects for that field. */
public static String getDocValuesFormat(Codec codec, String field) {
  final DocValuesFormat format = codec.docValuesFormat();
  final DocValuesFormat effective = (format instanceof PerFieldDocValuesFormat)
      ? ((PerFieldDocValuesFormat) format).getDocValuesFormatForField(field)
      : format;
  return effective.getName();
}
/** Returns true if the doc values format that the default codec uses
 *  for the given field is one of the formats named here as accepting
 *  huge binary doc values. */
public static boolean fieldSupportsHugeBinaryDocValues(String field) {
  final String formatName = getDocValuesFormat(field);
  if (formatName.equals("CheapBastard")) {
    return true;
  }
  if (formatName.equals("Disk")) {
    return true;
  }
  return formatName.equals("SimpleText");
}
public static boolean anyFilesExceptWriteLock(Directory dir) throws IOException {
String[] files = dir.listAll();