diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 87a8d963634..c49e1fd5ebb 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -122,6 +122,11 @@ Bug Fixes boundary, made it into the top-N and went to the formatter. (Manuel Amoabeng, Michael McCandless, Robert Muir) +* LUCENE-4583: Indexing core no longer enforces a limit on maximum + length binary doc values fields, but individual codecs (including + the default one) have their own limits (David Smiley, Robert Muir, + Mike McCandless) + API Changes * LUCENE-5094: Add ramBytesUsed() to MultiDocValues.OrdinalMap. diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40DocValuesFormat.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40DocValuesFormat.java index 6cbf7f8727e..4f350efc320 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40DocValuesFormat.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene40/Lucene40DocValuesFormat.java @@ -118,6 +118,11 @@ import org.apache.lucene.util.packed.PackedInts; * {@code BYTES_VAR_DEREF BYTES_VAR_DEREF} it doesn't apply deduplication of the document values. * * + *

+ * Limitations:
+ * <ul>
+ *   <li> Binary doc values can be at most {@link #MAX_BINARY_FIELD_LENGTH} in length
+ * </ul>
* @deprecated Only for reading old 4.0 and 4.1 segments */ @Deprecated @@ -125,6 +130,9 @@ import org.apache.lucene.util.packed.PackedInts; // for back compat only! public class Lucene40DocValuesFormat extends DocValuesFormat { + /** Maximum length for each binary doc values field. */ + public static final int MAX_BINARY_FIELD_LENGTH = (1 << 15) - 2; + /** Sole constructor. */ public Lucene40DocValuesFormat() { super("Lucene40"); diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene42/Lucene42DocValuesConsumer.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene42/Lucene42DocValuesConsumer.java index edd3bbd240a..0a2f92f22a8 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene42/Lucene42DocValuesConsumer.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene42/Lucene42DocValuesConsumer.java @@ -37,14 +37,14 @@ import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.IntsRef; import org.apache.lucene.util.MathUtil; import org.apache.lucene.util.fst.Builder; -import org.apache.lucene.util.fst.FST; import org.apache.lucene.util.fst.FST.INPUT_TYPE; +import org.apache.lucene.util.fst.FST; import org.apache.lucene.util.fst.PositiveIntOutputs; import org.apache.lucene.util.fst.Util; import org.apache.lucene.util.packed.BlockPackedWriter; import org.apache.lucene.util.packed.MonotonicBlockPackedWriter; -import org.apache.lucene.util.packed.PackedInts; import org.apache.lucene.util.packed.PackedInts.FormatAndBits; +import org.apache.lucene.util.packed.PackedInts; /** * Writer for {@link Lucene42DocValuesFormat} @@ -220,6 +220,9 @@ class Lucene42DocValuesConsumer extends DocValuesConsumer { final long startFP = data.getFilePointer(); for(BytesRef v : values) { final int length = v == null ? 0 : v.length; + if (length > Lucene42DocValuesFormat.MAX_BINARY_FIELD_LENGTH) { + throw new IllegalArgumentException("DocValuesField \"" + field.name + "\" is too large, must be <= " + Lucene42DocValuesFormat.MAX_BINARY_FIELD_LENGTH); + } minLength = Math.min(minLength, length); maxLength = Math.max(maxLength, length); if (v != null) { diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene42/Lucene42DocValuesFormat.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene42/Lucene42DocValuesFormat.java index 3e81004493d..55bf8097561 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene42/Lucene42DocValuesFormat.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene42/Lucene42DocValuesFormat.java @@ -118,8 +118,17 @@ import org.apache.lucene.util.packed.BlockPackedWriter; *

SortedSet entries store the list of ordinals in their BinaryData as a * sequences of increasing {@link DataOutput#writeVLong vLong}s, delta-encoded.

 *
+ * <p>
+ * Limitations:
+ * <ul>
+ *   <li> Binary doc values can be at most {@link #MAX_BINARY_FIELD_LENGTH} in length
+ * </ul>
*/ public final class Lucene42DocValuesFormat extends DocValuesFormat { + + /** Maximum length for each binary doc values field. */ + public static final int MAX_BINARY_FIELD_LENGTH = (1 << 15) - 2; + final float acceptableOverheadRatio; /** diff --git a/lucene/core/src/java/org/apache/lucene/index/BinaryDocValuesWriter.java b/lucene/core/src/java/org/apache/lucene/index/BinaryDocValuesWriter.java index 553f9ff1ff7..3a3e301a9b2 100644 --- a/lucene/core/src/java/org/apache/lucene/index/BinaryDocValuesWriter.java +++ b/lucene/core/src/java/org/apache/lucene/index/BinaryDocValuesWriter.java @@ -22,33 +22,44 @@ import java.util.Iterator; import java.util.NoSuchElementException; import org.apache.lucene.codecs.DocValuesConsumer; -import org.apache.lucene.util.ByteBlockPool.DirectTrackingAllocator; -import org.apache.lucene.util.ByteBlockPool; +import org.apache.lucene.store.DataInput; +import org.apache.lucene.store.DataOutput; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.Counter; import org.apache.lucene.util.OpenBitSet; import org.apache.lucene.util.RamUsageEstimator; +import org.apache.lucene.util.PagedBytes; import org.apache.lucene.util.packed.AppendingDeltaPackedLongBuffer; import org.apache.lucene.util.packed.PackedInts; -import static org.apache.lucene.util.ByteBlockPool.BYTE_BLOCK_SIZE; - - /** Buffers up pending byte[] per doc, then flushes when * segment flushes. */ class BinaryDocValuesWriter extends DocValuesWriter { - private final ByteBlockPool pool; + /** Maximum length for a binary field; we set this to "a + * bit" below Integer.MAX_VALUE because the exact max + * allowed byte[] is JVM dependent, so we want to avoid + * a case where a large value worked in one JVM but + * failed later at search time with a different JVM. 
*/ + private static final int MAX_LENGTH = Integer.MAX_VALUE-256; + + // 32 KB block sizes for PagedBytes storage: + private final static int BLOCK_BITS = 15; + + private final PagedBytes bytes; + private final DataOutput bytesOut; + + private final Counter iwBytesUsed; private final AppendingDeltaPackedLongBuffer lengths; private final OpenBitSet docsWithField; - private final Counter iwBytesUsed; - private long bytesUsed; private final FieldInfo fieldInfo; - private int addedValues = 0; + private int addedValues; + private long bytesUsed; public BinaryDocValuesWriter(FieldInfo fieldInfo, Counter iwBytesUsed) { this.fieldInfo = fieldInfo; - this.pool = new ByteBlockPool(new DirectTrackingAllocator(iwBytesUsed)); + this.bytes = new PagedBytes(BLOCK_BITS); + this.bytesOut = bytes.getDataOutput(); this.lengths = new AppendingDeltaPackedLongBuffer(PackedInts.COMPACT); this.iwBytesUsed = iwBytesUsed; this.docsWithField = new OpenBitSet(); @@ -63,10 +74,10 @@ class BinaryDocValuesWriter extends DocValuesWriter { if (value == null) { throw new IllegalArgumentException("field=\"" + fieldInfo.name + "\": null value not allowed"); } - if (value.length > (BYTE_BLOCK_SIZE - 2)) { - throw new IllegalArgumentException("DocValuesField \"" + fieldInfo.name + "\" is too large, must be <= " + (BYTE_BLOCK_SIZE - 2)); + if (value.length > MAX_LENGTH) { + throw new IllegalArgumentException("DocValuesField \"" + fieldInfo.name + "\" is too large, must be <= " + MAX_LENGTH); } - + // Fill in any holes: while(addedValues < docID) { addedValues++; @@ -74,7 +85,12 @@ class BinaryDocValuesWriter extends DocValuesWriter { } addedValues++; lengths.add(value.length); - pool.append(value); + try { + bytesOut.writeBytes(value.bytes, value.offset, value.length); + } catch (IOException ioe) { + // Should never happen! + throw new RuntimeException(ioe); + } docsWithField.set(docID); updateBytesUsed(); } @@ -83,9 +99,9 @@ class BinaryDocValuesWriter extends DocValuesWriter { // nocommit: this is not correct return docsWithField.getBits().length*RamUsageEstimator.NUM_BYTES_LONG; } - + private void updateBytesUsed() { - final long newBytesUsed = docsWithFieldBytesUsed(); + final long newBytesUsed = lengths.ramBytesUsed() + bytes.ramBytesUsed() + docsWithFieldBytesUsed(); iwBytesUsed.addAndGet(newBytesUsed - bytesUsed); bytesUsed = newBytesUsed; } @@ -97,6 +113,7 @@ class BinaryDocValuesWriter extends DocValuesWriter { @Override public void flush(SegmentWriteState state, DocValuesConsumer dvConsumer) throws IOException { final int maxDoc = state.segmentInfo.getDocCount(); + bytes.freeze(false); dvConsumer.addBinaryField(fieldInfo, new Iterable() { @Override @@ -114,10 +131,10 @@ class BinaryDocValuesWriter extends DocValuesWriter { private class BytesIterator implements Iterator { final BytesRef value = new BytesRef(); final AppendingDeltaPackedLongBuffer.Iterator lengthsIterator = lengths.iterator(); + final DataInput bytesIterator = bytes.getDataInput(); final int size = (int) lengths.size(); final int maxDoc; int upto; - long byteOffset; BytesIterator(int maxDoc) { this.maxDoc = maxDoc; @@ -138,8 +155,12 @@ class BinaryDocValuesWriter extends DocValuesWriter { int length = (int) lengthsIterator.next(); value.grow(length); value.length = length; - pool.readBytes(byteOffset, value.bytes, value.offset, value.length); - byteOffset += length; + try { + bytesIterator.readBytes(value.bytes, value.offset, value.length); + } catch (IOException ioe) { + // Should never happen! 
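+ // (PagedBytesDataInput reads from in-memory blocks, so no real I/O happens here;
+ // the catch exists only because DataInput#readBytes declares IOException)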
+ throw new RuntimeException(ioe); + } if (docsWithField.get(upto)) { v = value; } else { diff --git a/lucene/core/src/java/org/apache/lucene/index/FieldInfo.java b/lucene/core/src/java/org/apache/lucene/index/FieldInfo.java index 3cba54ea52f..57b06e8c817 100644 --- a/lucene/core/src/java/org/apache/lucene/index/FieldInfo.java +++ b/lucene/core/src/java/org/apache/lucene/index/FieldInfo.java @@ -92,21 +92,22 @@ public final class FieldInfo { */ NUMERIC, /** - * A per-document byte[]. + * A per-document byte[]. Values may be larger than + * 32766 bytes, but different codecs may enforce their own limits. */ BINARY, /** * A pre-sorted byte[]. Fields with this type only store distinct byte values * and store an additional offset pointer per document to dereference the shared * byte[]. The stored byte[] is presorted and allows access via document id, - * ordinal and by-value. + * ordinal and by-value. Values must be <= 32766 bytes. */ SORTED, /** * A pre-sorted Set<byte[]>. Fields with this type only store distinct byte values * and store additional offset pointers per document to dereference the shared * byte[]s. The stored byte[] is presorted and allows access via document id, - * ordinal and by-value. + * ordinal and by-value. Values must be <= 32766 bytes. */ SORTED_SET }; diff --git a/lucene/core/src/java/org/apache/lucene/util/PagedBytes.java b/lucene/core/src/java/org/apache/lucene/util/PagedBytes.java index 488d9d2b36f..429d274dad5 100644 --- a/lucene/core/src/java/org/apache/lucene/util/PagedBytes.java +++ b/lucene/core/src/java/org/apache/lucene/util/PagedBytes.java @@ -21,6 +21,8 @@ import java.io.IOException; import java.util.ArrayList; import java.util.List; +import org.apache.lucene.store.DataInput; +import org.apache.lucene.store.DataOutput; import org.apache.lucene.store.IndexInput; /** Represents a logical byte[] as a series of pages. You @@ -34,6 +36,7 @@ import org.apache.lucene.store.IndexInput; // other "shift/mask big arrays". there are too many of these classes! public final class PagedBytes { private final List blocks = new ArrayList(); + // TODO: these are unused? private final List blockEnd = new ArrayList(); private final int blockSize; private final int blockBits; @@ -42,6 +45,7 @@ public final class PagedBytes { private boolean frozen; private int upto; private byte[] currentBlock; + private final long bytesUsedPerBlock; private static final byte[] EMPTY_BYTES = new byte[0]; @@ -75,13 +79,13 @@ public final class PagedBytes { * given length. Iff the slice spans across a block border this method will * allocate sufficient resources and copy the paged data. *

- * Slices spanning more than one block are not supported.
+ * Slices spanning more than two blocks are not supported.
 * </p>
* @lucene.internal **/ public void fillSlice(BytesRef b, long start, int length) { assert length >= 0: "length=" + length; - assert length <= blockSize+1; + assert length <= blockSize+1: "length=" + length; final int index = (int) (start >> blockBits); final int offset = (int) (start & blockMask); b.length = length; @@ -132,6 +136,7 @@ public final class PagedBytes { this.blockBits = blockBits; blockMask = blockSize-1; upto = blockSize; + bytesUsedPerBlock = blockSize + RamUsageEstimator.NUM_BYTES_ARRAY_HEADER + RamUsageEstimator.NUM_BYTES_OBJECT_REF; } /** Read this many bytes from in */ @@ -216,6 +221,11 @@ public final class PagedBytes { } } + /** Return approx RAM usage in bytes. */ + public long ramBytesUsed() { + return (blocks.size() + (currentBlock != null ? 1 : 0)) * bytesUsedPerBlock; + } + /** Copy bytes in, writing the length as a 1 or 2 byte * vInt prefix. */ // TODO: this really needs to be refactored into fieldcacheimpl! @@ -249,4 +259,148 @@ public final class PagedBytes { return pointer; } + + public final class PagedBytesDataInput extends DataInput { + private int currentBlockIndex; + private int currentBlockUpto; + private byte[] currentBlock; + + PagedBytesDataInput() { + currentBlock = blocks.get(0); + } + + @Override + public PagedBytesDataInput clone() { + PagedBytesDataInput clone = getDataInput(); + clone.setPosition(getPosition()); + return clone; + } + + /** Returns the current byte position. */ + public long getPosition() { + return (long) currentBlockIndex * blockSize + currentBlockUpto; + } + + /** Seek to a position previously obtained from + * {@link #getPosition}. */ + public void setPosition(long pos) { + currentBlockIndex = (int) (pos >> blockBits); + currentBlock = blocks.get(currentBlockIndex); + currentBlockUpto = (int) (pos & blockMask); + } + + @Override + public byte readByte() { + if (currentBlockUpto == blockSize) { + nextBlock(); + } + return currentBlock[currentBlockUpto++]; + } + + @Override + public void readBytes(byte[] b, int offset, int len) { + assert b.length >= offset + len; + final int offsetEnd = offset + len; + while (true) { + final int blockLeft = blockSize - currentBlockUpto; + final int left = offsetEnd - offset; + if (blockLeft < left) { + System.arraycopy(currentBlock, currentBlockUpto, + b, offset, + blockLeft); + nextBlock(); + offset += blockLeft; + } else { + // Last block + System.arraycopy(currentBlock, currentBlockUpto, + b, offset, + left); + currentBlockUpto += left; + break; + } + } + } + + private void nextBlock() { + currentBlockIndex++; + currentBlockUpto = 0; + currentBlock = blocks.get(currentBlockIndex); + } + } + + public final class PagedBytesDataOutput extends DataOutput { + @Override + public void writeByte(byte b) { + if (upto == blockSize) { + if (currentBlock != null) { + blocks.add(currentBlock); + blockEnd.add(upto); + } + currentBlock = new byte[blockSize]; + upto = 0; + } + currentBlock[upto++] = b; + } + + @Override + public void writeBytes(byte[] b, int offset, int length) { + assert b.length >= offset + length; + if (length == 0) { + return; + } + + if (upto == blockSize) { + if (currentBlock != null) { + blocks.add(currentBlock); + blockEnd.add(upto); + } + currentBlock = new byte[blockSize]; + upto = 0; + } + + final int offsetEnd = offset + length; + while(true) { + final int left = offsetEnd - offset; + final int blockLeft = blockSize - upto; + if (blockLeft < left) { + System.arraycopy(b, offset, currentBlock, upto, blockLeft); + blocks.add(currentBlock); + blockEnd.add(blockSize); + 
currentBlock = new byte[blockSize]; + upto = 0; + offset += blockLeft; + } else { + // Last block + System.arraycopy(b, offset, currentBlock, upto, left); + upto += left; + break; + } + } + } + + /** Return the current byte position. */ + public long getPosition() { + return getPointer(); + } + } + + /** Returns a DataInput to read values from this + * PagedBytes instance. */ + public PagedBytesDataInput getDataInput() { + if (!frozen) { + throw new IllegalStateException("must call freeze() before getDataInput"); + } + return new PagedBytesDataInput(); + } + + /** Returns a DataOutput that you may use to write into + * this PagedBytes instance. If you do this, you should + * not call the other writing methods (eg, copy); + * results are undefined. */ + public PagedBytesDataOutput getDataOutput() { + if (frozen) { + throw new IllegalStateException("cannot get DataOutput after freeze()"); + } + return new PagedBytesDataOutput(); + } } diff --git a/lucene/core/src/test/org/apache/lucene/codecs/lucene40/TestLucene40DocValuesFormat.java b/lucene/core/src/test/org/apache/lucene/codecs/lucene40/TestLucene40DocValuesFormat.java index f3242c9ebcc..cab6db2ffb2 100644 --- a/lucene/core/src/test/org/apache/lucene/codecs/lucene40/TestLucene40DocValuesFormat.java +++ b/lucene/core/src/test/org/apache/lucene/codecs/lucene40/TestLucene40DocValuesFormat.java @@ -30,5 +30,11 @@ public class TestLucene40DocValuesFormat extends BaseDocValuesFormatTestCase { protected Codec getCodec() { return codec; } - + + // LUCENE-4583: This codec should throw IAE on huge binary values: + @Override + protected boolean codecAcceptsHugeBinaryValues(String field) { + return false; + } + } diff --git a/lucene/core/src/test/org/apache/lucene/codecs/lucene42/TestLucene42DocValuesFormat.java b/lucene/core/src/test/org/apache/lucene/codecs/lucene42/TestLucene42DocValuesFormat.java index 27be9f25a68..d86002eb0af 100644 --- a/lucene/core/src/test/org/apache/lucene/codecs/lucene42/TestLucene42DocValuesFormat.java +++ b/lucene/core/src/test/org/apache/lucene/codecs/lucene42/TestLucene42DocValuesFormat.java @@ -30,4 +30,9 @@ public class TestLucene42DocValuesFormat extends BaseCompressingDocValuesFormatT protected Codec getCodec() { return codec; } + + @Override + protected boolean codecAcceptsHugeBinaryValues(String field) { + return false; + } } diff --git a/lucene/core/src/test/org/apache/lucene/codecs/perfield/TestPerFieldDocValuesFormat.java b/lucene/core/src/test/org/apache/lucene/codecs/perfield/TestPerFieldDocValuesFormat.java index e37ab87a687..06795f005c6 100644 --- a/lucene/core/src/test/org/apache/lucene/codecs/perfield/TestPerFieldDocValuesFormat.java +++ b/lucene/core/src/test/org/apache/lucene/codecs/perfield/TestPerFieldDocValuesFormat.java @@ -46,6 +46,7 @@ import org.apache.lucene.search.TermQuery; import org.apache.lucene.search.TopDocs; import org.apache.lucene.store.Directory; import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util._TestUtil; /** * Basic tests of PerFieldDocValuesFormat @@ -63,6 +64,11 @@ public class TestPerFieldDocValuesFormat extends BaseDocValuesFormatTestCase { protected Codec getCodec() { return codec; } + + @Override + protected boolean codecAcceptsHugeBinaryValues(String field) { + return _TestUtil.fieldSupportsHugeBinaryDocValues(field); + } // just a simple trivial test // TODO: we should come up with a test that somehow checks that segment suffix diff --git a/lucene/core/src/test/org/apache/lucene/index/TestDocValuesFormat.java 
b/lucene/core/src/test/org/apache/lucene/index/TestDocValuesFormat.java index 318fb9035c5..6bc3554bc03 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestDocValuesFormat.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestDocValuesFormat.java @@ -18,6 +18,7 @@ package org.apache.lucene.index; */ import org.apache.lucene.codecs.Codec; +import org.apache.lucene.util._TestUtil; /** Tests the codec configuration defined by LuceneTestCase randomly * (typically a mix across different fields). @@ -28,4 +29,9 @@ public class TestDocValuesFormat extends BaseDocValuesFormatTestCase { protected Codec getCodec() { return Codec.getDefault(); } + + @Override + protected boolean codecAcceptsHugeBinaryValues(String field) { + return _TestUtil.fieldSupportsHugeBinaryDocValues(field); + } } diff --git a/lucene/core/src/test/org/apache/lucene/index/TestDocValuesIndexing.java b/lucene/core/src/test/org/apache/lucene/index/TestDocValuesIndexing.java index a45818256c7..80019a14f70 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestDocValuesIndexing.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestDocValuesIndexing.java @@ -326,31 +326,7 @@ public class TestDocValuesIndexing extends LuceneTestCase { iwriter.close(); directory.close(); } - - public void testTooLargeBytes() throws IOException { - Analyzer analyzer = new MockAnalyzer(random()); - Directory directory = newDirectory(); - // we don't use RandomIndexWriter because it might add more docvalues than we expect !!!!1 - IndexWriterConfig iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, analyzer); - iwc.setMergePolicy(newLogMergePolicy()); - IndexWriter iwriter = new IndexWriter(directory, iwc); - Document doc = new Document(); - byte bytes[] = new byte[100000]; - BytesRef b = new BytesRef(bytes); - random().nextBytes(bytes); - doc.add(new BinaryDocValuesField("dv", b)); - try { - iwriter.addDocument(doc); - fail("did not get expected exception"); - } catch (IllegalArgumentException expected) { - // expected - } - iwriter.close(); - - directory.close(); - } - public void testTooLargeSortedBytes() throws IOException { Analyzer analyzer = new MockAnalyzer(random()); diff --git a/lucene/core/src/test/org/apache/lucene/util/TestPagedBytes.java b/lucene/core/src/test/org/apache/lucene/util/TestPagedBytes.java index 3d4dacdb8d9..801f2467aed 100644 --- a/lucene/core/src/test/org/apache/lucene/util/TestPagedBytes.java +++ b/lucene/core/src/test/org/apache/lucene/util/TestPagedBytes.java @@ -22,6 +22,7 @@ import java.util.*; import org.apache.lucene.store.BaseDirectoryWrapper; import org.apache.lucene.store.DataInput; +import org.apache.lucene.store.DataOutput; import org.apache.lucene.store.IOContext; import org.apache.lucene.store.IndexInput; import org.apache.lucene.store.IndexOutput; @@ -30,6 +31,9 @@ import org.junit.Ignore; public class TestPagedBytes extends LuceneTestCase { + // Writes random byte/s to "normal" file in dir, then + // copies into PagedBytes and verifies with + // PagedBytes.Reader: public void testDataInputOutput() throws Exception { Random random = random(); for(int iter=0;iter<5*RANDOM_MULTIPLIER;iter++) { @@ -90,6 +94,60 @@ public class TestPagedBytes extends LuceneTestCase { } } + // Writes random byte/s into PagedBytes via + // .getDataOutput(), then verifies with + // PagedBytes.getDataInput(): + public void testDataInputOutput2() throws Exception { + Random random = random(); + for(int iter=0;iter<5*RANDOM_MULTIPLIER;iter++) { + final int blockBits = _TestUtil.nextInt(random, 1, 20); + final int 
blockSize = 1 << blockBits; + final PagedBytes p = new PagedBytes(blockBits); + final DataOutput out = p.getDataOutput(); + final int numBytes = random().nextInt(10000000); + + final byte[] answer = new byte[numBytes]; + random().nextBytes(answer); + int written = 0; + while(written < numBytes) { + if (random().nextInt(10) == 7) { + out.writeByte(answer[written++]); + } else { + int chunk = Math.min(random().nextInt(1000), numBytes - written); + out.writeBytes(answer, written, chunk); + written += chunk; + } + } + + final PagedBytes.Reader reader = p.freeze(random.nextBoolean()); + + final DataInput in = p.getDataInput(); + + final byte[] verify = new byte[numBytes]; + int read = 0; + while(read < numBytes) { + if (random().nextInt(10) == 7) { + verify[read++] = in.readByte(); + } else { + int chunk = Math.min(random().nextInt(1000), numBytes - read); + in.readBytes(verify, read, chunk); + read += chunk; + } + } + assertTrue(Arrays.equals(answer, verify)); + + final BytesRef slice = new BytesRef(); + for(int iter2=0;iter2<100;iter2++) { + final int pos = random.nextInt(numBytes-1); + final int len = random.nextInt(Math.min(blockSize+1, numBytes - pos)); + reader.fillSlice(slice, pos, len); + for(int byteUpto=0;byteUpto 32 KB for one + // document, we don't hit exc when using Facet42DocValuesFormat + public void testManyFacetsInOneDocument() throws Exception { + Directory dir = newDirectory(); + Directory taxoDir = newDirectory(); + IndexWriterConfig iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random())); + iwc.setCodec(new Facet42Codec()); + RandomIndexWriter writer = new RandomIndexWriter(random(), dir, iwc); + DirectoryTaxonomyWriter taxoWriter = new DirectoryTaxonomyWriter(taxoDir, IndexWriterConfig.OpenMode.CREATE); + + FacetFields facetFields = new FacetFields(taxoWriter); + + int numLabels = _TestUtil.nextInt(random(), 40000, 100000); + + Document doc = new Document(); + doc.add(newTextField("field", "text", Field.Store.NO)); + List paths = new ArrayList(); + for(int i=0;i results = c.getFacetResults(); + assertEquals(1, results.size()); + FacetResultNode root = results.get(0).getFacetResultNode(); + assertEquals(numLabels, root.subResults.size()); + Set allLabels = new HashSet(); + for(FacetResultNode childNode : root.subResults) { + assertEquals(2, childNode.label.length); + allLabels.add(childNode.label.components[1]); + assertEquals(1, (int) childNode.value); + } + assertEquals(numLabels, allLabels.size()); + + IOUtils.close(searcher.getIndexReader(), taxoReader, dir, taxoDir); + } } diff --git a/lucene/test-framework/src/java/org/apache/lucene/codecs/lucene40/Lucene40DocValuesWriter.java b/lucene/test-framework/src/java/org/apache/lucene/codecs/lucene40/Lucene40DocValuesWriter.java index 15b3081c848..90c161cf696 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/codecs/lucene40/Lucene40DocValuesWriter.java +++ b/lucene/test-framework/src/java/org/apache/lucene/codecs/lucene40/Lucene40DocValuesWriter.java @@ -162,6 +162,9 @@ class Lucene40DocValuesWriter extends DocValuesConsumer { if (b == null) { b = new BytesRef(); // 4.0 doesnt distinguish } + if (b.length > Lucene40DocValuesFormat.MAX_BINARY_FIELD_LENGTH) { + throw new IllegalArgumentException("DocValuesField \"" + field.name + "\" is too large, must be <= " + Lucene40DocValuesFormat.MAX_BINARY_FIELD_LENGTH); + } minLength = Math.min(minLength, b.length); maxLength = Math.max(maxLength, b.length); if (uniqueValues != null) { diff --git 
a/lucene/test-framework/src/java/org/apache/lucene/index/BaseDocValuesFormatTestCase.java b/lucene/test-framework/src/java/org/apache/lucene/index/BaseDocValuesFormatTestCase.java index 7dd38ae1c69..008eeea7d32 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/index/BaseDocValuesFormatTestCase.java +++ b/lucene/test-framework/src/java/org/apache/lucene/index/BaseDocValuesFormatTestCase.java @@ -17,21 +17,20 @@ package org.apache.lucene.index; * limitations under the License. */ -import static org.apache.lucene.index.SortedSetDocValues.NO_MORE_ORDS; - import java.io.IOException; import java.util.ArrayList; import java.util.Collections; import java.util.HashMap; import java.util.List; -import java.util.Map; import java.util.Map.Entry; +import java.util.Map; import java.util.Set; import java.util.TreeSet; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.MockAnalyzer; import org.apache.lucene.codecs.Codec; +import org.apache.lucene.codecs.lucene42.Lucene42DocValuesFormat; import org.apache.lucene.document.BinaryDocValuesField; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; @@ -58,6 +57,8 @@ import org.apache.lucene.util.BytesRefHash; import org.apache.lucene.util.LuceneTestCase; import org.apache.lucene.util._TestUtil; +import static org.apache.lucene.index.SortedSetDocValues.NO_MORE_ORDS; + /** * Abstract class to do basic tests for a docvalues format. * NOTE: This test focuses on the docvalues impl, nothing else. @@ -2401,4 +2402,172 @@ public abstract class BaseDocValuesFormatTestCase extends LuceneTestCase { directory.close(); } + // LUCENE-4853 + public void testHugeBinaryValues() throws Exception { + Analyzer analyzer = new MockAnalyzer(random()); + // FSDirectory because SimpleText will consume gobbs of + // space when storing big binary values: + Directory d = newFSDirectory(_TestUtil.getTempDir("hugeBinaryValues")); + boolean doFixed = random().nextBoolean(); + int numDocs; + int fixedLength = 0; + if (doFixed) { + // Sometimes make all values fixed length since some + // codecs have different code paths for this: + numDocs = _TestUtil.nextInt(random(), 10, 20); + fixedLength = _TestUtil.nextInt(random(), 65537, 256*1024); + } else { + numDocs = _TestUtil.nextInt(random(), 100, 200); + } + IndexWriter w = new IndexWriter(d, newIndexWriterConfig(TEST_VERSION_CURRENT, analyzer)); + List docBytes = new ArrayList(); + long totalBytes = 0; + for(int docID=0;docID 64KB in size to ensure more than 2 pages in + // PagedBytes would be needed: + int numBytes; + if (doFixed) { + numBytes = fixedLength; + } else if (docID == 0 || random().nextInt(5) == 3) { + numBytes = _TestUtil.nextInt(random(), 65537, 3*1024*1024); + } else { + numBytes = _TestUtil.nextInt(random(), 1, 1024*1024); + } + totalBytes += numBytes; + if (totalBytes > 5 * 1024*1024) { + break; + } + byte[] bytes = new byte[numBytes]; + random().nextBytes(bytes); + docBytes.add(bytes); + Document doc = new Document(); + BytesRef b = new BytesRef(bytes); + b.length = bytes.length; + doc.add(new BinaryDocValuesField("field", b)); + doc.add(new StringField("id", ""+docID, Field.Store.YES)); + try { + w.addDocument(doc); + } catch (IllegalArgumentException iae) { + if (iae.getMessage().indexOf("is too large") == -1) { + throw iae; + } else { + // OK: some codecs can't handle binary DV > 32K + assertFalse(codecAcceptsHugeBinaryValues("field")); + w.rollback(); + d.close(); + return; + } + } + } + + DirectoryReader r; + try { + r = w.getReader(); + } catch 
(IllegalArgumentException iae) { + if (iae.getMessage().indexOf("is too large") == -1) { + throw iae; + } else { + assertFalse(codecAcceptsHugeBinaryValues("field")); + + // OK: some codecs can't handle binary DV > 32K + w.rollback(); + d.close(); + return; + } + } + w.close(); + + AtomicReader ar = SlowCompositeReaderWrapper.wrap(r); + + BinaryDocValues s = FieldCache.DEFAULT.getTerms(ar, "field"); + for(int docID=0;docID docBytes = new ArrayList(); + long totalBytes = 0; + for(int docID=0;docID 64KB in size to ensure more than 2 pages in + // PagedBytes would be needed: + int numBytes; + if (doFixed) { + numBytes = fixedLength; + } else if (docID == 0 || random().nextInt(5) == 3) { + numBytes = Lucene42DocValuesFormat.MAX_BINARY_FIELD_LENGTH; + } else { + numBytes = _TestUtil.nextInt(random(), 1, Lucene42DocValuesFormat.MAX_BINARY_FIELD_LENGTH); + } + totalBytes += numBytes; + if (totalBytes > 5 * 1024*1024) { + break; + } + byte[] bytes = new byte[numBytes]; + random().nextBytes(bytes); + docBytes.add(bytes); + Document doc = new Document(); + BytesRef b = new BytesRef(bytes); + b.length = bytes.length; + doc.add(new BinaryDocValuesField("field", b)); + doc.add(new StringField("id", ""+docID, Field.Store.YES)); + w.addDocument(doc); + } + + DirectoryReader r = w.getReader(); + w.close(); + + AtomicReader ar = SlowCompositeReaderWrapper.wrap(r); + + BinaryDocValues s = FieldCache.DEFAULT.getTerms(ar, "field"); + for(int docID=0;docID 1 || (files.length == 1 && !files[0].equals("write.lock"))) { diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt index 1cd35a06254..31503e157c3 100644 --- a/solr/CHANGES.txt +++ b/solr/CHANGES.txt @@ -129,6 +129,8 @@ Bug Fixes * SOLR-5122: Fixed bug in spellcheck.collateMaxCollectDocs. Eliminates risk of divide by zero, and makes estimated hit counts meaningful in non-optimized indexes. 
(hossman) + +* SOLR-5164: Can not create a collection via collections API (cloud mode) (Erick Erickson) Optimizations ---------------------- diff --git a/solr/core/src/java/org/apache/solr/core/CorePropertiesLocator.java b/solr/core/src/java/org/apache/solr/core/CorePropertiesLocator.java index 5db0dfcb9a8..087a37b3f6f 100644 --- a/solr/core/src/java/org/apache/solr/core/CorePropertiesLocator.java +++ b/solr/core/src/java/org/apache/solr/core/CorePropertiesLocator.java @@ -31,7 +31,6 @@ import java.io.IOException; import java.io.InputStreamReader; import java.io.OutputStreamWriter; import java.io.Writer; -import java.util.Date; import java.util.List; import java.util.Properties; @@ -78,6 +77,7 @@ public class CorePropertiesLocator implements CoresLocator { Properties p = buildCoreProperties(cd); Writer os = null; try { + propfile.getParentFile().mkdirs(); os = new OutputStreamWriter(new FileOutputStream(propfile), Charsets.UTF_8); p.store(os, "Written by CorePropertiesLocator"); } diff --git a/solr/core/src/java/org/apache/solr/handler/admin/CoreAdminHandler.java b/solr/core/src/java/org/apache/solr/handler/admin/CoreAdminHandler.java index bd6d1511f27..8492996d1f3 100644 --- a/solr/core/src/java/org/apache/solr/handler/admin/CoreAdminHandler.java +++ b/solr/core/src/java/org/apache/solr/handler/admin/CoreAdminHandler.java @@ -409,8 +409,9 @@ public class CoreAdminHandler extends RequestHandlerBase { String name = checkNotEmpty(params.get(CoreAdminParams.NAME), "Missing parameter [" + CoreAdminParams.NAME + "]"); String instancedir = params.get(CoreAdminParams.INSTANCE_DIR); - if (StringUtils.isEmpty(instancedir)) - instancedir = container.getSolrHome() + File.separator + name; + if (StringUtils.isEmpty(instancedir)) { + instancedir = name; // Already relative to solrHome, we haven't been given an absolute path. + } Properties coreProps = new Properties(); for (String param : paramToProp.keySet()) {
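
Note for reviewers (not part of the patch): the write-then-read path that BinaryDocValuesWriter now takes through PagedBytes is easiest to see in isolation. The sketch below only mirrors what addValue(), BytesIterator, and testDataInputOutput2 above do with the getDataOutput()/freeze(false)/getDataInput() API added in this change; the class name, value sizes, and printed output are made up for illustration.

import org.apache.lucene.store.DataInput;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.PagedBytes;

// Illustrative only: round-trips two binary doc values through PagedBytes
// the same way BinaryDocValuesWriter now buffers them at index time.
public class PagedBytesRoundTrip {
  public static void main(String[] args) throws Exception {
    PagedBytes bytes = new PagedBytes(15);          // 32 KB blocks, like BLOCK_BITS above
    DataOutput out = bytes.getDataOutput();

    // Per-document values; lengths are tracked separately, just as the writer
    // keeps them in an AppendingDeltaPackedLongBuffer.
    byte[][] values = { new byte[123], new byte[70000] };  // 70000 > the old 32766-byte cap
    for (byte[] v : values) {
      out.writeBytes(v, 0, v.length);
    }

    bytes.freeze(false);                            // same call flush() makes above
    DataInput in = bytes.getDataInput();
    BytesRef scratch = new BytesRef();
    for (byte[] v : values) {
      scratch.grow(v.length);
      scratch.length = v.length;
      in.readBytes(scratch.bytes, scratch.offset, scratch.length);
      System.out.println("read back " + scratch.length + " bytes");
    }
  }
}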