From 8eea72d476d96cbc8eeb4e47cfd1c674b9f49535 Mon Sep 17 00:00:00 2001 From: Adrien Grand Date: Mon, 23 Nov 2015 19:05:31 +0000 Subject: [PATCH] LUCENE-6906: Fix Lucene54DocValuesFormat on large empty segments. git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1715918 13f79535-47bb-0310-9956-ffa450edef68 --- .../lucene54/Lucene54DocValuesProducer.java | 2 +- .../util/packed/DirectMonotonicWriter.java | 4 +- .../lucene54/TestLucene54DocValuesFormat.java | 9 +- .../util/packed/TestDirectMonotonic.java | 94 +++++++--- .../index/BaseDocValuesFormatTestCase.java | 166 ++++++++++++++++++ 5 files changed, 241 insertions(+), 34 deletions(-) diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene54/Lucene54DocValuesProducer.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene54/Lucene54DocValuesProducer.java index 5fac6ed3d24..62164cd37fb 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene54/Lucene54DocValuesProducer.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene54/Lucene54DocValuesProducer.java @@ -319,7 +319,7 @@ final class Lucene54DocValuesProducer extends DocValuesProducer implements Close // sparse bits need a bit more metadata entry.numDocsWithValue = meta.readVLong(); final int blockShift = meta.readVInt(); - entry.monotonicMeta = DirectMonotonicReader.loadMeta(meta, entry.numDocsWithValue + 1, blockShift); + entry.monotonicMeta = DirectMonotonicReader.loadMeta(meta, entry.numDocsWithValue, blockShift); ramBytesUsed.addAndGet(entry.monotonicMeta.ramBytesUsed()); directAddressesMeta.put(info.name, entry.monotonicMeta); } diff --git a/lucene/core/src/java/org/apache/lucene/util/packed/DirectMonotonicWriter.java b/lucene/core/src/java/org/apache/lucene/util/packed/DirectMonotonicWriter.java index 820856e6058..1ff5dedfb61 100644 --- a/lucene/core/src/java/org/apache/lucene/util/packed/DirectMonotonicWriter.java +++ b/lucene/core/src/java/org/apache/lucene/util/packed/DirectMonotonicWriter.java @@ -123,7 +123,9 @@ public final class DirectMonotonicWriter { if (finished) { throw new IllegalStateException("#finish has been called already"); } - flush(); + if (bufferSize > 0) { + flush(); + } finished = true; } diff --git a/lucene/core/src/test/org/apache/lucene/codecs/lucene54/TestLucene54DocValuesFormat.java b/lucene/core/src/test/org/apache/lucene/codecs/lucene54/TestLucene54DocValuesFormat.java index 2027ee8eb20..9a010b6649c 100644 --- a/lucene/core/src/test/org/apache/lucene/codecs/lucene54/TestLucene54DocValuesFormat.java +++ b/lucene/core/src/test/org/apache/lucene/codecs/lucene54/TestLucene54DocValuesFormat.java @@ -17,6 +17,7 @@ package org.apache.lucene.codecs.lucene54; * limitations under the License. */ +import java.io.IOException; import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; @@ -24,6 +25,7 @@ import java.util.HashSet; import java.util.List; import java.util.Set; +import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.MockAnalyzer; import org.apache.lucene.codecs.Codec; import org.apache.lucene.codecs.DocValuesFormat; @@ -138,7 +140,7 @@ public class TestLucene54DocValuesFormat extends BaseCompressingDocValuesFormatT @Slow public void testSparseDocValuesVsStoredFields() throws Exception { - int numIterations = atLeast(2); + int numIterations = atLeast(1); for (int i = 0; i < numIterations; i++) { doTestSparseDocValuesVsStoredFields(); } @@ -158,7 +160,7 @@ public class TestLucene54DocValuesFormat extends BaseCompressingDocValuesFormatT // sparse compression is only enabled if less than 1% of docs have a value final int avgGap = 100; - final int numDocs = atLeast(100); + final int numDocs = atLeast(200); for (int i = random().nextInt(avgGap * 2); i >= 0; --i) { writer.addDocument(new Document()); } @@ -185,7 +187,7 @@ public class TestLucene54DocValuesFormat extends BaseCompressingDocValuesFormatT writer.addDocument(doc); // add a gap - for (int j = random().nextInt(avgGap * 2); j >= 0; --j) { + for (int j = TestUtil.nextInt(random(), 0, avgGap * 2); j >= 0; --j) { writer.addDocument(new Document()); } } @@ -502,4 +504,5 @@ public class TestLucene54DocValuesFormat extends BaseCompressingDocValuesFormatT } } } + } diff --git a/lucene/core/src/test/org/apache/lucene/util/packed/TestDirectMonotonic.java b/lucene/core/src/test/org/apache/lucene/util/packed/TestDirectMonotonic.java index 8bcd1e4d2a9..159ceb99b23 100644 --- a/lucene/core/src/test/org/apache/lucene/util/packed/TestDirectMonotonic.java +++ b/lucene/core/src/test/org/apache/lucene/util/packed/TestDirectMonotonic.java @@ -32,6 +32,28 @@ import org.apache.lucene.util.TestUtil; public class TestDirectMonotonic extends LuceneTestCase { + public void testEmpty() throws IOException { + Directory dir = newDirectory(); + final int blockShift = TestUtil.nextInt(random(), DirectMonotonicWriter.MIN_BLOCK_SHIFT, DirectMonotonicWriter.MAX_BLOCK_SHIFT); + + final long dataLength; + try (IndexOutput metaOut = dir.createOutput("meta", IOContext.DEFAULT); + IndexOutput dataOut = dir.createOutput("data", IOContext.DEFAULT)) { + DirectMonotonicWriter w = DirectMonotonicWriter.getInstance(metaOut, dataOut, 0, blockShift); + w.finish(); + dataLength = dataOut.getFilePointer(); + } + + try (IndexInput metaIn = dir.openInput("meta", IOContext.READONCE); + IndexInput dataIn = dir.openInput("data", IOContext.DEFAULT)) { + DirectMonotonicReader.Meta meta = DirectMonotonicReader.loadMeta(metaIn, 0, blockShift); + DirectMonotonicReader.getInstance(meta, dataIn.randomAccessSlice(0, dataLength)); + // no exception + } + + dir.close(); + } + public void testSimple() throws IOException { Directory dir = newDirectory(); final int blockShift = 2; @@ -100,38 +122,52 @@ public class TestDirectMonotonic extends LuceneTestCase { } public void testRandom() throws IOException { - Directory dir = newDirectory(); - final int blockShift = TestUtil.nextInt(random(), DirectMonotonicWriter.MIN_BLOCK_SHIFT, DirectMonotonicWriter.MAX_BLOCK_SHIFT); - final int numValues = TestUtil.nextInt(random(), 1, 1 << 20); - List actualValues = new ArrayList<>(); - long previous = random().nextLong(); - actualValues.add(previous); - for (int i = 1; i < numValues; ++i) { - previous += random().nextInt(1 << random().nextInt(20)); - actualValues.add(previous); - } - - final long dataLength; - try (IndexOutput metaOut = dir.createOutput("meta", IOContext.DEFAULT); - IndexOutput dataOut = dir.createOutput("data", IOContext.DEFAULT)) { - DirectMonotonicWriter w = DirectMonotonicWriter.getInstance(metaOut, dataOut, numValues, blockShift); - for (long v : actualValues) { - w.add(v); + final int iters = atLeast(3); + for (int iter = 0; iter < iters; ++iter) { + Directory dir = newDirectory(); + final int blockShift = TestUtil.nextInt(random(), DirectMonotonicWriter.MIN_BLOCK_SHIFT, DirectMonotonicWriter.MAX_BLOCK_SHIFT); + final int maxNumValues = 1 << 20; + final int numValues; + if (random().nextBoolean()) { + // random number + numValues = TestUtil.nextInt(random(), 1, maxNumValues); + } else { + // multiple of the block size + final int numBlocks = TestUtil.nextInt(random(), 0, maxNumValues >>> blockShift); + numValues = TestUtil.nextInt(random(), 0, numBlocks) << blockShift; } - w.finish(); - dataLength = dataOut.getFilePointer(); - } - - try (IndexInput metaIn = dir.openInput("meta", IOContext.READONCE); - IndexInput dataIn = dir.openInput("data", IOContext.DEFAULT)) { - DirectMonotonicReader.Meta meta = DirectMonotonicReader.loadMeta(metaIn, numValues, blockShift); - LongValues values = DirectMonotonicReader.getInstance(meta, dataIn.randomAccessSlice(0, dataLength)); - for (int i = 0; i < numValues; ++i) { - assertEquals(actualValues.get(i).longValue(), values.get(i)); + List actualValues = new ArrayList<>(); + long previous = random().nextLong(); + if (numValues > 0) { + actualValues.add(previous); } + for (int i = 1; i < numValues; ++i) { + previous += random().nextInt(1 << random().nextInt(20)); + actualValues.add(previous); + } + + final long dataLength; + try (IndexOutput metaOut = dir.createOutput("meta", IOContext.DEFAULT); + IndexOutput dataOut = dir.createOutput("data", IOContext.DEFAULT)) { + DirectMonotonicWriter w = DirectMonotonicWriter.getInstance(metaOut, dataOut, numValues, blockShift); + for (long v : actualValues) { + w.add(v); + } + w.finish(); + dataLength = dataOut.getFilePointer(); + } + + try (IndexInput metaIn = dir.openInput("meta", IOContext.READONCE); + IndexInput dataIn = dir.openInput("data", IOContext.DEFAULT)) { + DirectMonotonicReader.Meta meta = DirectMonotonicReader.loadMeta(metaIn, numValues, blockShift); + LongValues values = DirectMonotonicReader.getInstance(meta, dataIn.randomAccessSlice(0, dataLength)); + for (int i = 0; i < numValues; ++i) { + assertEquals(actualValues.get(i).longValue(), values.get(i)); + } + } + + dir.close(); } - - dir.close(); } } diff --git a/lucene/test-framework/src/java/org/apache/lucene/index/BaseDocValuesFormatTestCase.java b/lucene/test-framework/src/java/org/apache/lucene/index/BaseDocValuesFormatTestCase.java index 4abe979e29f..3aa0f3b196f 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/index/BaseDocValuesFormatTestCase.java +++ b/lucene/test-framework/src/java/org/apache/lucene/index/BaseDocValuesFormatTestCase.java @@ -3148,6 +3148,172 @@ public abstract class BaseDocValuesFormatTestCase extends BaseIndexFileFormatTes assertEquals(term2.get(), enum2.term()); } + // same as testSortedMergeAwayAllValues but on more than 1024 docs to have sparse encoding on + public void testSortedMergeAwayAllValuesLargeSegment() throws IOException { + Directory directory = newDirectory(); + Analyzer analyzer = new MockAnalyzer(random()); + IndexWriterConfig iwconfig = newIndexWriterConfig(analyzer); + iwconfig.setMergePolicy(newLogMergePolicy()); + RandomIndexWriter iwriter = new RandomIndexWriter(random(), directory, iwconfig); + + Document doc = new Document(); + doc.add(new StringField("id", "1", Field.Store.NO)); + doc.add(new SortedDocValuesField("field", new BytesRef("hello"))); + iwriter.addDocument(doc); + final int numEmptyDocs = atLeast(1024); + for (int i = 0; i < numEmptyDocs; ++i) { + iwriter.addDocument(new Document()); + } + iwriter.commit(); + iwriter.deleteDocuments(new Term("id", "1")); + iwriter.forceMerge(1); + + DirectoryReader ireader = iwriter.getReader(); + iwriter.close(); + + SortedDocValues dv = getOnlySegmentReader(ireader).getSortedDocValues("field"); + for (int i = 0; i < numEmptyDocs; ++i) { + assertEquals(-1, dv.getOrd(i)); + } + + ireader.close(); + directory.close(); + } + + // same as testSortedSetMergeAwayAllValues but on more than 1024 docs to have sparse encoding on + public void testSortedSetMergeAwayAllValuesLargeSegment() throws IOException { + Directory directory = newDirectory(); + Analyzer analyzer = new MockAnalyzer(random()); + IndexWriterConfig iwconfig = newIndexWriterConfig(analyzer); + iwconfig.setMergePolicy(newLogMergePolicy()); + RandomIndexWriter iwriter = new RandomIndexWriter(random(), directory, iwconfig); + + Document doc = new Document(); + doc.add(new StringField("id", "1", Field.Store.NO)); + doc.add(new SortedSetDocValuesField("field", new BytesRef("hello"))); + iwriter.addDocument(doc); + final int numEmptyDocs = atLeast(1024); + for (int i = 0; i < numEmptyDocs; ++i) { + iwriter.addDocument(new Document()); + } + iwriter.commit(); + iwriter.deleteDocuments(new Term("id", "1")); + iwriter.forceMerge(1); + + DirectoryReader ireader = iwriter.getReader(); + iwriter.close(); + + SortedSetDocValues dv = getOnlySegmentReader(ireader).getSortedSetDocValues("field"); + for (int i = 0; i < numEmptyDocs; ++i) { + dv.setDocument(i); + assertEquals(-1L, dv.nextOrd()); + } + + ireader.close(); + directory.close(); + } + + // same as testNumericMergeAwayAllValues but on more than 1024 docs to have sparse encoding on + public void testNumericMergeAwayAllValuesLargeSegment() throws IOException { + Directory directory = newDirectory(); + Analyzer analyzer = new MockAnalyzer(random()); + IndexWriterConfig iwconfig = newIndexWriterConfig(analyzer); + iwconfig.setMergePolicy(newLogMergePolicy()); + RandomIndexWriter iwriter = new RandomIndexWriter(random(), directory, iwconfig); + + Document doc = new Document(); + doc.add(new StringField("id", "1", Field.Store.NO)); + doc.add(new NumericDocValuesField("field", 42L)); + iwriter.addDocument(doc); + final int numEmptyDocs = atLeast(1024); + for (int i = 0; i < numEmptyDocs; ++i) { + iwriter.addDocument(new Document()); + } + iwriter.commit(); + iwriter.deleteDocuments(new Term("id", "1")); + iwriter.forceMerge(1); + + DirectoryReader ireader = iwriter.getReader(); + iwriter.close(); + + NumericDocValues dv = getOnlySegmentReader(ireader).getNumericDocValues("field"); + Bits docsWithField = getOnlySegmentReader(ireader).getDocsWithField("field"); + for (int i = 0; i < numEmptyDocs; ++i) { + assertEquals(0, dv.get(i)); + assertFalse(docsWithField.get(i)); + } + + ireader.close(); + directory.close(); + } + + // same as testSortedNumericMergeAwayAllValues but on more than 1024 docs to have sparse encoding on + public void testSortedNumericMergeAwayAllValuesLargeSegment() throws IOException { + Directory directory = newDirectory(); + Analyzer analyzer = new MockAnalyzer(random()); + IndexWriterConfig iwconfig = newIndexWriterConfig(analyzer); + iwconfig.setMergePolicy(newLogMergePolicy()); + RandomIndexWriter iwriter = new RandomIndexWriter(random(), directory, iwconfig); + + Document doc = new Document(); + doc.add(new StringField("id", "1", Field.Store.NO)); + doc.add(new SortedNumericDocValuesField("field", 42L)); + iwriter.addDocument(doc); + final int numEmptyDocs = atLeast(1024); + for (int i = 0; i < numEmptyDocs; ++i) { + iwriter.addDocument(new Document()); + } + iwriter.commit(); + iwriter.deleteDocuments(new Term("id", "1")); + iwriter.forceMerge(1); + + DirectoryReader ireader = iwriter.getReader(); + iwriter.close(); + + SortedNumericDocValues dv = getOnlySegmentReader(ireader).getSortedNumericDocValues("field"); + for (int i = 0; i < numEmptyDocs; ++i) { + dv.setDocument(i); + assertEquals(0, dv.count()); + } + + ireader.close(); + directory.close(); + } + + // same as testBinaryMergeAwayAllValues but on more than 1024 docs to have sparse encoding on + public void testBinaryMergeAwayAllValuesLargeSegment() throws IOException { + Directory directory = newDirectory(); + Analyzer analyzer = new MockAnalyzer(random()); + IndexWriterConfig iwconfig = newIndexWriterConfig(analyzer); + iwconfig.setMergePolicy(newLogMergePolicy()); + RandomIndexWriter iwriter = new RandomIndexWriter(random(), directory, iwconfig); + + Document doc = new Document(); + doc.add(new StringField("id", "1", Field.Store.NO)); + doc.add(new BinaryDocValuesField("field", new BytesRef("hello"))); + iwriter.addDocument(doc); + final int numEmptyDocs = atLeast(1024); + for (int i = 0; i < numEmptyDocs; ++i) { + iwriter.addDocument(new Document()); + } + iwriter.commit(); + iwriter.deleteDocuments(new Term("id", "1")); + iwriter.forceMerge(1); + + DirectoryReader ireader = iwriter.getReader(); + iwriter.close(); + + BinaryDocValues dv = getOnlySegmentReader(ireader).getBinaryDocValues("field"); + Bits docsWithField = getOnlySegmentReader(ireader).getDocsWithField("field"); + for (int i = 0; i < numEmptyDocs; ++i) { + assertEquals(new BytesRef(), dv.get(i)); + assertFalse(docsWithField.get(i)); + } + + ireader.close(); + directory.close(); + } + protected boolean codecAcceptsHugeBinaryValues(String field) { return true; }