From fe938c64edde0daa629ecbb25655e595a8edff43 Mon Sep 17 00:00:00 2001 From: Robert Muir Date: Wed, 14 Aug 2013 16:43:22 +0000 Subject: [PATCH] LUCENE-5173: add checkindex piece of LUCENE-5116, prevent 0 document segments completely git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1513955 13f79535-47bb-0310-9956-ffa450edef68 --- lucene/CHANGES.txt | 3 ++ .../org/apache/lucene/index/CheckIndex.java | 5 +++ .../org/apache/lucene/index/IndexWriter.java | 35 +++++++++++-------- .../apache/lucene/index/SegmentMerger.java | 13 +++++-- .../apache/lucene/index/TestIndexWriter.java | 30 ++++++++++++++++ .../lucene/index/RandomIndexWriter.java | 2 +- .../org/apache/lucene/util/_TestUtil.java | 4 +-- 7 files changed, 71 insertions(+), 21 deletions(-) diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 7f170788833..87a8d963634 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -172,6 +172,9 @@ API Changes Analyzer. Analyzer adds a getter to retrieve the strategy for this use-case. (Uwe Schindler, Robert Muir, Shay Banon) +* LUCENE-5173: Lucene never writes segments with 0 documents anymore. + (Shai Erera, Uwe Schindler, Robert Muir) + Optimizations * LUCENE-5088: Added TermFilter to filter docs by a specific term. diff --git a/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java b/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java index 612fbc5d093..27a1aface60 100644 --- a/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java +++ b/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java @@ -496,6 +496,11 @@ public class CheckIndex { msg(infoStream, " " + (1+i) + " of " + numSegments + ": name=" + info.info.name + " docCount=" + info.info.getDocCount()); segInfoStat.name = info.info.name; segInfoStat.docCount = info.info.getDocCount(); + + final String version = info.info.getVersion(); + if (info.info.getDocCount() <= 0 && version != null && versionComparator.compare(version, "4.5") >= 0) { + throw new RuntimeException("illegal number of documents: maxDoc=" + info.info.getDocCount()); + } int toLoseDocCount = info.info.getDocCount(); diff --git a/lucene/core/src/java/org/apache/lucene/index/IndexWriter.java b/lucene/core/src/java/org/apache/lucene/index/IndexWriter.java index 93fd9ed45b5..6450708b003 100644 --- a/lucene/core/src/java/org/apache/lucene/index/IndexWriter.java +++ b/lucene/core/src/java/org/apache/lucene/index/IndexWriter.java @@ -2462,20 +2462,12 @@ public class IndexWriter implements Closeable, TwoPhaseCommit { String mergedName = newSegmentName(); final List mergeReaders = new ArrayList(); for (IndexReader indexReader : readers) { - if (indexReader.numDocs() > 0) { - numDocs += indexReader.numDocs(); - for (AtomicReaderContext ctx : indexReader.leaves()) { - if (ctx.reader().numDocs() > 0) { // drop empty (or all deleted) segments - mergeReaders.add(ctx.reader()); - } - } + numDocs += indexReader.numDocs(); + for (AtomicReaderContext ctx : indexReader.leaves()) { + mergeReaders.add(ctx.reader()); } } - if (mergeReaders.isEmpty()) { // no segments with documents to add - return; - } - final IOContext context = new IOContext(new MergeInfo(numDocs, -1, true, -1)); // TODO: somehow we should fix this merge so it's @@ -2487,6 +2479,10 @@ public class IndexWriter implements Closeable, TwoPhaseCommit { SegmentMerger merger = new SegmentMerger(mergeReaders, info, infoStream, trackingDir, MergeState.CheckAbort.NONE, globalFieldNumberMap, context); + + if (!merger.shouldMerge()) { + return; + } MergeState mergeState; boolean success = false; @@ -3733,7 +3729,12 @@ public class IndexWriter implements Closeable, TwoPhaseCommit { MergeState mergeState; boolean success3 = false; try { - mergeState = merger.merge(); + if (!merger.shouldMerge()) { + // would result in a 0 document segment: nothing to merge! + mergeState = new MergeState(new ArrayList(), merge.info.info, infoStream, checkAbort); + } else { + mergeState = merger.merge(); + } success3 = true; } finally { if (!success3) { @@ -3748,12 +3749,16 @@ public class IndexWriter implements Closeable, TwoPhaseCommit { // Record which codec was used to write the segment if (infoStream.isEnabled("IW")) { - infoStream.message("IW", "merge codec=" + codec + " docCount=" + merge.info.info.getDocCount() + "; merged segment has " + + if (merge.info.info.getDocCount() == 0) { + infoStream.message("IW", "merge away fully deleted segments"); + } else { + infoStream.message("IW", "merge codec=" + codec + " docCount=" + merge.info.info.getDocCount() + "; merged segment has " + (mergeState.fieldInfos.hasVectors() ? "vectors" : "no vectors") + "; " + (mergeState.fieldInfos.hasNorms() ? "norms" : "no norms") + "; " + (mergeState.fieldInfos.hasDocValues() ? "docValues" : "no docValues") + "; " + (mergeState.fieldInfos.hasProx() ? "prox" : "no prox") + "; " + (mergeState.fieldInfos.hasProx() ? "freqs" : "no freqs")); + } } // Very important to do this before opening the reader @@ -3958,8 +3963,8 @@ public class IndexWriter implements Closeable, TwoPhaseCommit { /** Only for testing. * * @lucene.internal */ - void keepFullyDeletedSegments() { - keepFullyDeletedSegments = true; + void setKeepFullyDeletedSegments(boolean v) { + keepFullyDeletedSegments = v; } boolean getKeepFullyDeletedSegments() { diff --git a/lucene/core/src/java/org/apache/lucene/index/SegmentMerger.java b/lucene/core/src/java/org/apache/lucene/index/SegmentMerger.java index 4c1fdb9caff..f121e85b10f 100644 --- a/lucene/core/src/java/org/apache/lucene/index/SegmentMerger.java +++ b/lucene/core/src/java/org/apache/lucene/index/SegmentMerger.java @@ -52,12 +52,18 @@ final class SegmentMerger { // note, just like in codec apis Directory 'dir' is NOT the same as segmentInfo.dir!! SegmentMerger(List readers, SegmentInfo segmentInfo, InfoStream infoStream, Directory dir, - MergeState.CheckAbort checkAbort, FieldInfos.FieldNumbers fieldNumbers, IOContext context) { + MergeState.CheckAbort checkAbort, FieldInfos.FieldNumbers fieldNumbers, IOContext context) throws IOException { mergeState = new MergeState(readers, segmentInfo, infoStream, checkAbort); directory = dir; this.codec = segmentInfo.getCodec(); this.context = context; this.fieldInfosBuilder = new FieldInfos.Builder(fieldNumbers); + mergeState.segmentInfo.setDocCount(setDocMaps()); + } + + /** True if any merging should happen */ + boolean shouldMerge() { + return mergeState.segmentInfo.getDocCount() > 0; } /** @@ -67,14 +73,15 @@ final class SegmentMerger { * @throws IOException if there is a low-level IO error */ MergeState merge() throws IOException { + if (!shouldMerge()) { + throw new IllegalStateException("Merge would result in 0 document segment"); + } // NOTE: it's important to add calls to // checkAbort.work(...) if you make any changes to this // method that will spend alot of time. The frequency // of this check impacts how long // IndexWriter.close(false) takes to actually stop the // threads. - - mergeState.segmentInfo.setDocCount(setDocMaps()); mergeFieldInfos(); setMatchingSegmentReaders(); long t0 = 0; diff --git a/lucene/core/src/test/org/apache/lucene/index/TestIndexWriter.java b/lucene/core/src/test/org/apache/lucene/index/TestIndexWriter.java index 1db3d81b830..34e41dcf3cd 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestIndexWriter.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestIndexWriter.java @@ -49,6 +49,7 @@ import org.apache.lucene.index.IndexWriterConfig.OpenMode; import org.apache.lucene.search.DocIdSetIterator; import org.apache.lucene.search.FieldCache; import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.MatchAllDocsQuery; import org.apache.lucene.search.ScoreDoc; import org.apache.lucene.search.TermQuery; import org.apache.lucene.store.AlreadyClosedException; @@ -68,6 +69,7 @@ import org.apache.lucene.util.Bits; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.LuceneTestCase; +import org.apache.lucene.util.SetOnce; import org.apache.lucene.util.ThreadInterruptedException; import org.apache.lucene.util._TestUtil; import org.apache.lucene.util.packed.PackedInts; @@ -2188,4 +2190,32 @@ public class TestIndexWriter extends LuceneTestCase { writer.close(); dir.close(); } + + public void testMergeAllDeleted() throws IOException { + Directory dir = newDirectory(); + IndexWriterConfig iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random())); + final SetOnce iwRef = new SetOnce(); + iwc.setInfoStream(new RandomIndexWriter.TestPointInfoStream(iwc.getInfoStream(), new RandomIndexWriter.TestPoint() { + @Override + public void apply(String message) { + if ("startCommitMerge".equals(message)) { + iwRef.get().setKeepFullyDeletedSegments(false); + } else if ("startMergeInit".equals(message)) { + iwRef.get().setKeepFullyDeletedSegments(true); + } + } + })); + IndexWriter evilWriter = new IndexWriter(dir, iwc); + iwRef.set(evilWriter); + for (int i = 0; i < 1000; i++) { + addDoc(evilWriter); + if (random().nextInt(17) == 0) { + evilWriter.commit(); + } + } + evilWriter.deleteDocuments(new MatchAllDocsQuery()); + evilWriter.forceMerge(1); + evilWriter.close(); + dir.close(); + } } diff --git a/lucene/test-framework/src/java/org/apache/lucene/index/RandomIndexWriter.java b/lucene/test-framework/src/java/org/apache/lucene/index/RandomIndexWriter.java index fb67b4428c3..7bdbc65066a 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/index/RandomIndexWriter.java +++ b/lucene/test-framework/src/java/org/apache/lucene/index/RandomIndexWriter.java @@ -340,7 +340,7 @@ public class RandomIndexWriter implements Closeable { w.forceMerge(maxSegmentCount); } - private static final class TestPointInfoStream extends InfoStream { + static final class TestPointInfoStream extends InfoStream { private final InfoStream delegate; private final TestPoint testPoint; diff --git a/lucene/test-framework/src/java/org/apache/lucene/util/_TestUtil.java b/lucene/test-framework/src/java/org/apache/lucene/util/_TestUtil.java index e0ef230f91d..c5dea7fae9c 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/util/_TestUtil.java +++ b/lucene/test-framework/src/java/org/apache/lucene/util/_TestUtil.java @@ -793,9 +793,9 @@ public class _TestUtil { try { // Carefully invoke what is a package-private (test // only, internal) method on IndexWriter: - Method m = IndexWriter.class.getDeclaredMethod("keepFullyDeletedSegments"); + Method m = IndexWriter.class.getDeclaredMethod("setKeepFullyDeletedSegments", boolean.class); m.setAccessible(true); - m.invoke(w); + m.invoke(w, Boolean.TRUE); } catch (Exception e) { // Should not happen? throw new RuntimeException(e);