LUCENE-3441: fix bug in ParentArray.initFromReader

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1412235 13f79535-47bb-0310-9956-ffa450edef68
Shai Erera 2012-11-21 19:19:54 +00:00
parent cb1a232def
commit da8b5065cf
2 changed files with 79 additions and 42 deletions

org/apache/lucene/facet/taxonomy/directory/ParentArray.java

@@ -3,11 +3,10 @@ package org.apache.lucene.facet.taxonomy.directory;
import java.io.IOException;
import org.apache.lucene.facet.taxonomy.TaxonomyReader;
import org.apache.lucene.index.AtomicReaderContext;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.DocsAndPositionsEnum;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.index.MultiFields;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.util.ArrayUtil;
@@ -58,9 +57,12 @@ class ParentArray {
  public ParentArray(IndexReader reader, ParentArray copyFrom) throws IOException {
    assert copyFrom != null;
    // note that copyParents.length may be equal to reader.maxDoc(). this is not a bug
    // it may be caused if e.g. the taxonomy segments were merged, and so an updated
    // NRT reader was obtained, even though nothing was changed. this is not very likely
    // to happen.
    int[] copyParents = copyFrom.getArray();
    assert copyParents.length < reader.maxDoc() : "do not init a new ParentArray if the index hasn't changed";
    this.parentOrdinals = new int[reader.maxDoc()];
    System.arraycopy(copyParents, 0, parentOrdinals, 0, copyParents.length);
    initFromReader(reader, copyParents.length);
@@ -72,47 +74,36 @@ class ParentArray {
      return;
    }
    TermsEnum termsEnum = null;
    DocsAndPositionsEnum positions = null;
    int idx = 0;
    for (AtomicReaderContext context : reader.leaves()) {
      if (context.docBase < first) {
        continue;
      }
    // it's ok to use MultiFields because we only iterate on one posting list.
    // breaking it to loop over the leaves() only complicates code for no
    // apparent gain.
    DocsAndPositionsEnum positions = MultiFields.getTermPositionsEnum(reader, null,
        Consts.FIELD_PAYLOADS, Consts.PAYLOAD_PARENT_BYTES_REF,
        DocsAndPositionsEnum.FLAG_PAYLOADS);
      // in general we could call readerCtx.reader().termPositionsEnum(), but that
      // passes the liveDocs. Since we know there are no deletions, the code
      // below may save some CPU cycles.
      termsEnum = context.reader().fields().terms(Consts.FIELD_PAYLOADS).iterator(termsEnum);
      if (!termsEnum.seekExact(Consts.PAYLOAD_PARENT_BYTES_REF, true)) {
        throw new CorruptIndexException("Missing parent stream data for segment " + context.reader());
      }
      positions = termsEnum.docsAndPositions(null /* no deletes in taxonomy */, positions);
      if (positions == null) {
        throw new CorruptIndexException("Missing parent stream data for segment " + context.reader());
      }
      idx = context.docBase;
      int doc;
      while ((doc = positions.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
        doc += context.docBase;
        if (doc == idx) {
          if (positions.freq() == 0) { // shouldn't happen
            throw new CorruptIndexException("Missing parent data for category " + idx);
          }
          parentOrdinals[idx++] = positions.nextPosition();
        } else { // this shouldn't happen
          throw new CorruptIndexException("Missing parent data for category " + idx);
        }
      }
      if (idx + 1 < context.reader().maxDoc()) {
        throw new CorruptIndexException("Missing parent data for category " + (idx + 1));
      }
    // shouldn't really happen, if it does, something's wrong
    if (positions == null || positions.advance(first) == DocIdSetIterator.NO_MORE_DOCS) {
      throw new CorruptIndexException("Missing parent data for category " + first);
    }
    if (idx != reader.maxDoc()) {
      throw new CorruptIndexException("Missing parent data for category " + idx);
    int num = reader.maxDoc();
    for (int i = first; i < num; i++) {
      if (positions.docID() == i) {
        if (positions.freq() == 0) { // shouldn't happen
          throw new CorruptIndexException("Missing parent data for category " + i);
        }
        parentOrdinals[i] = positions.nextPosition();
        if (positions.nextDoc() == DocIdSetIterator.NO_MORE_DOCS) {
          if (i + 1 < num) {
            throw new CorruptIndexException("Missing parent data for category "+ (i + 1));
          }
          break;
        }
      } else { // this shouldn't happen
        throw new CorruptIndexException("Missing parent data for category " + i);
      }
    }
  }
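
The core of the fix is to stop looping over index leaves and instead walk a single posting list over the composite reader, so a merge that collapses all taxonomy segments into one leaf (with docBase 0) can no longer cause categories to be skipped. Below is a minimal, self-contained sketch of that traversal pattern against the Lucene 4.x APIs used above; the class and method names (ParentTraversalSketch, readParents) and the field/term parameters are illustrative placeholders, not part of this commit.

// Sketch only: mirrors the single-enum traversal of the patched initFromReader.
// 'field' and 'term' stand in for Consts.FIELD_PAYLOADS / Consts.PAYLOAD_PARENT_BYTES_REF.
import java.io.IOException;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.DocsAndPositionsEnum;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.MultiFields;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.util.BytesRef;

class ParentTraversalSketch {
  static int[] readParents(IndexReader reader, String field, BytesRef term, int first)
      throws IOException {
    int[] parents = new int[reader.maxDoc()];
    // One enum over the whole (composite) reader; null liveDocs is fine because
    // the taxonomy index never has deletions.
    DocsAndPositionsEnum positions = MultiFields.getTermPositionsEnum(
        reader, null, field, term, DocsAndPositionsEnum.FLAG_PAYLOADS);
    if (positions == null || positions.advance(first) == DocIdSetIterator.NO_MORE_DOCS) {
      throw new CorruptIndexException("Missing parent data for category " + first);
    }
    for (int i = first; i < parents.length; i++) {
      if (positions.docID() != i) { // every category document must appear in the posting list
        throw new CorruptIndexException("Missing parent data for category " + i);
      }
      // each category document carries one position: its parent ordinal
      parents[i] = positions.nextPosition();
      if (positions.nextDoc() == DocIdSetIterator.NO_MORE_DOCS && i + 1 < parents.length) {
        throw new CorruptIndexException("Missing parent data for category " + (i + 1));
      }
    }
    return parents;
  }
}

In the patched constructor above, this traversal starts at copyParents.length, so only the ordinals added since the previous reader need to be read.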

org/apache/lucene/facet/taxonomy/directory/TestDirectoryTaxonomyReader.java

@@ -1,11 +1,15 @@
package org.apache.lucene.facet.taxonomy.directory;
import java.io.IOException;
import java.util.Random;
import org.apache.lucene.analysis.core.KeywordAnalyzer;
import org.apache.lucene.facet.taxonomy.CategoryPath;
import org.apache.lucene.facet.taxonomy.TaxonomyReader;
import org.apache.lucene.facet.taxonomy.TaxonomyWriter;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.LogByteSizeMergePolicy;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.index.LogMergePolicy;
import org.apache.lucene.store.AlreadyClosedException;
@@ -242,6 +246,48 @@ public class TestDirectoryTaxonomyReader extends LuceneTestCase {
    dir.close();
  }
  @Test
  public void testOpenIfChangedMergedSegment() throws Exception {
    // test openIfChanged() when all index segments were merged - used to be
    // a bug in ParentArray, caught by testOpenIfChangedManySegments - only
    // this test is not random
    Directory dir = newDirectory();
    // hold onto IW to forceMerge
    // note how we don't close it, since DTW will close it.
    final IndexWriter iw = new IndexWriter(dir,
        new IndexWriterConfig(TEST_VERSION_CURRENT, new KeywordAnalyzer())
            .setMergePolicy(new LogByteSizeMergePolicy()));
    DirectoryTaxonomyWriter writer = new DirectoryTaxonomyWriter(dir) {
      @Override
      protected IndexWriter openIndexWriter(Directory directory,
          IndexWriterConfig config) throws IOException {
        return iw;
      }
    };
    TaxonomyReader reader = new DirectoryTaxonomyReader(writer);
    assertEquals(1, reader.getSize());
    assertEquals(1, reader.getParentArray().length);
    // add category and call forceMerge -- this should flush IW and merge segments down to 1
    // in ParentArray.initFromReader, this used to fail assuming there are no parents.
    writer.addCategory(new CategoryPath("1"));
    iw.forceMerge(1);
    // now calling openIfChanged should trip on the bug
    TaxonomyReader newtr = TaxonomyReader.openIfChanged(reader);
    assertNotNull(newtr);
    reader.close();
    reader = newtr;
    assertEquals(2, reader.getSize());
    assertEquals(2, reader.getParentArray().length);
    reader.close();
    writer.close();
    dir.close();
  }
  @Test
  public void testOpenIfChangedReuseAfterRecreate() throws Exception {
    // tests that if the taxonomy is recreated, no data is reused from the previous taxonomy