LUCENE-2144: fix InstantiatedIndex to handle termDocs(null)

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@889431 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Michael McCandless 2009-12-10 21:38:07 +00:00
parent 74bb5791f9
commit 121dbb58ba
8 changed files with 267 additions and 131 deletions

View File

@ -23,6 +23,10 @@ Bug fixes
will be replaced by U+FFFD by the indexer, but not at query time. The filter
now reverses supplementary characters correctly if used with Version > 3.0.
(Simon Willnauer, Robert Muir)
* LUCENE-2144: Fix InstantiatedIndex to handle termDocs(null)
correctly (enumerate all non-deleted docs). (Karl Wettin via Mike
McCandless)
API Changes

View File

@ -35,6 +35,7 @@ import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermEnum;
import org.apache.lucene.index.TermPositionVector;
import org.apache.lucene.index.TermPositions;
import org.apache.lucene.util.BitVector;
/**
* Represented as a coupled graph of class instances, this
@ -61,8 +62,7 @@ public class InstantiatedIndex
private InstantiatedDocument[] documentsByNumber;
/** todo: should this be a BitSet? */
private Set<Integer> deletedDocuments;
private BitVector deletedDocuments;
private Map<String, Map<String, InstantiatedTerm>> termsByFieldAndText;
private InstantiatedTerm[] orderedTerms;
@ -85,7 +85,6 @@ public class InstantiatedIndex
orderedTerms = new InstantiatedTerm[0];
documentsByNumber = new InstantiatedDocument[0];
normsByFieldNameAndDocumentNumber = new HashMap<String, byte[]>();
deletedDocuments = new HashSet<Integer>();
}
@ -174,11 +173,14 @@ public class InstantiatedIndex
documentsByNumber = new InstantiatedDocument[sourceIndexReader.maxDoc()];
if (sourceIndexReader.hasDeletions()) {
deletedDocuments = new BitVector(sourceIndexReader.maxDoc());
}
// create documents
for (int i = 0; i < sourceIndexReader.maxDoc(); i++) {
if (sourceIndexReader.isDeleted(i)) {
deletedDocuments.add(i);
if (sourceIndexReader.hasDeletions() && sourceIndexReader.isDeleted(i)) {
deletedDocuments.set(i);
} else {
InstantiatedDocument document = new InstantiatedDocument();
// copy stored fields from source reader
@ -329,10 +331,13 @@ public class InstantiatedIndex
this.normsByFieldNameAndDocumentNumber = normsByFieldNameAndDocumentNumber;
}
public Set<Integer> getDeletedDocuments() {
public BitVector getDeletedDocuments() {
return deletedDocuments;
}
void setDeletedDocuments(BitVector deletedDocuments) {
this.deletedDocuments = deletedDocuments;
}
void setOrderedTerms(InstantiatedTerm[] orderedTerms) {
this.orderedTerms = orderedTerms;

View File

@ -29,16 +29,9 @@ import java.util.Set;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.FieldSelector;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermDocs;
import org.apache.lucene.index.TermEnum;
import org.apache.lucene.index.TermFreqVector;
import org.apache.lucene.index.TermPositions;
import org.apache.lucene.index.TermVectorMapper;
import org.apache.lucene.index.*;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.BitVector;
/**
* An InstantiatedIndexReader is not a snapshot in time, it is completely in
@ -105,9 +98,9 @@ public class InstantiatedIndexReader extends IndexReader {
return index;
}
private Set<InstantiatedDocument> deletedDocuments = new HashSet<InstantiatedDocument>();
private Set<Integer> deletedDocumentNumbers = new HashSet<Integer>();
private Map<String,List<NormUpdate>> updatedNormsByFieldNameAndDocumentNumber = null;
private BitVector uncommittedDeletedDocuments;
private Map<String,List<NormUpdate>> uncommittedNormsByFieldNameAndDocumentNumber = null;
private class NormUpdate {
private int doc;
@ -121,7 +114,15 @@ public class InstantiatedIndexReader extends IndexReader {
@Override
public int numDocs() {
return getIndex().getDocumentsByNumber().length - index.getDeletedDocuments().size() - deletedDocuments.size();
// TODO: this value could be cached, but array#length and BitVector#count are fast.
int numDocs = getIndex().getDocumentsByNumber().length;
if (uncommittedDeletedDocuments != null) {
numDocs -= uncommittedDeletedDocuments.count();
}
if (index.getDeletedDocuments() != null) {
numDocs -= index.getDeletedDocuments().count();
}
return numDocs;
}
@Override
@ -130,28 +131,39 @@ public class InstantiatedIndexReader extends IndexReader {
}
@Override
public boolean isDeleted(int n) {
return getIndex().getDeletedDocuments().contains(n) || deletedDocumentNumbers.contains(n);
public boolean hasDeletions() {
return index.getDeletedDocuments() != null || uncommittedDeletedDocuments != null;
}
@Override
public boolean hasDeletions() {
return getIndex().getDeletedDocuments().size() > 0 || deletedDocumentNumbers.size() > 0;
public boolean isDeleted(int n) {
return (index.getDeletedDocuments() != null && index.getDeletedDocuments().get(n))
|| (uncommittedDeletedDocuments != null && uncommittedDeletedDocuments.get(n));
}
@Override
protected void doDelete(int docNum) throws IOException {
if (!getIndex().getDeletedDocuments().contains(docNum)) {
if (deletedDocumentNumbers.add(docNum)) {
deletedDocuments.add(getIndex().getDocumentsByNumber()[docNum]);
}
// don't delete if already deleted
if ((index.getDeletedDocuments() != null && index.getDeletedDocuments().get(docNum))
|| (uncommittedDeletedDocuments != null && uncommittedDeletedDocuments.get(docNum))) {
return;
}
if (uncommittedDeletedDocuments == null) {
uncommittedDeletedDocuments = new BitVector(maxDoc());
}
uncommittedDeletedDocuments.set(docNum);
}
@Override
protected void doUndeleteAll() throws IOException {
deletedDocumentNumbers.clear();
deletedDocuments.clear();
// todo: read/write lock
uncommittedDeletedDocuments = null;
// todo: read/write unlock
}
@Override
@ -161,25 +173,30 @@ public class InstantiatedIndexReader extends IndexReader {
boolean updated = false;
// 1. update norms
if (updatedNormsByFieldNameAndDocumentNumber != null) {
for (Map.Entry<String,List<NormUpdate>> e : updatedNormsByFieldNameAndDocumentNumber.entrySet()) {
if (uncommittedNormsByFieldNameAndDocumentNumber != null) {
for (Map.Entry<String,List<NormUpdate>> e : uncommittedNormsByFieldNameAndDocumentNumber.entrySet()) {
byte[] norms = getIndex().getNormsByFieldNameAndDocumentNumber().get(e.getKey());
for (NormUpdate normUpdate : e.getValue()) {
norms[normUpdate.doc] = normUpdate.value;
}
}
updatedNormsByFieldNameAndDocumentNumber = null;
uncommittedNormsByFieldNameAndDocumentNumber = null;
updated = true;
}
// 2. remove deleted documents
if (deletedDocumentNumbers.size() > 0) {
for (Integer doc : deletedDocumentNumbers) {
getIndex().getDeletedDocuments().add(doc);
if (uncommittedDeletedDocuments != null) {
if (index.getDeletedDocuments() == null) {
index.setDeletedDocuments(uncommittedDeletedDocuments);
} else {
for (int d = 0; d< uncommittedDeletedDocuments.size(); d++) {
if (uncommittedDeletedDocuments.get(d)) {
index.getDeletedDocuments().set(d);
}
}
}
deletedDocumentNumbers.clear();
deletedDocuments.clear();
uncommittedDeletedDocuments = null;
updated = true;
@ -299,9 +316,9 @@ public class InstantiatedIndexReader extends IndexReader {
if (norms == null) {
return new byte[0]; // todo a static final zero length attribute?
}
if (updatedNormsByFieldNameAndDocumentNumber != null) {
if (uncommittedNormsByFieldNameAndDocumentNumber != null) {
norms = norms.clone();
List<NormUpdate> updated = updatedNormsByFieldNameAndDocumentNumber.get(field);
List<NormUpdate> updated = uncommittedNormsByFieldNameAndDocumentNumber.get(field);
if (updated != null) {
for (NormUpdate normUpdate : updated) {
norms[normUpdate.doc] = normUpdate.value;
@ -322,13 +339,13 @@ public class InstantiatedIndexReader extends IndexReader {
@Override
protected void doSetNorm(int doc, String field, byte value) throws IOException {
if (updatedNormsByFieldNameAndDocumentNumber == null) {
updatedNormsByFieldNameAndDocumentNumber = new HashMap<String,List<NormUpdate>>(getIndex().getNormsByFieldNameAndDocumentNumber().size());
if (uncommittedNormsByFieldNameAndDocumentNumber == null) {
uncommittedNormsByFieldNameAndDocumentNumber = new HashMap<String,List<NormUpdate>>(getIndex().getNormsByFieldNameAndDocumentNumber().size());
}
List<NormUpdate> list = updatedNormsByFieldNameAndDocumentNumber.get(field);
List<NormUpdate> list = uncommittedNormsByFieldNameAndDocumentNumber.get(field);
if (list == null) {
list = new LinkedList<NormUpdate>();
updatedNormsByFieldNameAndDocumentNumber.put(field, list);
uncommittedNormsByFieldNameAndDocumentNumber.put(field, list);
}
list.add(new NormUpdate(doc, value));
}
@ -367,6 +384,18 @@ public class InstantiatedIndexReader extends IndexReader {
return new InstantiatedTermDocs(this);
}
@Override
public TermDocs termDocs(Term term) throws IOException {
  // A null term is the documented request to enumerate every
  // non-deleted document (see IndexReader#termDocs(Term)).
  if (term == null) {
    return new InstantiatedAllTermDocs(this);
  }
  // Otherwise hand back a regular enumerator positioned on the term.
  final InstantiatedTermDocs docs = new InstantiatedTermDocs(this);
  docs.seek(term);
  return docs;
}
@Override
public TermPositions termPositions() throws IOException {
return new InstantiatedTermPositions(this);
@ -411,7 +440,7 @@ public class InstantiatedIndexReader extends IndexReader {
@Override
public void getTermFreqVector(int docNumber, TermVectorMapper mapper) throws IOException {
InstantiatedDocument doc = getIndex().getDocumentsByNumber()[docNumber];
for (Map.Entry<String,List<InstantiatedTermDocumentInformation>> e : doc.getVectorSpace().entrySet()) {
for (Map.Entry<String, List<InstantiatedTermDocumentInformation>> e : doc.getVectorSpace().entrySet()) {
mapper.setExpectations(e.getKey(), e.getValue().size(), true, true);
for (InstantiatedTermDocumentInformation tdi : e.getValue()) {
mapper.map(tdi.getTerm().text(), tdi.getTermPositions().length, tdi.getTermOffsets(), tdi.getTermPositions());

View File

@ -46,6 +46,7 @@ import org.apache.lucene.index.TermVectorOffsetInfo;
import org.apache.lucene.search.Similarity;
import org.apache.lucene.util.StringHelper;
import org.apache.lucene.util.AttributeImpl;
import org.apache.lucene.util.BitVector;
/**
* This class, similar to {@link org.apache.lucene.index.IndexWriter}, has no locking mechanism.
@ -406,6 +407,18 @@ public class InstantiatedIndexWriter implements Closeable {
termDocumentInformationFactoryByDocument.clear();
fieldNameBuffer.clear();
// update deleted documents bitset
if (index.getDeletedDocuments() != null) {
BitVector deletedDocuments = new BitVector(index.getDocumentsByNumber().length);
for (int i = 0; i < index.getDeletedDocuments().size(); i++) {
if (index.getDeletedDocuments().get(i)) {
deletedDocuments.set(i);
}
}
index.setDeletedDocuments(deletedDocuments);
}
index.setVersion(System.currentTimeMillis());
// todo unlock

View File

@ -60,11 +60,14 @@ public class InstantiatedTermDocs
currentDocumentIndex++;
if (currentDocumentIndex < currentTerm.getAssociatedDocuments().length) {
currentDocumentInformation = currentTerm.getAssociatedDocuments()[currentDocumentIndex];
if (reader.hasDeletions() && reader.isDeleted(currentDocumentInformation.getDocument().getDocumentNumber())) {
if (reader.isDeleted(currentDocumentInformation.getDocument().getDocumentNumber())) {
return next();
} else {
return true;
}
} else {
// mimic SegmentTermDocs
currentDocumentIndex = currentTerm.getAssociatedDocuments().length -1;
}
}
return false;
@ -111,6 +114,8 @@ public class InstantiatedTermDocs
int pos = currentTerm.seekCeilingDocumentInformationIndex(target, startOffset);
if (pos == -1) {
// mimic SegmentTermDocs that positions at the last index
currentDocumentIndex = currentTerm.getAssociatedDocuments().length -1;
return false;
}

View File

@ -41,8 +41,6 @@ import org.apache.lucene.index.TermPositionVector;
import org.apache.lucene.index.TermPositions;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.util.AttributeImpl;
/**
@ -104,53 +102,144 @@ public class TestIndicesEquals extends TestCase {
}
instantiatedIndexWriter.close();
testEqualBehaviour(dir, ii);
testTermDocs(dir, ii);
}
private void testTermDocs(Directory aprioriIndex, InstantiatedIndex testIndex) throws Exception {
private void testTermDocsSomeMore(Directory aprioriIndex, InstantiatedIndex testIndex) throws Exception {
IndexReader aprioriReader = IndexReader.open(aprioriIndex, false);
IndexReader testReader = testIndex.indexReaderFactory();
TermEnum aprioriTermEnum = aprioriReader.terms(new Term("c", "danny"));
// test seek
TermDocs aprioriTermDocs = aprioriReader.termDocs(aprioriTermEnum.term());
TermDocs testTermDocs = testReader.termDocs(aprioriTermEnum.term());
Term t = new Term("c", "danny");
TermEnum aprioriTermEnum = aprioriReader.terms(t);
TermEnum testTermEnum = testReader.terms(t);
assertEquals(aprioriTermEnum.term(), testTermEnum.term());
t = aprioriTermEnum.term();
aprioriTermEnum.close();
testTermEnum.close();
TermDocs aprioriTermDocs = aprioriReader.termDocs(t);
TermDocs testTermDocs = testReader.termDocs(t);
assertEquals(aprioriTermDocs.next(), testTermDocs.next());
assertEquals(aprioriTermDocs.freq(), testTermDocs.freq());
assertEquals(aprioriTermDocs.doc(), testTermDocs.doc());
assertEquals(aprioriTermDocs.skipTo(100), testTermDocs.skipTo(100));
assertEquals(aprioriTermDocs.doc(), testTermDocs.doc());
if (aprioriTermDocs.skipTo(4)) {
assertTrue(testTermDocs.skipTo(4));
assertEquals(aprioriTermDocs.freq(), testTermDocs.freq());
assertEquals(aprioriTermDocs.doc(), testTermDocs.doc());
} else {
assertFalse(testTermDocs.skipTo(4));
}
assertEquals(aprioriTermDocs.next(), testTermDocs.next());
assertEquals(aprioriTermDocs.doc(), testTermDocs.doc());
if (aprioriTermDocs.next()) {
assertTrue(testTermDocs.next());
assertEquals(aprioriTermDocs.freq(), testTermDocs.freq());
assertEquals(aprioriTermDocs.doc(), testTermDocs.doc());
} else {
assertFalse(testTermDocs.next());
}
assertEquals(aprioriTermDocs.next(), testTermDocs.next());
assertEquals(aprioriTermDocs.doc(), testTermDocs.doc());
assertEquals(aprioriTermDocs.skipTo(110), testTermDocs.skipTo(110));
assertEquals(aprioriTermDocs.doc(), testTermDocs.doc());
// beyond this point all next and skipto will return false
assertEquals(aprioriTermDocs.skipTo(10), testTermDocs.skipTo(10));
assertEquals(aprioriTermDocs.doc(), testTermDocs.doc());
if (aprioriTermDocs.skipTo(100)) {
assertTrue(testTermDocs.skipTo(100));
assertEquals(aprioriTermDocs.freq(), testTermDocs.freq());
assertEquals(aprioriTermDocs.doc(), testTermDocs.doc());
} else {
assertFalse(testTermDocs.skipTo(100));
}
assertEquals(aprioriTermDocs.skipTo(210), testTermDocs.skipTo(210));
assertEquals(aprioriTermDocs.doc(), testTermDocs.doc());
if (aprioriTermDocs.next()) {
assertTrue(testTermDocs.next());
assertEquals(aprioriTermDocs.freq(), testTermDocs.freq());
assertEquals(aprioriTermDocs.doc(), testTermDocs.doc());
} else {
assertFalse(testTermDocs.next());
}
if (aprioriTermDocs.skipTo(110)) {
assertTrue(testTermDocs.skipTo(110));
assertEquals(aprioriTermDocs.freq(), testTermDocs.freq());
assertEquals(aprioriTermDocs.doc(), testTermDocs.doc());
} else {
assertFalse(testTermDocs.skipTo(110));
}
if (aprioriTermDocs.skipTo(10)) {
assertTrue(testTermDocs.skipTo(10));
assertEquals(aprioriTermDocs.freq(), testTermDocs.freq());
assertEquals(aprioriTermDocs.doc(), testTermDocs.doc());
} else {
assertFalse(testTermDocs.skipTo(10));
}
if (aprioriTermDocs.skipTo(210)) {
assertTrue(testTermDocs.skipTo(210));
assertEquals(aprioriTermDocs.freq(), testTermDocs.freq());
assertEquals(aprioriTermDocs.doc(), testTermDocs.doc());
} else {
assertFalse(testTermDocs.skipTo(210));
}
aprioriTermDocs.close();
aprioriReader.close();
testTermDocs.close();
// test seek null (AllTermDocs)
aprioriTermDocs = aprioriReader.termDocs(null);
testTermDocs = testReader.termDocs(null);
while (aprioriTermDocs.next()) {
assertTrue(testTermDocs.next());
assertEquals(aprioriTermDocs.freq(), testTermDocs.freq());
assertEquals(aprioriTermDocs.doc(), testTermDocs.doc());
}
assertFalse(testTermDocs.next());
aprioriTermDocs.close();
testTermDocs.close();
// test seek default
aprioriTermDocs = aprioriReader.termDocs();
testTermDocs = testReader.termDocs();
// todo consider seeking and skipping some too
while (aprioriTermDocs.next()) {
assertTrue(testTermDocs.next());
assertEquals(aprioriTermDocs.freq(), testTermDocs.freq());
assertEquals(aprioriTermDocs.doc(), testTermDocs.doc());
}
assertFalse(testTermDocs.next());
aprioriTermDocs.close();
testTermDocs.close();
// clean up
aprioriReader.close();
testReader.close();
}
private void assembleDocument(Document document, int i) {
document.add(new Field("a", i + " Do you really want to go and live in that house all winter?", Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS));
if (i > 0) {
@ -213,18 +302,57 @@ public class TestIndicesEquals extends TestCase {
*/
protected void testEqualBehaviour(Directory aprioriIndex, InstantiatedIndex testIndex) throws Exception {
testEquals(aprioriIndex, testIndex);
// delete a few documents
IndexReader ir = IndexReader.open(aprioriIndex, false);
ir.deleteDocument(3);
ir.deleteDocument(8);
ir.close();
// delete a few documents
IndexReader air = IndexReader.open(aprioriIndex, false);
InstantiatedIndexReader tir = testIndex.indexReaderFactory();
assertEquals(air.isCurrent(), tir.isCurrent());
assertEquals(air.hasDeletions(), tir.hasDeletions());
assertEquals(air.maxDoc(), tir.maxDoc());
assertEquals(air.numDocs(), tir.numDocs());
assertEquals(air.numDeletedDocs(), tir.numDeletedDocs());
air.deleteDocument(3);
tir.deleteDocument(3);
assertEquals(air.isCurrent(), tir.isCurrent());
assertEquals(air.hasDeletions(), tir.hasDeletions());
assertEquals(air.maxDoc(), tir.maxDoc());
assertEquals(air.numDocs(), tir.numDocs());
assertEquals(air.numDeletedDocs(), tir.numDeletedDocs());
air.deleteDocument(8);
tir.deleteDocument(8);
assertEquals(air.isCurrent(), tir.isCurrent());
assertEquals(air.hasDeletions(), tir.hasDeletions());
assertEquals(air.maxDoc(), tir.maxDoc());
assertEquals(air.numDocs(), tir.numDocs());
assertEquals(air.numDeletedDocs(), tir.numDeletedDocs());
// this (in 3.0) commits the deletions
air.close();
tir.close();
air = IndexReader.open(aprioriIndex, false);
tir = testIndex.indexReaderFactory();
assertEquals(air.isCurrent(), tir.isCurrent());
assertEquals(air.hasDeletions(), tir.hasDeletions());
assertEquals(air.maxDoc(), tir.maxDoc());
assertEquals(air.numDocs(), tir.numDocs());
assertEquals(air.numDeletedDocs(), tir.numDeletedDocs());
for (int d =0; d<air.maxDoc(); d++) {
assertEquals(air.isDeleted(d), tir.isDeleted(d));
}
air.close();
tir.close();
ir = testIndex.indexReaderFactory();
ir.deleteDocument(3);
ir.deleteDocument(8);
ir.close();
// make sure they still equal
testEquals(aprioriIndex, testIndex);
@ -232,6 +360,8 @@ public class TestIndicesEquals extends TestCase {
protected void testEquals(Directory aprioriIndex, InstantiatedIndex testIndex) throws Exception {
testTermDocsSomeMore(aprioriIndex, testIndex);
IndexReader aprioriReader = IndexReader.open(aprioriIndex, false);
IndexReader testReader = testIndex.indexReaderFactory();

View File

@ -18,69 +18,19 @@
package org.apache.lucene.index;
import org.apache.lucene.util.BitVector;
import java.io.IOException;
class AllTermDocs implements TermDocs {
class AllTermDocs extends AbstractAllTermDocs {
protected BitVector deletedDocs;
protected int maxDoc;
protected int doc = -1;
protected AllTermDocs(SegmentReader parent) {
super(parent.maxDoc());
synchronized (parent) {
this.deletedDocs = parent.deletedDocs;
}
this.maxDoc = parent.maxDoc();
}
public void seek(Term term) throws IOException {
if (term==null) {
doc = -1;
} else {
throw new UnsupportedOperationException();
}
}
public void seek(TermEnum termEnum) throws IOException {
throw new UnsupportedOperationException();
}
public int doc() {
return doc;
}
public int freq() {
return 1;
}
public boolean next() throws IOException {
return skipTo(doc+1);
}
public int read(int[] docs, int[] freqs) throws IOException {
final int length = docs.length;
int i = 0;
while (i < length && doc < maxDoc) {
if (deletedDocs == null || !deletedDocs.get(doc)) {
docs[i] = doc;
freqs[i] = 1;
++i;
}
doc++;
}
return i;
}
public boolean skipTo(int target) throws IOException {
doc = target;
while (doc < maxDoc) {
if (deletedDocs == null || !deletedDocs.get(doc)) {
return true;
}
doc++;
}
return false;
}
public void close() throws IOException {
public boolean isDeleted(int doc) {
  // No deletions bit vector means no document has been deleted.
  if (deletedDocs == null) {
    return false;
  }
  return deletedDocs.get(doc);
}
}

View File

@ -907,7 +907,7 @@ public class SegmentReader extends IndexReader implements Cloneable {
}
/**
* @see IndexReader#getFieldNames(IndexReader.FieldOption fldOption)
* @see IndexReader#getFieldNames(org.apache.lucene.index.IndexReader.FieldOption)
*/
@Override
public Collection<String> getFieldNames(IndexReader.FieldOption fieldOption) {