LUCENE-4598: Change PayloadIterator to not use top-level reader API

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1419397 13f79535-47bb-0310-9956-ffa450edef68
Shai Erera 2012-12-10 12:58:28 +00:00
parent e01ed8f674
commit dffaebd814
4 changed files with 94 additions and 84 deletions

@@ -299,6 +299,9 @@ Optimizations
 * LUCENE-4580: DrillDown.query variants return a ConstantScoreQuery with boost set to 0.0f
   so that documents scores are not affected by running a drill-down query. (Shai Erera)
 
+* LUCENE-4598: PayloadIterator no longer uses top-level IndexReader to iterate on the
+  posting's payload. (Shai Erera, Michael McCandless)
+
 Documentation
 
 * LUCENE-4483: Refer to BytesRef.deepCopyOf in Term's constructor that takes BytesRef.
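
The gist of the change: instead of one postings enum over the whole index obtained through the top-level MultiFields API (which materializes a merged view of all segments), the iterator now walks each segment (leaf) separately and re-bases document IDs by the leaf's docBase. A minimal before/after sketch against the Lucene 4.x API; indexReader and term are placeholders, and the real code is in PayloadIterator below:

    // before: a single enum via the top-level reader API
    Bits liveDocs = MultiFields.getLiveDocs(indexReader);
    DocsAndPositionsEnum dpe = MultiFields.getTermPositionsEnum(indexReader,
        liveDocs, term.field(), term.bytes(), DocsAndPositionsEnum.FLAG_PAYLOADS);

    // after: one enum per segment; a leaf's doc IDs are local, so the
    // global doc ID is ctx.docBase + leafDPE.docID()
    for (AtomicReaderContext ctx : indexReader.leaves()) {
      Terms terms = ctx.reader().terms(term.field());
      if (terms == null) continue;
      TermsEnum te = terms.iterator(null);
      if (!te.seekExact(term.bytes(), true)) continue;
      DocsAndPositionsEnum leafDPE = te.docsAndPositions(null, null, DocsAndPositionsEnum.FLAG_PAYLOADS);
      // ... iterate leafDPE within this segment ...
    }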

@@ -52,12 +52,10 @@ public class EnhancementsPayloadIterator extends PayloadIterator {
    * The category term to iterate.
    * @throws IOException If there is a low-level I/O error.
    */
-  public EnhancementsPayloadIterator(
-      List<CategoryEnhancement> enhancementsList,
+  public EnhancementsPayloadIterator(List<CategoryEnhancement> enhancementsList,
       IndexReader indexReader, Term term) throws IOException {
     super(indexReader, term);
-    EnhancedCategories = enhancementsList
-        .toArray(new CategoryEnhancement[enhancementsList.size()]);
+    EnhancedCategories = enhancementsList.toArray(new CategoryEnhancement[enhancementsList.size()]);
     enhancementLength = new int[EnhancedCategories.length];
     enhancementStart = new int[EnhancedCategories.length];
   }
@@ -69,10 +67,10 @@ public class EnhancementsPayloadIterator extends PayloadIterator {
     }
 
     // read header - number of enhancements and their lengths
-    Position position = new Position();
-    nEnhancements = Vint8.decode(buffer, position);
+    Position position = new Position(data.offset);
+    nEnhancements = Vint8.decode(data.bytes, position);
     for (int i = 0; i < nEnhancements; i++) {
-      enhancementLength[i] = Vint8.decode(buffer, position);
+      enhancementLength[i] = Vint8.decode(data.bytes, position);
     }
 
     // set enhancements start points
@@ -96,7 +94,7 @@ public class EnhancementsPayloadIterator extends PayloadIterator {
   public Object getCategoryData(CategoryEnhancement enhancedCategory) {
     for (int i = 0; i < nEnhancements; i++) {
       if (enhancedCategory.equals(EnhancedCategories[i])) {
-        return enhancedCategory.extractCategoryTokenData(buffer,
+        return enhancedCategory.extractCategoryTokenData(data.bytes,
             enhancementStart[i], enhancementLength[i]);
       }
     }
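
Seeding Position with data.offset matters because the payload's BytesRef may point into a shared byte[] in which the payload does not start at index 0; decoding from position 0 would read the wrong bytes. A small sketch of the pattern, assuming pi is a PayloadIterator already positioned on a document:

    BytesRef data = pi.getPayload();                    // payload of the current document
    Position pos = new Position(data.offset);           // decode from the payload's offset,
    int nEnhancements = Vint8.decode(data.bytes, pos);  // not from index 0 of the array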

@@ -4,7 +4,7 @@ import java.io.IOException;
 
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.Term;
 
+import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.UnsafeByteArrayInputStream;
 import org.apache.lucene.util.encoding.IntDecoder;
@@ -61,14 +61,8 @@ public class PayloadIntDecodingIterator implements CategoryListIterator {
   private final PayloadIterator pi;
   private final int hashCode;
 
-  public PayloadIntDecodingIterator(IndexReader indexReader, Term term, IntDecoder decoder)
-      throws IOException {
-    this(indexReader, term, decoder, new byte[1024]);
-  }
-
-  public PayloadIntDecodingIterator(IndexReader indexReader, Term term, IntDecoder decoder,
-      byte[] buffer) throws IOException {
-    pi = new PayloadIterator(indexReader, term, buffer);
+  public PayloadIntDecodingIterator(IndexReader indexReader, Term term, IntDecoder decoder) throws IOException {
+    pi = new PayloadIterator(indexReader, term);
     ubais = new UnsafeByteArrayInputStream();
     this.decoder = decoder;
     hashCode = indexReader.hashCode() ^ term.hashCode();
@@ -95,21 +89,25 @@ public class PayloadIntDecodingIterator implements CategoryListIterator {
     return hashCode;
   }
 
+  @Override
   public boolean init() throws IOException {
     return pi.init();
   }
 
+  @Override
   public long nextCategory() throws IOException {
     return decoder.decode();
   }
 
+  @Override
   public boolean skipTo(int docId) throws IOException {
     if (!pi.setdoc(docId)) {
       return false;
     }
 
     // Initializing the decoding mechanism with the new payload data
-    ubais.reInit(pi.getBuffer(), 0, pi.getPayloadLength());
+    BytesRef data = pi.getPayload();
+    ubais.reInit(data.bytes, data.offset, data.length + data.offset);
     decoder.reInit(ubais);
     return true;
   }
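
With getBuffer()/getPayloadLength() gone, skipTo() now hands the decoder the payload's BytesRef directly, honoring its offset rather than assuming the bytes start at index 0. A hedged usage sketch of the surrounding CategoryListIterator contract; indexReader, term, decoder, and docId are placeholders, and it is assumed here that nextCategory() signals exhaustion by returning a value greater than Integer.MAX_VALUE:

    CategoryListIterator cli = new PayloadIntDecodingIterator(indexReader, term, decoder);
    if (cli.init() && cli.skipTo(docId)) {
      long ordinal;
      while ((ordinal = cli.nextCategory()) <= Integer.MAX_VALUE) {
        // process one category ordinal of this document
      }
    }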

@@ -1,13 +1,16 @@
 package org.apache.lucene.facet.search;
 
 import java.io.IOException;
+import java.util.Iterator;
 
+import org.apache.lucene.index.AtomicReaderContext;
 import org.apache.lucene.index.DocsAndPositionsEnum;
+import org.apache.lucene.index.Fields;
 import org.apache.lucene.index.IndexReader;
-import org.apache.lucene.index.MultiFields;
 import org.apache.lucene.index.Term;
+import org.apache.lucene.index.Terms;
+import org.apache.lucene.index.TermsEnum;
 import org.apache.lucene.search.DocIdSetIterator;
-import org.apache.lucene.util.Bits;
 import org.apache.lucene.util.BytesRef;
 
 /*
@@ -29,40 +32,63 @@ import org.apache.lucene.util.BytesRef;
 
 /**
  * A utility class for iterating through a posting list of a given term and
- * retrieving the payload of the first occurrence in every document. Comes with
- * its own working space (buffer).
+ * retrieving the payload of the first position in every document. For
+ * efficiency, this class does not check if documents passed to
+ * {@link #setdoc(int)} are deleted, since it is usually used to iterate on
+ * payloads of documents that matched a query. If you need to skip over deleted
+ * documents, you should do so before calling {@link #setdoc(int)}.
  *
  * @lucene.experimental
  */
 public class PayloadIterator {
 
-  protected byte[] buffer;
-  protected int payloadLength;
+  protected BytesRef data;
 
-  DocsAndPositionsEnum tp;
+  private TermsEnum reuseTE;
+  private DocsAndPositionsEnum currentDPE;
   private boolean hasMore;
+  private int curDocID, curDocBase;
+
+  private final Iterator<AtomicReaderContext> leaves;
+  private final Term term;
 
-  public PayloadIterator(IndexReader indexReader, Term term)
-      throws IOException {
-    this(indexReader, term, new byte[1024]);
+  public PayloadIterator(IndexReader indexReader, Term term) throws IOException {
+    leaves = indexReader.leaves().iterator();
+    this.term = term;
   }
 
-  public PayloadIterator(IndexReader indexReader, Term term, byte[] buffer)
-      throws IOException {
-    this.buffer = buffer;
-    // TODO (Facet): avoid Multi*?
-    Bits liveDocs = MultiFields.getLiveDocs(indexReader);
-    this.tp = MultiFields.getTermPositionsEnum(indexReader, liveDocs, term.field(), term.bytes(), DocsAndPositionsEnum.FLAG_PAYLOADS);
+  private void nextSegment() throws IOException {
+    hasMore = false;
+    while (leaves.hasNext()) {
+      AtomicReaderContext ctx = leaves.next();
+      curDocBase = ctx.docBase;
+      Fields fields = ctx.reader().fields();
+      if (fields != null) {
+        Terms terms = fields.terms(term.field());
+        if (terms != null) {
+          reuseTE = terms.iterator(reuseTE);
+          if (reuseTE.seekExact(term.bytes(), true)) {
+            // this class is usually used to iterate on whatever a Query matched
+            // if it didn't match deleted documents, we won't receive them. if it
+            // did, we should iterate on them too, therefore we pass liveDocs=null
+            currentDPE = reuseTE.docsAndPositions(null, currentDPE, DocsAndPositionsEnum.FLAG_PAYLOADS);
+            if (currentDPE != null && (curDocID = currentDPE.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
+              hasMore = true;
+              break;
+            }
+          }
+        }
+      }
+    }
   }
 
   /**
-   * (re)initialize the iterator. Should be done before the first call to
-   * {@link #setdoc(int)}. Returns false if there is no category list found
-   * (no setdoc() will never return true).
+   * Initialize the iterator. Should be done before the first call to
+   * {@link #setdoc(int)}. Returns {@code false} if no category list is found,
+   * or the category list has no documents.
    */
   public boolean init() throws IOException {
-    hasMore = tp != null && tp.nextDoc() != DocIdSetIterator.NO_MORE_DOCS;
+    nextSegment();
     return hasMore;
   }
@@ -77,59 +103,44 @@ public class PayloadIterator {
     if (!hasMore) {
       return false;
     }
 
-    if (tp.docID() > docId) {
+    // re-basing docId->localDocID is done fewer times than currentDoc->globalDoc
+    int localDocID = docId - curDocBase;
+
+    if (curDocID > localDocID) {
+      // document does not exist
       return false;
     }
 
-    // making sure we have the requested document
-    if (tp.docID() < docId) {
-      // Skipping to requested document
-      if (tp.advance(docId) == DocIdSetIterator.NO_MORE_DOCS) {
-        this.hasMore = false;
-        return false;
-      }
-
-      // If document not found (skipped to much)
-      if (tp.docID() != docId) {
+    if (curDocID < localDocID) {
+      // look for the document either in that segment, or others
+      while (hasMore && (curDocID = currentDPE.advance(localDocID)) == DocIdSetIterator.NO_MORE_DOCS) {
+        nextSegment(); // also updates curDocID
+        localDocID = docId - curDocBase;
+        // nextSegment advances to nextDoc, so check if we still need to advance
+        if (curDocID >= localDocID) {
+          break;
+        }
+      }
+
+      // we break from the above loop when:
+      // 1. we iterated over all segments (hasMore=false)
+      // 2. current segment advanced to a doc, either requested or higher
+      if (!hasMore || curDocID != localDocID) {
         return false;
       }
     }
 
-    // Prepare for payload extraction
-    tp.nextPosition();
-
-    BytesRef br = tp.getPayload();
-
-    if (br == null) {
-      return false;
-    }
-
-    assert br.length > 0;
-    this.payloadLength = br.length;
-
-    if (this.payloadLength > this.buffer.length) {
-      // Growing if necessary.
-      this.buffer = new byte[this.payloadLength * 2 + 1];
-    }
-
-    // Loading the payload
-    System.arraycopy(br.bytes, br.offset, this.buffer, 0, payloadLength);
-
-    return true;
+    // we're on the document
+    assert currentDPE.freq() == 1 : "expecting freq=1 (got " + currentDPE.freq() + ") term=" + term + " doc=" + (curDocID + curDocBase);
+    int pos = currentDPE.nextPosition();
+    assert pos != -1 : "no positions for term=" + term + " doc=" + (curDocID + curDocBase);
+    data = currentDPE.getPayload();
+    return data != null;
   }
 
-  /**
-   * Get the buffer with the content of the last read payload.
-   */
-  public byte[] getBuffer() {
-    return buffer;
-  }
-
-  /**
-   * Get the length of the last read payload.
-   */
-  public int getPayloadLength() {
-    return payloadLength;
-  }
+  public BytesRef getPayload() {
+    return data;
+  }
 }
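
Because the leaves iterator and the per-segment enums only move forward, setdoc() must be called with global doc IDs in increasing order, as a collector would produce them. A minimal consumption sketch under the new contract; indexReader, term, and the source of doc IDs are placeholders:

    PayloadIterator pi = new PayloadIterator(indexReader, term);
    if (pi.init()) {
      for (int doc : docsInIncreasingOrder) {  // hypothetical, e.g. docs matched by a query
        if (pi.setdoc(doc)) {
          BytesRef payload = pi.getPayload();
          // valid bytes are payload.bytes[payload.offset .. payload.offset + payload.length)
        }
      }
    }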