LUCENE-4598: Change PayloadIterator to not use top-level reader API

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1419397 13f79535-47bb-0310-9956-ffa450edef68
Shai Erera 2012-12-10 12:58:28 +00:00
parent e01ed8f674
commit dffaebd814
4 changed files with 94 additions and 84 deletions

CHANGES.txt

@@ -299,6 +299,9 @@ Optimizations
 * LUCENE-4580: DrillDown.query variants return a ConstantScoreQuery with boost set to 0.0f
   so that documents scores are not affected by running a drill-down query. (Shai Erera)
 
+* LUCENE-4598: PayloadIterator no longer uses top-level IndexReader to iterate on the
+  posting's payload. (Shai Erera, Michael McCandless)
+
 Documentation
 
 * LUCENE-4483: Refer to BytesRef.deepCopyOf in Term's constructor that takes BytesRef.
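The essence of the change, before the per-file hunks: instead of pulling a single posting enum over the whole index through MultiFields (the "top-level reader API" of the title), PayloadIterator now walks the index one segment at a time via IndexReader.leaves(). A minimal sketch of that per-segment pattern, using only Lucene 4.x calls that also appear in the diffs below; the class and method names of the sketch itself are illustrative, not part of the commit:

    import java.io.IOException;

    import org.apache.lucene.index.AtomicReaderContext;
    import org.apache.lucene.index.DocsAndPositionsEnum;
    import org.apache.lucene.index.IndexReader;
    import org.apache.lucene.index.Term;
    import org.apache.lucene.index.Terms;
    import org.apache.lucene.index.TermsEnum;
    import org.apache.lucene.search.DocIdSetIterator;

    public class PerSegmentPostingsSketch {

      /** Visits the payload-carrying postings of the term, one segment at a time. */
      static void visitPostings(IndexReader reader, Term term) throws IOException {
        for (AtomicReaderContext ctx : reader.leaves()) {
          Terms terms = ctx.reader().terms(term.field());
          if (terms == null) continue; // segment does not have the field
          TermsEnum te = terms.iterator(null);
          if (!te.seekExact(term.bytes(), true)) continue; // term absent in this segment
          DocsAndPositionsEnum dpe = te.docsAndPositions(null, null, DocsAndPositionsEnum.FLAG_PAYLOADS);
          if (dpe == null) continue; // field was indexed without positions
          for (int doc = dpe.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = dpe.nextDoc()) {
            // ctx.docBase + doc is the top-level (index-wide) document ID
          }
        }
      }
    }

The top-level MultiFields view hides segment boundaries behind an on-the-fly merge of postings, which is convenient but costly; iterating the leaves directly avoids that overhead, which is the point of this commit.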

EnhancementsPayloadIterator.java

@@ -52,12 +52,10 @@ public class EnhancementsPayloadIterator extends PayloadIterator {
    *          The category term to iterate.
    * @throws IOException If there is a low-level I/O error.
    */
-  public EnhancementsPayloadIterator(
-      List<CategoryEnhancement> enhancementsList,
+  public EnhancementsPayloadIterator(List<CategoryEnhancement> enhancementsList,
       IndexReader indexReader, Term term) throws IOException {
     super(indexReader, term);
-    EnhancedCategories = enhancementsList
-        .toArray(new CategoryEnhancement[enhancementsList.size()]);
+    EnhancedCategories = enhancementsList.toArray(new CategoryEnhancement[enhancementsList.size()]);
     enhancementLength = new int[EnhancedCategories.length];
     enhancementStart = new int[EnhancedCategories.length];
   }
@@ -69,10 +67,10 @@ public class EnhancementsPayloadIterator extends PayloadIterator {
     }
 
     // read header - number of enhancements and their lengths
-    Position position = new Position();
-    nEnhancements = Vint8.decode(buffer, position);
+    Position position = new Position(data.offset);
+    nEnhancements = Vint8.decode(data.bytes, position);
     for (int i = 0; i < nEnhancements; i++) {
-      enhancementLength[i] = Vint8.decode(buffer, position);
+      enhancementLength[i] = Vint8.decode(data.bytes, position);
     }
 
     // set enhancements start points
@@ -96,7 +94,7 @@ public class EnhancementsPayloadIterator extends PayloadIterator {
   public Object getCategoryData(CategoryEnhancement enhancedCategory) {
     for (int i = 0; i < nEnhancements; i++) {
       if (enhancedCategory.equals(EnhancedCategories[i])) {
-        return enhancedCategory.extractCategoryTokenData(buffer,
+        return enhancedCategory.extractCategoryTokenData(data.bytes,
             enhancementStart[i], enhancementLength[i]);
       }
     }
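For context on the header decode above: the category payload starts with a Vint8-encoded count of enhancements followed by one Vint8 length per enhancement, with the enhancement bytes following the header. Because the payload now arrives as a BytesRef that may point into a shared byte array, decoding has to start at data.offset rather than 0, which is exactly what the new Position(data.offset) accomplishes. A sketch of reading that header with the same Vint8/Position helpers as the surrounding code (variable names are illustrative):

    // layout: [nEnhancements][len_0]...[len_{n-1}][bytes_0]...[bytes_{n-1}]
    Position pos = new Position(data.offset);      // honor the BytesRef offset
    int n = Vint8.decode(data.bytes, pos);         // number of enhancements
    int[] lengths = new int[n];
    for (int i = 0; i < n; i++) {
      lengths[i] = Vint8.decode(data.bytes, pos);  // payload length of enhancement i
    }
    // pos is now positioned on the first enhancement's bytes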

PayloadIntDecodingIterator.java

@@ -4,7 +4,7 @@ import java.io.IOException;
 
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.Term;
+import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.UnsafeByteArrayInputStream;
 import org.apache.lucene.util.encoding.IntDecoder;
@@ -61,14 +61,8 @@ public class PayloadIntDecodingIterator implements CategoryListIterator {
   private final PayloadIterator pi;
   private final int hashCode;
 
-  public PayloadIntDecodingIterator(IndexReader indexReader, Term term, IntDecoder decoder)
-      throws IOException {
-    this(indexReader, term, decoder, new byte[1024]);
-  }
-
-  public PayloadIntDecodingIterator(IndexReader indexReader, Term term, IntDecoder decoder,
-      byte[] buffer) throws IOException {
-    pi = new PayloadIterator(indexReader, term, buffer);
+  public PayloadIntDecodingIterator(IndexReader indexReader, Term term, IntDecoder decoder) throws IOException {
+    pi = new PayloadIterator(indexReader, term);
     ubais = new UnsafeByteArrayInputStream();
     this.decoder = decoder;
     hashCode = indexReader.hashCode() ^ term.hashCode();
@@ -95,21 +89,25 @@ public class PayloadIntDecodingIterator implements CategoryListIterator {
     return hashCode;
   }
 
   @Override
   public boolean init() throws IOException {
     return pi.init();
   }
 
   @Override
   public long nextCategory() throws IOException {
     return decoder.decode();
   }
 
   @Override
   public boolean skipTo(int docId) throws IOException {
     if (!pi.setdoc(docId)) {
       return false;
     }
 
     // Initializing the decoding mechanism with the new payload data
-    ubais.reInit(pi.getBuffer(), 0, pi.getPayloadLength());
+    BytesRef data = pi.getPayload();
+    ubais.reInit(data.bytes, data.offset, data.length + data.offset);
     decoder.reInit(ubais);
     return true;
   }
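A usage sketch for the class above, showing why skipTo() no longer needs a caller-supplied buffer: the payload now comes back from PayloadIterator as a BytesRef. The reader, term, decoder and matchingDocIds here are assumptions for the sketch, as is the CategoryListIterator contract of this era, under which nextCategory() returns a value above Integer.MAX_VALUE once the current document's categories are exhausted:

    // Hypothetical usage, not part of the commit.
    PayloadIntDecodingIterator cli = new PayloadIntDecodingIterator(reader, term, decoder);
    if (cli.init()) {                    // false -> no category list in this index
      for (int docId : matchingDocIds) { // must be passed in increasing order
        if (!cli.skipTo(docId)) {
          continue;                      // document has no category payload
        }
        long ordinal;
        while ((ordinal = cli.nextCategory()) <= Integer.MAX_VALUE) {
          // consume the category ordinal for docId
        }
      }
    }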

PayloadIterator.java

@@ -1,13 +1,16 @@
 package org.apache.lucene.facet.search;
 
 import java.io.IOException;
+import java.util.Iterator;
 
+import org.apache.lucene.index.AtomicReaderContext;
 import org.apache.lucene.index.DocsAndPositionsEnum;
+import org.apache.lucene.index.Fields;
 import org.apache.lucene.index.IndexReader;
-import org.apache.lucene.index.MultiFields;
 import org.apache.lucene.index.Term;
+import org.apache.lucene.index.Terms;
+import org.apache.lucene.index.TermsEnum;
 import org.apache.lucene.search.DocIdSetIterator;
-import org.apache.lucene.util.Bits;
 import org.apache.lucene.util.BytesRef;
 
 /*
@@ -29,40 +32,63 @@ import org.apache.lucene.util.BytesRef;
 
 /**
  * A utility class for iterating through a posting list of a given term and
- * retrieving the payload of the first occurrence in every document. Comes with
- * its own working space (buffer).
+ * retrieving the payload of the first position in every document. For
+ * efficiency, this class does not check if documents passed to
+ * {@link #setdoc(int)} are deleted, since it is usually used to iterate on
+ * payloads of documents that matched a query. If you need to skip over deleted
+ * documents, you should do so before calling {@link #setdoc(int)}.
  * 
  * @lucene.experimental
  */
 public class PayloadIterator {
 
-  protected byte[] buffer;
-  protected int payloadLength;
-
-  DocsAndPositionsEnum tp;
+  protected BytesRef data;
+
+  private TermsEnum reuseTE;
+  private DocsAndPositionsEnum currentDPE;
   private boolean hasMore;
+  private int curDocID, curDocBase;
+
+  private final Iterator<AtomicReaderContext> leaves;
+  private final Term term;
 
-  public PayloadIterator(IndexReader indexReader, Term term)
-      throws IOException {
-    this(indexReader, term, new byte[1024]);
+  public PayloadIterator(IndexReader indexReader, Term term) throws IOException {
+    leaves = indexReader.leaves().iterator();
+    this.term = term;
   }
 
-  public PayloadIterator(IndexReader indexReader, Term term, byte[] buffer)
-      throws IOException {
-    this.buffer = buffer;
-    // TODO (Facet): avoid Multi*?
-    Bits liveDocs = MultiFields.getLiveDocs(indexReader);
-    this.tp = MultiFields.getTermPositionsEnum(indexReader, liveDocs, term.field(), term.bytes(), DocsAndPositionsEnum.FLAG_PAYLOADS);
+  private void nextSegment() throws IOException {
+    hasMore = false;
+    while (leaves.hasNext()) {
+      AtomicReaderContext ctx = leaves.next();
+      curDocBase = ctx.docBase;
+      Fields fields = ctx.reader().fields();
+      if (fields != null) {
+        Terms terms = fields.terms(term.field());
+        if (terms != null) {
+          reuseTE = terms.iterator(reuseTE);
+          if (reuseTE.seekExact(term.bytes(), true)) {
+            // this class is usually used to iterate on whatever a Query matched
+            // if it didn't match deleted documents, we won't receive them. if it
+            // did, we should iterate on them too, therefore we pass liveDocs=null
+            currentDPE = reuseTE.docsAndPositions(null, currentDPE, DocsAndPositionsEnum.FLAG_PAYLOADS);
+            if (currentDPE != null && (curDocID = currentDPE.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
+              hasMore = true;
+              break;
+            }
+          }
+        }
+      }
+    }
   }
 
   /**
-   * (re)initialize the iterator. Should be done before the first call to
-   * {@link #setdoc(int)}. Returns false if there is no category list found
-   * (no setdoc() will never return true).
+   * Initialize the iterator. Should be done before the first call to
+   * {@link #setdoc(int)}. Returns {@code false} if no category list is found,
+   * or the category list has no documents.
    */
   public boolean init() throws IOException {
-    hasMore = tp != null && tp.nextDoc() != DocIdSetIterator.NO_MORE_DOCS;
+    nextSegment();
     return hasMore;
   }
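Taken together, the constructor, nextSegment() and init() give the iterator a strictly forward, segment-by-segment lifecycle: init() parks the iterator on the first segment that actually contains the term, and each setdoc() call (next hunk) may only move ahead of the previous one. A small usage sketch; reader, term and globalDocID are assumptions, not part of the commit:

    PayloadIterator pi = new PayloadIterator(reader, term);
    if (pi.init()) {                 // false -> term absent from every segment
      if (pi.setdoc(globalDocID)) {  // top-level doc IDs, increasing across calls
        BytesRef payload = pi.getPayload();
        // valid bytes: payload.bytes[payload.offset .. payload.offset + payload.length)
      }
    }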
@@ -77,59 +103,44 @@ public class PayloadIterator {
     if (!hasMore) {
       return false;
     }
 
-    if (tp.docID() > docId) {
+    // re-basing docId->localDocID is done fewer times than currentDoc->globalDoc
+    int localDocID = docId - curDocBase;
+
+    if (curDocID > localDocID) {
       // document does not exist
       return false;
     }
 
-    // making sure we have the requested document
-    if (tp.docID() < docId) {
-      // Skipping to requested document
-      if (tp.advance(docId) == DocIdSetIterator.NO_MORE_DOCS) {
-        this.hasMore = false;
-        return false;
+    if (curDocID < localDocID) {
+      // look for the document either in that segment, or others
+      while (hasMore && (curDocID = currentDPE.advance(localDocID)) == DocIdSetIterator.NO_MORE_DOCS) {
+        nextSegment(); // also updates curDocID
+        localDocID = docId - curDocBase;
+        // nextSegment advances to nextDoc, so check if we still need to advance
+        if (curDocID >= localDocID) {
+          break;
+        }
       }
 
-      // If document not found (skipped to much)
-      if (tp.docID() != docId) {
+      // we break from the above loop when:
+      // 1. we iterated over all segments (hasMore=false)
+      // 2. current segment advanced to a doc, either requested or higher
+      if (!hasMore || curDocID != localDocID) {
         return false;
       }
     }
 
-    // Prepare for payload extraction
-    tp.nextPosition();
-    BytesRef br = tp.getPayload();
-    if (br == null) {
-      return false;
-    }
-
-    assert br.length > 0;
-    this.payloadLength = br.length;
-
-    if (this.payloadLength > this.buffer.length) {
-      // Growing if necessary.
-      this.buffer = new byte[this.payloadLength * 2 + 1];
-    }
-
-    // Loading the payload
-    System.arraycopy(br.bytes, br.offset, this.buffer, 0, payloadLength);
-
-    return true;
+    // we're on the document
+    assert currentDPE.freq() == 1 : "expecting freq=1 (got " + currentDPE.freq() + ") term=" + term + " doc=" + (curDocID + curDocBase);
+    int pos = currentDPE.nextPosition();
+    assert pos != -1 : "no positions for term=" + term + " doc=" + (curDocID + curDocBase);
+    data = currentDPE.getPayload();
+    return data != null;
   }
 
   /**
-   * Get the buffer with the content of the last read payload.
+   * Get the payload of the last document set through {@link #setdoc(int)}.
    */
-  public byte[] getBuffer() {
-    return buffer;
-  }
-
-  /**
-   * Get the length of the last read payload.
-   */
-  public int getPayloadLength() {
-    return payloadLength;
+  public BytesRef getPayload() {
+    return data;
   }
 
 }
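The re-basing at the top of setdoc() is the crux of the per-segment rewrite: callers pass top-level doc IDs, while each DocsAndPositionsEnum speaks segment-local IDs. A worked example under an assumed two-segment index; the helper and the numbers are illustrative, not part of the commit:

    // Mirrors the arithmetic in setdoc(): map a top-level doc ID to a
    // segment-local one, given that segment's docBase.
    static int toLocal(int globalDocID, int docBase) {
      return globalDocID - docBase;
    }

    // Assume seg0 holds global docs 0..99 (docBase=0) and seg1 holds
    // global docs 100..149 (docBase=100). For setdoc(120) starting in seg0:
    //   toLocal(120, 0)   == 120 -> past seg0; advance() returns NO_MORE_DOCS,
    //                               so nextSegment() moves to seg1
    //   toLocal(120, 100) == 20  -> seg1's enum is advanced to local doc 20,
    //                               which succeeds iff doc 120 contains the term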