mirror of https://github.com/apache/lucene.git
LUCENE-4598: Change PayloadIterator to not use top-level reader API
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1419397 13f79535-47bb-0310-9956-ffa450edef68
parent e01ed8f674
commit dffaebd814
@@ -299,6 +299,9 @@ Optimizations
 * LUCENE-4580: DrillDown.query variants return a ConstantScoreQuery with boost set to 0.0f
   so that documents scores are not affected by running a drill-down query. (Shai Erera)
 
+* LUCENE-4598: PayloadIterator no longer uses top-level IndexReader to iterate on the
+  posting's payload. (Shai Erera, Michael McCandless)
+
 Documentation
 
 * LUCENE-4483: Refer to BytesRef.deepCopyOf in Term's constructor that takes BytesRef.
@@ -52,12 +52,10 @@ public class EnhancementsPayloadIterator extends PayloadIterator {
    * The category term to iterate.
    * @throws IOException If there is a low-level I/O error.
    */
-  public EnhancementsPayloadIterator(
-      List<CategoryEnhancement> enhancementsList,
+  public EnhancementsPayloadIterator(List<CategoryEnhancement> enhancementsList,
       IndexReader indexReader, Term term) throws IOException {
     super(indexReader, term);
-    EnhancedCategories = enhancementsList
-        .toArray(new CategoryEnhancement[enhancementsList.size()]);
+    EnhancedCategories = enhancementsList.toArray(new CategoryEnhancement[enhancementsList.size()]);
     enhancementLength = new int[EnhancedCategories.length];
     enhancementStart = new int[EnhancedCategories.length];
   }
@@ -69,10 +67,10 @@ public class EnhancementsPayloadIterator extends PayloadIterator {
     }
 
     // read header - number of enhancements and their lengths
-    Position position = new Position();
-    nEnhancements = Vint8.decode(buffer, position);
+    Position position = new Position(data.offset);
+    nEnhancements = Vint8.decode(data.bytes, position);
     for (int i = 0; i < nEnhancements; i++) {
-      enhancementLength[i] = Vint8.decode(buffer, position);
+      enhancementLength[i] = Vint8.decode(data.bytes, position);
     }
 
     // set enhancements start points
@@ -96,7 +94,7 @@ public class EnhancementsPayloadIterator extends PayloadIterator {
   public Object getCategoryData(CategoryEnhancement enhancedCategory) {
     for (int i = 0; i < nEnhancements; i++) {
       if (enhancedCategory.equals(EnhancedCategories[i])) {
-        return enhancedCategory.extractCategoryTokenData(buffer,
+        return enhancedCategory.extractCategoryTokenData(data.bytes,
             enhancementStart[i], enhancementLength[i]);
      }
    }
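The hunks above swap the iterator's private byte[] scratch buffer for the BytesRef handed back by the postings enum, so decoding must begin at data.offset instead of index 0. A minimal sketch of that offset-aware header decode, using the Vint8/Position API visible in the hunk (the helper class is ours and the import paths are our assumption; only the Vint8, Position, and BytesRef calls are taken from the patch):

    import org.apache.lucene.util.BytesRef;
    import org.apache.lucene.util.Vint8;
    import org.apache.lucene.util.Vint8.Position;

    // Illustrative helper (not part of the patch): decode the enhancements
    // header from a BytesRef that may point into a shared byte[].
    class HeaderDecodeSketch {
      static int[] readEnhancementLengths(BytesRef data) {
        Position pos = new Position(data.offset);      // start at the payload's offset, not 0
        int nEnhancements = Vint8.decode(data.bytes, pos);
        int[] lengths = new int[nEnhancements];
        for (int i = 0; i < nEnhancements; i++) {
          lengths[i] = Vint8.decode(data.bytes, pos);  // consecutive Vint8-encoded lengths
        }
        return lengths;
      }
    }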
@@ -4,7 +4,7 @@ import java.io.IOException;
 
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.Term;
-
+import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.UnsafeByteArrayInputStream;
 import org.apache.lucene.util.encoding.IntDecoder;
 
@@ -61,14 +61,8 @@ public class PayloadIntDecodingIterator implements CategoryListIterator {
   private final PayloadIterator pi;
   private final int hashCode;
 
-  public PayloadIntDecodingIterator(IndexReader indexReader, Term term, IntDecoder decoder)
-      throws IOException {
-    this(indexReader, term, decoder, new byte[1024]);
-  }
-
-  public PayloadIntDecodingIterator(IndexReader indexReader, Term term, IntDecoder decoder,
-      byte[] buffer) throws IOException {
-    pi = new PayloadIterator(indexReader, term, buffer);
+  public PayloadIntDecodingIterator(IndexReader indexReader, Term term, IntDecoder decoder) throws IOException {
+    pi = new PayloadIterator(indexReader, term);
     ubais = new UnsafeByteArrayInputStream();
     this.decoder = decoder;
     hashCode = indexReader.hashCode() ^ term.hashCode();
@@ -95,21 +89,25 @@ public class PayloadIntDecodingIterator implements CategoryListIterator {
     return hashCode;
   }
 
+  @Override
   public boolean init() throws IOException {
     return pi.init();
   }
 
+  @Override
   public long nextCategory() throws IOException {
     return decoder.decode();
   }
 
+  @Override
   public boolean skipTo(int docId) throws IOException {
     if (!pi.setdoc(docId)) {
       return false;
     }
 
     // Initializing the decoding mechanism with the new payload data
-    ubais.reInit(pi.getBuffer(), 0, pi.getPayloadLength());
+    BytesRef data = pi.getPayload();
+    ubais.reInit(data.bytes, data.offset, data.length + data.offset);
     decoder.reInit(ubais);
     return true;
   }
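One subtlety in the skipTo() hunk: judging from the two call sites, UnsafeByteArrayInputStream.reInit takes a start index and an end index into the backing array, not an (offset, length) pair, which is why the new code passes data.length + data.offset as the third argument. A small sketch of that convention (the helper class and method are ours, purely illustrative; the reInit calls are the ones from the patch):

    import java.io.IOException;

    import org.apache.lucene.util.BytesRef;
    import org.apache.lucene.util.UnsafeByteArrayInputStream;
    import org.apache.lucene.util.encoding.IntDecoder;

    // Illustrative only: feeding a BytesRef payload to the decoder using the
    // (bytes, startIndex, endIndex) convention implied by the hunk above.
    class PayloadDecodeSketch {
      static void reInitFromPayload(UnsafeByteArrayInputStream ubais, IntDecoder decoder,
          BytesRef data) throws IOException {
        // end index = offset + length; passing data.length alone would
        // drop the tail of any payload that does not start at index 0
        ubais.reInit(data.bytes, data.offset, data.offset + data.length);
        decoder.reInit(ubais);
      }
    }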
@@ -1,13 +1,16 @@
 package org.apache.lucene.facet.search;
 
 import java.io.IOException;
+import java.util.Iterator;
 
+import org.apache.lucene.index.AtomicReaderContext;
 import org.apache.lucene.index.DocsAndPositionsEnum;
+import org.apache.lucene.index.Fields;
 import org.apache.lucene.index.IndexReader;
-import org.apache.lucene.index.MultiFields;
 import org.apache.lucene.index.Term;
+import org.apache.lucene.index.Terms;
+import org.apache.lucene.index.TermsEnum;
 import org.apache.lucene.search.DocIdSetIterator;
-import org.apache.lucene.util.Bits;
 import org.apache.lucene.util.BytesRef;
 
 /*
@@ -29,40 +32,63 @@ import org.apache.lucene.util.BytesRef;
 
 /**
  * A utility class for iterating through a posting list of a given term and
- * retrieving the payload of the first occurrence in every document. Comes with
- * its own working space (buffer).
+ * retrieving the payload of the first position in every document. For
+ * efficiency, this class does not check if documents passed to
+ * {@link #setdoc(int)} are deleted, since it is usually used to iterate on
+ * payloads of documents that matched a query. If you need to skip over deleted
+ * documents, you should do so before calling {@link #setdoc(int)}.
  *
  * @lucene.experimental
  */
 public class PayloadIterator {
 
-  protected byte[] buffer;
-  protected int payloadLength;
+  protected BytesRef data;
 
-  DocsAndPositionsEnum tp;
+  private TermsEnum reuseTE;
+  private DocsAndPositionsEnum currentDPE;
   private boolean hasMore;
+  private int curDocID, curDocBase;
 
-  public PayloadIterator(IndexReader indexReader, Term term)
-      throws IOException {
-    this(indexReader, term, new byte[1024]);
-  }
+  private final Iterator<AtomicReaderContext> leaves;
+  private final Term term;
 
-  public PayloadIterator(IndexReader indexReader, Term term, byte[] buffer)
-      throws IOException {
-    this.buffer = buffer;
-    // TODO (Facet): avoid Multi*?
-    Bits liveDocs = MultiFields.getLiveDocs(indexReader);
-    this.tp = MultiFields.getTermPositionsEnum(indexReader, liveDocs, term.field(), term.bytes(), DocsAndPositionsEnum.FLAG_PAYLOADS);
+  public PayloadIterator(IndexReader indexReader, Term term) throws IOException {
+    leaves = indexReader.leaves().iterator();
+    this.term = term;
+  }
+
+  private void nextSegment() throws IOException {
+    hasMore = false;
+    while (leaves.hasNext()) {
+      AtomicReaderContext ctx = leaves.next();
+      curDocBase = ctx.docBase;
+      Fields fields = ctx.reader().fields();
+      if (fields != null) {
+        Terms terms = fields.terms(term.field());
+        if (terms != null) {
+          reuseTE = terms.iterator(reuseTE);
+          if (reuseTE.seekExact(term.bytes(), true)) {
+            // this class is usually used to iterate on whatever a Query matched
+            // if it didn't match deleted documents, we won't receive them. if it
+            // did, we should iterate on them too, therefore we pass liveDocs=null
+            currentDPE = reuseTE.docsAndPositions(null, currentDPE, DocsAndPositionsEnum.FLAG_PAYLOADS);
+            if (currentDPE != null && (curDocID = currentDPE.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
+              hasMore = true;
+              break;
+            }
+          }
+        }
+      }
+    }
   }
 
   /**
-   * (re)initialize the iterator. Should be done before the first call to
-   * {@link #setdoc(int)}. Returns false if there is no category list found
-   * (no setdoc() will never return true).
+   * Initialize the iterator. Should be done before the first call to
+   * {@link #setdoc(int)}. Returns {@code false} if no category list is found,
+   * or the category list has no documents.
    */
   public boolean init() throws IOException {
-    hasMore = tp != null && tp.nextDoc() != DocIdSetIterator.NO_MORE_DOCS;
+    nextSegment();
     return hasMore;
   }
 
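The core of the change is visible in nextSegment() above: instead of one top-level DocsAndPositionsEnum obtained through MultiFields, the iterator now walks IndexReader.leaves(), seeks a reused TermsEnum per segment, and remembers each leaf's docBase. A condensed, self-contained sketch of that per-segment walk (the class and method names are ours; the Lucene calls are the ones used in the patch):

    import java.io.IOException;

    import org.apache.lucene.index.AtomicReaderContext;
    import org.apache.lucene.index.DocsAndPositionsEnum;
    import org.apache.lucene.index.IndexReader;
    import org.apache.lucene.index.Term;
    import org.apache.lucene.index.Terms;
    import org.apache.lucene.index.TermsEnum;
    import org.apache.lucene.search.DocIdSetIterator;

    // Condensed version of the nextSegment() walk above; illustrative only.
    class LeafWalkSketch {
      static void visitMatchingDocs(IndexReader reader, Term term) throws IOException {
        TermsEnum te = null;
        DocsAndPositionsEnum dpe = null;
        for (AtomicReaderContext ctx : reader.leaves()) {
          Terms terms = ctx.reader().terms(term.field());  // null if field absent in this segment
          if (terms == null) continue;
          te = terms.iterator(te);                         // reuse the enum across segments
          if (!te.seekExact(term.bytes(), true)) continue; // term absent in this segment
          dpe = te.docsAndPositions(null, dpe, DocsAndPositionsEnum.FLAG_PAYLOADS);
          if (dpe == null) continue;                       // field indexed without positions
          int doc;
          while ((doc = dpe.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
            int globalDoc = ctx.docBase + doc;             // rebase segment-local docID
            // ... consume dpe.getPayload() for globalDoc ...
          }
        }
      }
    }

Reusing the TermsEnum and DocsAndPositionsEnum across segments avoids reallocating them per leaf, the same reuse idiom the patch applies with reuseTE and currentDPE. The setdoc() hunk below applies this walk incrementally, one requested document at a time.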
@@ -77,59 +103,44 @@ public class PayloadIterator {
     if (!hasMore) {
       return false;
     }
 
-    if (tp.docID() > docId) {
+    // re-basing docId->localDocID is done fewer times than currentDoc->globalDoc
+    int localDocID = docId - curDocBase;
+
+    if (curDocID > localDocID) {
       // document does not exist
       return false;
     }
 
-    // making sure we have the requested document
-    if (tp.docID() < docId) {
-      // Skipping to requested document
-      if (tp.advance(docId) == DocIdSetIterator.NO_MORE_DOCS) {
-        this.hasMore = false;
-        return false;
+    if (curDocID < localDocID) {
+      // look for the document either in that segment, or others
+      while (hasMore && (curDocID = currentDPE.advance(localDocID)) == DocIdSetIterator.NO_MORE_DOCS) {
+        nextSegment(); // also updates curDocID
+        localDocID = docId - curDocBase;
+        // nextSegment advances to nextDoc, so check if we still need to advance
+        if (curDocID >= localDocID) {
+          break;
+        }
       }
 
-      // If document not found (skipped to much)
-      if (tp.docID() != docId) {
+      // we break from the above loop when:
+      // 1. we iterated over all segments (hasMore=false)
+      // 2. current segment advanced to a doc, either requested or higher
+      if (!hasMore || curDocID != localDocID) {
         return false;
       }
     }
 
-    // Prepare for payload extraction
-    tp.nextPosition();
-
-    BytesRef br = tp.getPayload();
-
-    if (br == null) {
-      return false;
-    }
-
-    assert br.length > 0;
-
-    this.payloadLength = br.length;
-
-    if (this.payloadLength > this.buffer.length) {
-      // Growing if necessary.
-      this.buffer = new byte[this.payloadLength * 2 + 1];
-    }
-    // Loading the payload
-    System.arraycopy(br.bytes, br.offset, this.buffer, 0, payloadLength);
-    return true;
+    // we're on the document
+    assert currentDPE.freq() == 1 : "expecting freq=1 (got " + currentDPE.freq() + ") term=" + term + " doc=" + (curDocID + curDocBase);
+    int pos = currentDPE.nextPosition();
+    assert pos != -1 : "no positions for term=" + term + " doc=" + (curDocID + curDocBase);
+    data = currentDPE.getPayload();
+    return data != null;
   }
 
-  /**
-   * Get the buffer with the content of the last read payload.
-   */
-  public byte[] getBuffer() {
-    return buffer;
-  }
-
-  /**
-   * Get the length of the last read payload.
-   */
-  public int getPayloadLength() {
-    return payloadLength;
+  public BytesRef getPayload() {
+    return data;
   }
 
 }
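The setdoc() hunk above is mostly docID bookkeeping: callers pass global docIDs, while each segment's postings enum speaks local docIDs that start at 0, so the method rebases with localDocID = docId - curDocBase and only ever moves forward (asking for a docID behind the current position simply returns false). A toy illustration of the rebasing invariant, with invented numbers:

    // Toy example (numbers invented): mapping a global docID into the one
    // segment that owns it, the same arithmetic setdoc() performs.
    public class DocBaseSketch {
      public static void main(String[] args) {
        int[] docBases = {0, 40, 100};  // per-segment docBase, ascending
        int[] maxDocs  = {40, 60, 25};  // documents per segment

        int globalDoc = 117;
        for (int i = docBases.length - 1; i >= 0; i--) {
          if (globalDoc >= docBases[i]) {
            int localDoc = globalDoc - docBases[i];  // 117 - 100 = 17, owned by segment 2
            System.out.println("segment=" + i + " localDoc=" + localDoc
                + " inRange=" + (localDoc < maxDocs[i]));
            break;
          }
        }
      }
    }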