mirror of https://github.com/apache/lucene.git
LUCENE-4598: Change PayloadIterator to not use top-level reader API
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1419397 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
e01ed8f674
commit
dffaebd814
|
@ -299,6 +299,9 @@ Optimizations
|
||||||
* LUCENE-4580: DrillDown.query variants return a ConstantScoreQuery with boost set to 0.0f
|
* LUCENE-4580: DrillDown.query variants return a ConstantScoreQuery with boost set to 0.0f
|
||||||
so that document scores are not affected by running a drill-down query. (Shai Erera)
|
so that document scores are not affected by running a drill-down query. (Shai Erera)
|
||||||
|
|
||||||
|
* LUCENE-4598: PayloadIterator no longer uses top-level IndexReader to iterate on the
|
||||||
|
posting's payload. (Shai Erera, Michael McCandless)
|
||||||
|
|
||||||
Documentation
|
Documentation
|
||||||
|
|
||||||
* LUCENE-4483: Refer to BytesRef.deepCopyOf in Term's constructor that takes BytesRef.
|
* LUCENE-4483: Refer to BytesRef.deepCopyOf in Term's constructor that takes BytesRef.
|
||||||
|
|
|
@ -52,12 +52,10 @@ public class EnhancementsPayloadIterator extends PayloadIterator {
|
||||||
* The category term to iterate.
|
* The category term to iterate.
|
||||||
* @throws IOException If there is a low-level I/O error.
|
* @throws IOException If there is a low-level I/O error.
|
||||||
*/
|
*/
|
||||||
public EnhancementsPayloadIterator(
|
public EnhancementsPayloadIterator(List<CategoryEnhancement> enhancementsList,
|
||||||
List<CategoryEnhancement> enhancementsList,
|
|
||||||
IndexReader indexReader, Term term) throws IOException {
|
IndexReader indexReader, Term term) throws IOException {
|
||||||
super(indexReader, term);
|
super(indexReader, term);
|
||||||
EnhancedCategories = enhancementsList
|
EnhancedCategories = enhancementsList.toArray(new CategoryEnhancement[enhancementsList.size()]);
|
||||||
.toArray(new CategoryEnhancement[enhancementsList.size()]);
|
|
||||||
enhancementLength = new int[EnhancedCategories.length];
|
enhancementLength = new int[EnhancedCategories.length];
|
||||||
enhancementStart = new int[EnhancedCategories.length];
|
enhancementStart = new int[EnhancedCategories.length];
|
||||||
}
|
}
|
||||||
|
@ -69,10 +67,10 @@ public class EnhancementsPayloadIterator extends PayloadIterator {
|
||||||
}
|
}
|
||||||
|
|
||||||
// read header - number of enhancements and their lengths
|
// read header - number of enhancements and their lengths
|
||||||
Position position = new Position();
|
Position position = new Position(data.offset);
|
||||||
nEnhancements = Vint8.decode(buffer, position);
|
nEnhancements = Vint8.decode(data.bytes, position);
|
||||||
for (int i = 0; i < nEnhancements; i++) {
|
for (int i = 0; i < nEnhancements; i++) {
|
||||||
enhancementLength[i] = Vint8.decode(buffer, position);
|
enhancementLength[i] = Vint8.decode(data.bytes, position);
|
||||||
}
|
}
|
||||||
|
|
||||||
// set enhancements start points
|
// set enhancements start points
|
||||||
|
@ -96,7 +94,7 @@ public class EnhancementsPayloadIterator extends PayloadIterator {
|
||||||
public Object getCategoryData(CategoryEnhancement enhancedCategory) {
|
public Object getCategoryData(CategoryEnhancement enhancedCategory) {
|
||||||
for (int i = 0; i < nEnhancements; i++) {
|
for (int i = 0; i < nEnhancements; i++) {
|
||||||
if (enhancedCategory.equals(EnhancedCategories[i])) {
|
if (enhancedCategory.equals(EnhancedCategories[i])) {
|
||||||
return enhancedCategory.extractCategoryTokenData(buffer,
|
return enhancedCategory.extractCategoryTokenData(data.bytes,
|
||||||
enhancementStart[i], enhancementLength[i]);
|
enhancementStart[i], enhancementLength[i]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -4,7 +4,7 @@ import java.io.IOException;
|
||||||
|
|
||||||
import org.apache.lucene.index.IndexReader;
|
import org.apache.lucene.index.IndexReader;
|
||||||
import org.apache.lucene.index.Term;
|
import org.apache.lucene.index.Term;
|
||||||
|
import org.apache.lucene.util.BytesRef;
|
||||||
import org.apache.lucene.util.UnsafeByteArrayInputStream;
|
import org.apache.lucene.util.UnsafeByteArrayInputStream;
|
||||||
import org.apache.lucene.util.encoding.IntDecoder;
|
import org.apache.lucene.util.encoding.IntDecoder;
|
||||||
|
|
||||||
|
@ -61,14 +61,8 @@ public class PayloadIntDecodingIterator implements CategoryListIterator {
|
||||||
private final PayloadIterator pi;
|
private final PayloadIterator pi;
|
||||||
private final int hashCode;
|
private final int hashCode;
|
||||||
|
|
||||||
public PayloadIntDecodingIterator(IndexReader indexReader, Term term, IntDecoder decoder)
|
public PayloadIntDecodingIterator(IndexReader indexReader, Term term, IntDecoder decoder) throws IOException {
|
||||||
throws IOException {
|
pi = new PayloadIterator(indexReader, term);
|
||||||
this(indexReader, term, decoder, new byte[1024]);
|
|
||||||
}
|
|
||||||
|
|
||||||
public PayloadIntDecodingIterator(IndexReader indexReader, Term term, IntDecoder decoder,
|
|
||||||
byte[] buffer) throws IOException {
|
|
||||||
pi = new PayloadIterator(indexReader, term, buffer);
|
|
||||||
ubais = new UnsafeByteArrayInputStream();
|
ubais = new UnsafeByteArrayInputStream();
|
||||||
this.decoder = decoder;
|
this.decoder = decoder;
|
||||||
hashCode = indexReader.hashCode() ^ term.hashCode();
|
hashCode = indexReader.hashCode() ^ term.hashCode();
|
||||||
|
@ -95,21 +89,25 @@ public class PayloadIntDecodingIterator implements CategoryListIterator {
|
||||||
return hashCode;
|
return hashCode;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
public boolean init() throws IOException {
|
public boolean init() throws IOException {
|
||||||
return pi.init();
|
return pi.init();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
public long nextCategory() throws IOException {
|
public long nextCategory() throws IOException {
|
||||||
return decoder.decode();
|
return decoder.decode();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
public boolean skipTo(int docId) throws IOException {
|
public boolean skipTo(int docId) throws IOException {
|
||||||
if (!pi.setdoc(docId)) {
|
if (!pi.setdoc(docId)) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Initializing the decoding mechanism with the new payload data
|
// Initializing the decoding mechanism with the new payload data
|
||||||
ubais.reInit(pi.getBuffer(), 0, pi.getPayloadLength());
|
BytesRef data = pi.getPayload();
|
||||||
|
ubais.reInit(data.bytes, data.offset, data.length + data.offset);
|
||||||
decoder.reInit(ubais);
|
decoder.reInit(ubais);
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,13 +1,16 @@
|
||||||
package org.apache.lucene.facet.search;
|
package org.apache.lucene.facet.search;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
|
import java.util.Iterator;
|
||||||
|
|
||||||
|
import org.apache.lucene.index.AtomicReaderContext;
|
||||||
import org.apache.lucene.index.DocsAndPositionsEnum;
|
import org.apache.lucene.index.DocsAndPositionsEnum;
|
||||||
|
import org.apache.lucene.index.Fields;
|
||||||
import org.apache.lucene.index.IndexReader;
|
import org.apache.lucene.index.IndexReader;
|
||||||
import org.apache.lucene.index.MultiFields;
|
|
||||||
import org.apache.lucene.index.Term;
|
import org.apache.lucene.index.Term;
|
||||||
|
import org.apache.lucene.index.Terms;
|
||||||
|
import org.apache.lucene.index.TermsEnum;
|
||||||
import org.apache.lucene.search.DocIdSetIterator;
|
import org.apache.lucene.search.DocIdSetIterator;
|
||||||
import org.apache.lucene.util.Bits;
|
|
||||||
import org.apache.lucene.util.BytesRef;
|
import org.apache.lucene.util.BytesRef;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
@ -29,40 +32,63 @@ import org.apache.lucene.util.BytesRef;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* A utility class for iterating through a posting list of a given term and
|
* A utility class for iterating through a posting list of a given term and
|
||||||
* retrieving the payload of the first occurrence in every document. Comes with
|
* retrieving the payload of the first position in every document. For
|
||||||
* its own working space (buffer).
|
* efficiency, this class does not check if documents passed to
|
||||||
|
* {@link #setdoc(int)} are deleted, since it is usually used to iterate on
|
||||||
|
* payloads of documents that matched a query. If you need to skip over deleted
|
||||||
|
* documents, you should do so before calling {@link #setdoc(int)}.
|
||||||
*
|
*
|
||||||
* @lucene.experimental
|
* @lucene.experimental
|
||||||
*/
|
*/
|
||||||
public class PayloadIterator {
|
public class PayloadIterator {
|
||||||
|
|
||||||
protected byte[] buffer;
|
protected BytesRef data;
|
||||||
protected int payloadLength;
|
|
||||||
|
|
||||||
DocsAndPositionsEnum tp;
|
|
||||||
|
|
||||||
|
private TermsEnum reuseTE;
|
||||||
|
private DocsAndPositionsEnum currentDPE;
|
||||||
private boolean hasMore;
|
private boolean hasMore;
|
||||||
|
private int curDocID, curDocBase;
|
||||||
|
|
||||||
|
private final Iterator<AtomicReaderContext> leaves;
|
||||||
|
private final Term term;
|
||||||
|
|
||||||
public PayloadIterator(IndexReader indexReader, Term term)
|
public PayloadIterator(IndexReader indexReader, Term term) throws IOException {
|
||||||
throws IOException {
|
leaves = indexReader.leaves().iterator();
|
||||||
this(indexReader, term, new byte[1024]);
|
this.term = term;
|
||||||
}
|
}
|
||||||
|
|
||||||
public PayloadIterator(IndexReader indexReader, Term term, byte[] buffer)
|
private void nextSegment() throws IOException {
|
||||||
throws IOException {
|
hasMore = false;
|
||||||
this.buffer = buffer;
|
while (leaves.hasNext()) {
|
||||||
// TODO (Facet): avoid Multi*?
|
AtomicReaderContext ctx = leaves.next();
|
||||||
Bits liveDocs = MultiFields.getLiveDocs(indexReader);
|
curDocBase = ctx.docBase;
|
||||||
this.tp = MultiFields.getTermPositionsEnum(indexReader, liveDocs, term.field(), term.bytes(), DocsAndPositionsEnum.FLAG_PAYLOADS);
|
Fields fields = ctx.reader().fields();
|
||||||
|
if (fields != null) {
|
||||||
|
Terms terms = fields.terms(term.field());
|
||||||
|
if (terms != null) {
|
||||||
|
reuseTE = terms.iterator(reuseTE);
|
||||||
|
if (reuseTE.seekExact(term.bytes(), true)) {
|
||||||
|
// this class is usually used to iterate on whatever a Query matched
|
||||||
|
// if it didn't match deleted documents, we won't receive them. if it
|
||||||
|
// did, we should iterate on them too, therefore we pass liveDocs=null
|
||||||
|
currentDPE = reuseTE.docsAndPositions(null, currentDPE, DocsAndPositionsEnum.FLAG_PAYLOADS);
|
||||||
|
if (currentDPE != null && (curDocID = currentDPE.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
|
||||||
|
hasMore = true;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* (re)initialize the iterator. Should be done before the first call to
|
* Initialize the iterator. Should be done before the first call to
|
||||||
* {@link #setdoc(int)}. Returns false if there is no category list found
|
* {@link #setdoc(int)}. Returns {@code false} if no category list is found,
|
||||||
* (no setdoc() will never return true).
|
* or the category list has no documents.
|
||||||
*/
|
*/
|
||||||
public boolean init() throws IOException {
|
public boolean init() throws IOException {
|
||||||
hasMore = tp != null && tp.nextDoc() != DocIdSetIterator.NO_MORE_DOCS;
|
nextSegment();
|
||||||
return hasMore;
|
return hasMore;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -77,59 +103,44 @@ public class PayloadIterator {
|
||||||
if (!hasMore) {
|
if (!hasMore) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// re-basing docId->localDocID is done fewer times than currentDoc->globalDoc
|
||||||
|
int localDocID = docId - curDocBase;
|
||||||
|
|
||||||
if (tp.docID() > docId) {
|
if (curDocID > localDocID) {
|
||||||
|
// document does not exist
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
// making sure we have the requested document
|
if (curDocID < localDocID) {
|
||||||
if (tp.docID() < docId) {
|
// look for the document either in that segment, or others
|
||||||
// Skipping to requested document
|
while (hasMore && (curDocID = currentDPE.advance(localDocID)) == DocIdSetIterator.NO_MORE_DOCS) {
|
||||||
if (tp.advance(docId) == DocIdSetIterator.NO_MORE_DOCS) {
|
nextSegment(); // also updates curDocID
|
||||||
this.hasMore = false;
|
localDocID = docId - curDocBase;
|
||||||
return false;
|
// nextSegment advances to nextDoc, so check if we still need to advance
|
||||||
|
if (curDocID >= localDocID) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// If document not found (skipped too much)
|
// we break from the above loop when:
|
||||||
if (tp.docID() != docId) {
|
// 1. we iterated over all segments (hasMore=false)
|
||||||
|
// 2. current segment advanced to a doc, either requested or higher
|
||||||
|
if (!hasMore || curDocID != localDocID) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Prepare for payload extraction
|
// we're on the document
|
||||||
tp.nextPosition();
|
assert currentDPE.freq() == 1 : "expecting freq=1 (got " + currentDPE.freq() + ") term=" + term + " doc=" + (curDocID + curDocBase);
|
||||||
|
int pos = currentDPE.nextPosition();
|
||||||
BytesRef br = tp.getPayload();
|
assert pos != -1 : "no positions for term=" + term + " doc=" + (curDocID + curDocBase);
|
||||||
|
data = currentDPE.getPayload();
|
||||||
if (br == null) {
|
return data != null;
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
assert br.length > 0;
|
|
||||||
|
|
||||||
this.payloadLength = br.length;
|
|
||||||
|
|
||||||
if (this.payloadLength > this.buffer.length) {
|
|
||||||
// Growing if necessary.
|
|
||||||
this.buffer = new byte[this.payloadLength * 2 + 1];
|
|
||||||
}
|
|
||||||
// Loading the payload
|
|
||||||
System.arraycopy(br.bytes, br.offset, this.buffer, 0, payloadLength);
|
|
||||||
return true;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
public BytesRef getPayload() {
|
||||||
* Get the buffer with the content of the last read payload.
|
return data;
|
||||||
*/
|
|
||||||
public byte[] getBuffer() {
|
|
||||||
return buffer;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Get the length of the last read payload.
|
|
||||||
*/
|
|
||||||
public int getPayloadLength() {
|
|
||||||
return payloadLength;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue