LUCENE-4598: Change PayloadIterator to not use top-level reader API

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1419397 13f79535-47bb-0310-9956-ffa450edef68
Shai Erera 2012-12-10 12:58:28 +00:00
parent e01ed8f674
commit dffaebd814
4 changed files with 94 additions and 84 deletions

@@ -299,6 +299,9 @@ Optimizations
 * LUCENE-4580: DrillDown.query variants return a ConstantScoreQuery with boost set to 0.0f
   so that documents scores are not affected by running a drill-down query. (Shai Erera)
 
+* LUCENE-4598: PayloadIterator no longer uses top-level IndexReader to iterate on the
+  posting's payload. (Shai Erera, Michael McCandless)
+
 Documentation
 
 * LUCENE-4483: Refer to BytesRef.deepCopyOf in Term's constructor that takes BytesRef.
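
The gist of the change: instead of one postings enum over the whole index obtained through the top-level MultiFields API (which materializes a merged view of all segments), the iterator now walks each segment (leaf) separately and re-bases document IDs by the leaf's docBase. A minimal before/after sketch against the Lucene 4.x API; indexReader and term are placeholders, and the real code is in PayloadIterator below:

    // before: a single enum via the top-level reader API
    Bits liveDocs = MultiFields.getLiveDocs(indexReader);
    DocsAndPositionsEnum dpe = MultiFields.getTermPositionsEnum(indexReader,
        liveDocs, term.field(), term.bytes(), DocsAndPositionsEnum.FLAG_PAYLOADS);

    // after: one enum per segment; a leaf's doc IDs are local, so the
    // global doc ID is ctx.docBase + leafDPE.docID()
    for (AtomicReaderContext ctx : indexReader.leaves()) {
      Terms terms = ctx.reader().terms(term.field());
      if (terms == null) continue;
      TermsEnum te = terms.iterator(null);
      if (!te.seekExact(term.bytes(), true)) continue;
      DocsAndPositionsEnum leafDPE = te.docsAndPositions(null, null, DocsAndPositionsEnum.FLAG_PAYLOADS);
      // ... iterate leafDPE within this segment ...
    }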

@@ -52,12 +52,10 @@ public class EnhancementsPayloadIterator extends PayloadIterator {
    * The category term to iterate.
    * @throws IOException If there is a low-level I/O error.
    */
-  public EnhancementsPayloadIterator(
-      List<CategoryEnhancement> enhancementsList,
+  public EnhancementsPayloadIterator(List<CategoryEnhancement> enhancementsList,
       IndexReader indexReader, Term term) throws IOException {
     super(indexReader, term);
-    EnhancedCategories = enhancementsList
-        .toArray(new CategoryEnhancement[enhancementsList.size()]);
+    EnhancedCategories = enhancementsList.toArray(new CategoryEnhancement[enhancementsList.size()]);
     enhancementLength = new int[EnhancedCategories.length];
     enhancementStart = new int[EnhancedCategories.length];
   }
@@ -69,10 +67,10 @@ public class EnhancementsPayloadIterator extends PayloadIterator {
     }
 
     // read header - number of enhancements and their lengths
-    Position position = new Position();
-    nEnhancements = Vint8.decode(buffer, position);
+    Position position = new Position(data.offset);
+    nEnhancements = Vint8.decode(data.bytes, position);
     for (int i = 0; i < nEnhancements; i++) {
-      enhancementLength[i] = Vint8.decode(buffer, position);
+      enhancementLength[i] = Vint8.decode(data.bytes, position);
     }
 
     // set enhancements start points
@@ -96,7 +94,7 @@ public class EnhancementsPayloadIterator extends PayloadIterator {
   public Object getCategoryData(CategoryEnhancement enhancedCategory) {
     for (int i = 0; i < nEnhancements; i++) {
       if (enhancedCategory.equals(EnhancedCategories[i])) {
-        return enhancedCategory.extractCategoryTokenData(buffer,
+        return enhancedCategory.extractCategoryTokenData(data.bytes,
             enhancementStart[i], enhancementLength[i]);
       }
     }
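
Seeding Position with data.offset matters because the payload's BytesRef may point into a shared byte[] in which the payload does not start at index 0; decoding from position 0 would read the wrong bytes. A small sketch of the pattern, assuming pi is a PayloadIterator already positioned on a document:

    BytesRef data = pi.getPayload();                    // payload of the current document
    Position pos = new Position(data.offset);           // decode from the payload's offset,
    int nEnhancements = Vint8.decode(data.bytes, pos);  // not from index 0 of the array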

@@ -4,7 +4,7 @@ import java.io.IOException;
 
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.Term;
 
+import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.UnsafeByteArrayInputStream;
 import org.apache.lucene.util.encoding.IntDecoder;
@@ -61,14 +61,8 @@ public class PayloadIntDecodingIterator implements CategoryListIterator {
   private final PayloadIterator pi;
   private final int hashCode;
 
-  public PayloadIntDecodingIterator(IndexReader indexReader, Term term, IntDecoder decoder)
-      throws IOException {
-    this(indexReader, term, decoder, new byte[1024]);
-  }
-
-  public PayloadIntDecodingIterator(IndexReader indexReader, Term term, IntDecoder decoder,
-      byte[] buffer) throws IOException {
-    pi = new PayloadIterator(indexReader, term, buffer);
+  public PayloadIntDecodingIterator(IndexReader indexReader, Term term, IntDecoder decoder) throws IOException {
+    pi = new PayloadIterator(indexReader, term);
     ubais = new UnsafeByteArrayInputStream();
     this.decoder = decoder;
     hashCode = indexReader.hashCode() ^ term.hashCode();
@@ -95,21 +89,25 @@ public class PayloadIntDecodingIterator implements CategoryListIterator {
     return hashCode;
   }
 
+  @Override
   public boolean init() throws IOException {
     return pi.init();
   }
 
+  @Override
   public long nextCategory() throws IOException {
     return decoder.decode();
   }
 
+  @Override
   public boolean skipTo(int docId) throws IOException {
     if (!pi.setdoc(docId)) {
       return false;
     }
 
     // Initializing the decoding mechanism with the new payload data
-    ubais.reInit(pi.getBuffer(), 0, pi.getPayloadLength());
+    BytesRef data = pi.getPayload();
+    ubais.reInit(data.bytes, data.offset, data.length + data.offset);
     decoder.reInit(ubais);
     return true;
   }
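
With getBuffer()/getPayloadLength() gone, skipTo() now hands the decoder the payload's BytesRef directly, honoring its offset rather than assuming the bytes start at index 0. A hedged usage sketch of the surrounding CategoryListIterator contract; indexReader, term, decoder, and docId are placeholders, and it is assumed here that nextCategory() signals exhaustion by returning a value greater than Integer.MAX_VALUE:

    CategoryListIterator cli = new PayloadIntDecodingIterator(indexReader, term, decoder);
    if (cli.init() && cli.skipTo(docId)) {
      long ordinal;
      while ((ordinal = cli.nextCategory()) <= Integer.MAX_VALUE) {
        // process one category ordinal of this document
      }
    }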

@@ -1,13 +1,16 @@
 package org.apache.lucene.facet.search;
 
 import java.io.IOException;
+import java.util.Iterator;
 
+import org.apache.lucene.index.AtomicReaderContext;
 import org.apache.lucene.index.DocsAndPositionsEnum;
+import org.apache.lucene.index.Fields;
 import org.apache.lucene.index.IndexReader;
-import org.apache.lucene.index.MultiFields;
 import org.apache.lucene.index.Term;
+import org.apache.lucene.index.Terms;
+import org.apache.lucene.index.TermsEnum;
 import org.apache.lucene.search.DocIdSetIterator;
-import org.apache.lucene.util.Bits;
 import org.apache.lucene.util.BytesRef;
 
 /*
@@ -29,40 +32,63 @@ import org.apache.lucene.util.BytesRef;
 
 /**
  * A utility class for iterating through a posting list of a given term and
- * retrieving the payload of the first occurrence in every document. Comes with
- * its own working space (buffer).
+ * retrieving the payload of the first position in every document. For
+ * efficiency, this class does not check if documents passed to
+ * {@link #setdoc(int)} are deleted, since it is usually used to iterate on
+ * payloads of documents that matched a query. If you need to skip over deleted
+ * documents, you should do so before calling {@link #setdoc(int)}.
  *
  * @lucene.experimental
  */
 public class PayloadIterator {
 
-  protected byte[] buffer;
-  protected int payloadLength;
+  protected BytesRef data;
 
-  DocsAndPositionsEnum tp;
+  private TermsEnum reuseTE;
+  private DocsAndPositionsEnum currentDPE;
   private boolean hasMore;
+  private int curDocID, curDocBase;
+
+  private final Iterator<AtomicReaderContext> leaves;
+  private final Term term;
 
-  public PayloadIterator(IndexReader indexReader, Term term)
-      throws IOException {
-    this(indexReader, term, new byte[1024]);
+  public PayloadIterator(IndexReader indexReader, Term term) throws IOException {
+    leaves = indexReader.leaves().iterator();
+    this.term = term;
   }
 
-  public PayloadIterator(IndexReader indexReader, Term term, byte[] buffer)
-      throws IOException {
-    this.buffer = buffer;
-    // TODO (Facet): avoid Multi*?
-    Bits liveDocs = MultiFields.getLiveDocs(indexReader);
-    this.tp = MultiFields.getTermPositionsEnum(indexReader, liveDocs, term.field(), term.bytes(), DocsAndPositionsEnum.FLAG_PAYLOADS);
+  private void nextSegment() throws IOException {
+    hasMore = false;
+    while (leaves.hasNext()) {
+      AtomicReaderContext ctx = leaves.next();
+      curDocBase = ctx.docBase;
+      Fields fields = ctx.reader().fields();
+      if (fields != null) {
+        Terms terms = fields.terms(term.field());
+        if (terms != null) {
+          reuseTE = terms.iterator(reuseTE);
+          if (reuseTE.seekExact(term.bytes(), true)) {
+            // this class is usually used to iterate on whatever a Query matched
+            // if it didn't match deleted documents, we won't receive them. if it
+            // did, we should iterate on them too, therefore we pass liveDocs=null
+            currentDPE = reuseTE.docsAndPositions(null, currentDPE, DocsAndPositionsEnum.FLAG_PAYLOADS);
+            if (currentDPE != null && (curDocID = currentDPE.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
+              hasMore = true;
+              break;
+            }
+          }
+        }
+      }
+    }
   }
 
   /**
-   * (re)initialize the iterator. Should be done before the first call to
-   * {@link #setdoc(int)}. Returns false if there is no category list found
-   * (no setdoc() will never return true).
+   * Initialize the iterator. Should be done before the first call to
+   * {@link #setdoc(int)}. Returns {@code false} if no category list is found,
+   * or the category list has no documents.
    */
   public boolean init() throws IOException {
-    hasMore = tp != null && tp.nextDoc() != DocIdSetIterator.NO_MORE_DOCS;
+    nextSegment();
     return hasMore;
   }
@@ -77,59 +103,44 @@ public class PayloadIterator {
     if (!hasMore) {
       return false;
     }
 
-    if (tp.docID() > docId) {
+    // re-basing docId->localDocID is done fewer times than currentDoc->globalDoc
+    int localDocID = docId - curDocBase;
+
+    if (curDocID > localDocID) {
+      // document does not exist
       return false;
     }
 
-    // making sure we have the requested document
-    if (tp.docID() < docId) {
-      // Skipping to requested document
-      if (tp.advance(docId) == DocIdSetIterator.NO_MORE_DOCS) {
-        this.hasMore = false;
-        return false;
-      }
-
-      // If document not found (skipped to much)
-      if (tp.docID() != docId) {
+    if (curDocID < localDocID) {
+      // look for the document either in that segment, or others
+      while (hasMore && (curDocID = currentDPE.advance(localDocID)) == DocIdSetIterator.NO_MORE_DOCS) {
+        nextSegment(); // also updates curDocID
+        localDocID = docId - curDocBase;
+        // nextSegment advances to nextDoc, so check if we still need to advance
+        if (curDocID >= localDocID) {
+          break;
+        }
+      }
+
+      // we break from the above loop when:
+      // 1. we iterated over all segments (hasMore=false)
+      // 2. current segment advanced to a doc, either requested or higher
+      if (!hasMore || curDocID != localDocID) {
         return false;
       }
     }
 
-    // Prepare for payload extraction
-    tp.nextPosition();
-
-    BytesRef br = tp.getPayload();
-
-    if (br == null) {
-      return false;
-    }
-
-    assert br.length > 0;
-    this.payloadLength = br.length;
-
-    if (this.payloadLength > this.buffer.length) {
-      // Growing if necessary.
-      this.buffer = new byte[this.payloadLength * 2 + 1];
-    }
-
-    // Loading the payload
-    System.arraycopy(br.bytes, br.offset, this.buffer, 0, payloadLength);
-
-    return true;
+    // we're on the document
+    assert currentDPE.freq() == 1 : "expecting freq=1 (got " + currentDPE.freq() + ") term=" + term + " doc=" + (curDocID + curDocBase);
+    int pos = currentDPE.nextPosition();
+    assert pos != -1 : "no positions for term=" + term + " doc=" + (curDocID + curDocBase);
+    data = currentDPE.getPayload();
+    return data != null;
   }
 
-  /**
-   * Get the buffer with the content of the last read payload.
-   */
-  public byte[] getBuffer() {
-    return buffer;
-  }
-
-  /**
-   * Get the length of the last read payload.
-   */
-  public int getPayloadLength() {
-    return payloadLength;
-  }
+  public BytesRef getPayload() {
+    return data;
+  }
 }
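
Because the leaves iterator and the per-segment enums only move forward, setdoc() must be called with global doc IDs in increasing order, as a collector would produce them. A minimal consumption sketch under the new contract; indexReader, term, and the source of doc IDs are placeholders:

    PayloadIterator pi = new PayloadIterator(indexReader, term);
    if (pi.init()) {
      for (int doc : docsInIncreasingOrder) {  // hypothetical, e.g. docs matched by a query
        if (pi.setdoc(doc)) {
          BytesRef payload = pi.getPayload();
          // valid bytes are payload.bytes[payload.offset .. payload.offset + payload.length)
        }
      }
    }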