
LUCENE-2380: add getTermsEnum for FieldCache entry and use it for fieldcache merging in Solr faceting

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@954904 13f79535-47bb-0310-9956-ffa450edef68
commit 98b1313d34
parent 3915a393f3
Author: Yonik Seeley
Date:   2010-06-15 14:15:58 +00:00

6 changed files with 167 additions and 16 deletions
Changed files:
  lucene/CHANGES.txt
  lucene/src/java/org/apache/lucene/search/FieldCache.java
  lucene/src/java/org/apache/lucene/search/FieldCacheImpl.java
  lucene/src/java/org/apache/lucene/util/PagedBytes.java
  lucene/src/test/org/apache/lucene/search/TestFieldCache.java
  solr/src/java/org/apache/solr/request/PerSegmentSingleValuedFaceting.java

@@ -79,7 +79,7 @@ Changes in backwards compatibility policy
(getTerms, getTermsIndex). Also, the sort values (returned in
FieldDoc.fields) when sorting by SortField.STRING or
SortField.STRING_VAL are now BytesRef instances. See MIGRATE.txt
-  for more details. (Mike McCandless)
+  for more details. (yonik, Mike McCandless)
* LUCENE-2480: Though not a change in backwards compatibility policy, pre-3.0
indexes are no longer supported. You should upgrade to 3.x first, then run

@@ -18,6 +18,7 @@ package org.apache.lucene.search;
*/
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.NumericUtils;
import org.apache.lucene.util.RamUsageEstimator;
import org.apache.lucene.util.BytesRef;
@@ -530,6 +531,9 @@ public interface FieldCache {
/** Number of documents */
public abstract int size();
/** Returns a TermsEnum that can iterate over the values in this index entry */
public abstract TermsEnum getTermsEnum();
}
/** Checks the internal cache for an appropriate entry, and if none
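For context, a minimal sketch of how a caller might use the new accessor. It assumes the entry type is FieldCache.DocTermsIndex and relies only on names that appear in this commit (getTermsIndex, getTermsEnum, lookup, ord); the wrapper class is illustrative, not part of the change.

import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.FieldCache;
import org.apache.lucene.util.BytesRef;

public class CachedTermsIteration {
  /** Prints every cached term for a field, in ord order. */
  public static void dumpTerms(IndexReader reader, String field) throws Exception {
    FieldCache.DocTermsIndex termsIndex = FieldCache.DEFAULT.getTermsIndex(reader, field);
    TermsEnum tenum = termsIndex.getTermsEnum();
    // Ord 0 is reserved for documents with no term, so next() hands back
    // ord 1 first (the test below iterates from i=1 for the same reason).
    for (BytesRef term = tenum.next(); term != null; term = tenum.next()) {
      System.out.println(tenum.ord() + " -> " + term.utf8ToString());
    }
  }
}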

@@ -19,17 +19,9 @@ package org.apache.lucene.search;
import java.io.IOException;
import java.io.PrintStream;
-import java.util.ArrayList;
-import java.util.HashMap;
-import java.util.List;
-import java.util.Map;
-import java.util.WeakHashMap;
+import java.util.*;
-import org.apache.lucene.index.IndexReader;
-import org.apache.lucene.index.DocsEnum;
-import org.apache.lucene.index.Terms;
-import org.apache.lucene.index.MultiFields;
-import org.apache.lucene.index.TermsEnum;
+import org.apache.lucene.index.*;
import org.apache.lucene.util.PagedBytes;
import org.apache.lucene.util.packed.PackedInts;
import org.apache.lucene.util.packed.GrowableWriter;
@@ -674,6 +666,105 @@ class FieldCacheImpl implements FieldCache {
public BytesRef lookup(int ord, BytesRef ret) {
return bytes.fillUsingLengthPrefix(ret, termOrdToBytesOffset.get(ord));
}
@Override
public TermsEnum getTermsEnum() {
return this.new DocTermsIndexEnum();
}
class DocTermsIndexEnum extends TermsEnum {
int currentOrd;
int currentBlockNumber;
int end; // end position in the current block
final byte[][] blocks;
final int[] blockEnds;
final BytesRef term = new BytesRef();
public DocTermsIndexEnum() {
currentOrd = 0;
currentBlockNumber = 0;
blocks = bytes.getBlocks();
blockEnds = bytes.getBlockEnds();
currentBlockNumber = bytes.fillUsingLengthPrefix2(term, termOrdToBytesOffset.get(0));
end = blockEnds[currentBlockNumber];
}
@Override
public SeekStatus seek(BytesRef text, boolean useCache) throws IOException {
// TODO - we can support with binary search
throw new UnsupportedOperationException();
}
@Override
public SeekStatus seek(long ord) throws IOException {
assert(ord >= 0 && ord <= numOrd);
// TODO: if gap is small, could iterate from current position? Or let user decide that?
currentBlockNumber = bytes.fillUsingLengthPrefix2(term, termOrdToBytesOffset.get((int)ord));
end = blockEnds[currentBlockNumber];
currentOrd = (int)ord;
return SeekStatus.FOUND;
}
@Override
public BytesRef next() throws IOException {
int start = term.offset + term.length;
if (start >= end) {
// switch byte blocks
if (currentBlockNumber +1 >= blocks.length) {
return null;
}
currentBlockNumber++;
term.bytes = blocks[currentBlockNumber];
end = blockEnds[currentBlockNumber];
start = 0;
if (end<=0) return null; // special case of empty last array
}
currentOrd++;
byte[] block = term.bytes;
if ((block[start] & 128) == 0) {
term.length = block[start];
term.offset = start+1;
} else {
term.length = (((int) (block[start] & 0x7f)) << 8) | (block[1+start] & 0xff);
term.offset = start+2;
}
return term;
}
@Override
public BytesRef term() throws IOException {
return term;
}
@Override
public long ord() throws IOException {
return currentOrd;
}
@Override
public int docFreq() {
throw new UnsupportedOperationException();
}
@Override
public DocsEnum docs(Bits skipDocs, DocsEnum reuse) throws IOException {
throw new UnsupportedOperationException();
}
@Override
public DocsAndPositionsEnum docsAndPositions(Bits skipDocs, DocsAndPositionsEnum reuse) throws IOException {
throw new UnsupportedOperationException();
}
@Override
public Comparator<BytesRef> getComparator() throws IOException {
throw new UnsupportedOperationException();
}
}
}
private static boolean DEFAULT_FASTER_BUT_MORE_RAM = true;
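The 1-or-2-byte length prefix that next() decodes above (and that fillUsingLengthPrefix2 reads below) is simple enough to show standalone. A hedged sketch with illustrative names: lengths below 128 take a single byte; longer values set the high bit and spread a 15-bit length over two bytes.

public class LengthPrefix {
  /** Writes a prefix for length (must be < 32768); returns bytes used. */
  static int write(byte[] block, int pos, int length) {
    if (length < 128) {
      block[pos] = (byte) length;                  // one byte, high bit clear
      return 1;
    } else {
      block[pos] = (byte) (0x80 | (length >> 8));  // high bit set + upper 7 bits
      block[pos + 1] = (byte) (length & 0xff);     // lower 8 bits
      return 2;
    }
  }

  /** Mirrors the decode in DocTermsIndexEnum.next(): [0]=length, [1]=prefix size. */
  static int[] read(byte[] block, int pos) {
    if ((block[pos] & 128) == 0) {
      return new int[] { block[pos], 1 };
    }
    return new int[] { ((block[pos] & 0x7f) << 8) | (block[pos + 1] & 0xff), 2 };
  }
}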

@@ -32,6 +32,7 @@ import java.io.IOException;
* <p>@lucene.internal</p>*/
public final class PagedBytes {
private final List<byte[]> blocks = new ArrayList<byte[]>();
private final List<Integer> blockEnd = new ArrayList<Integer>();
private final int blockSize;
private final int blockBits;
private final int blockMask;
@@ -42,6 +43,7 @@ public final class PagedBytes {
public final static class Reader implements Closeable {
private final byte[][] blocks;
private final int[] blockEnds;
private final int blockBits;
private final int blockMask;
private final int blockSize;
@@ -52,6 +54,10 @@ public final class PagedBytes {
for(int i=0;i<blocks.length;i++) {
blocks[i] = pagedBytes.blocks.get(i);
}
blockEnds = new int[blocks.length];
for(int i=0;i< blockEnds.length;i++) {
blockEnds[i] = pagedBytes.blockEnd.get(i);
}
blockBits = pagedBytes.blockBits;
blockMask = pagedBytes.blockMask;
blockSize = pagedBytes.blockSize;
@@ -102,6 +108,34 @@ public final class PagedBytes {
return b;
}
/** @lucene.internal Reads length as 1 or 2 byte vInt prefix, starting @ start. Returns the block number of the term. */
public int fillUsingLengthPrefix2(BytesRef b, long start) {
final int index = (int) (start >> blockBits);
final int offset = (int) (start & blockMask);
final byte[] block = b.bytes = blocks[index];
if ((block[offset] & 128) == 0) {
b.length = block[offset];
b.offset = offset+1;
} else {
b.length = (((int) (block[offset] & 0x7f)) << 8) | (block[1+offset] & 0xff);
b.offset = offset+2;
assert b.length > 0;
}
return index;
}
/** @lucene.internal */
public byte[][] getBlocks() {
return blocks;
}
/** @lucene.internal */
public int[] getBlockEnds() {
return blockEnds;
}
public void close() {
threadBuffers.close();
}
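The index/offset arithmetic in fillUsingLengthPrefix2 relies on blockSize being a power of two: the high bits of a global byte offset select the block, the low bits select the position inside it. A small illustrative sketch (the class and method names are assumptions, not PagedBytes API):

public class PagedAddress {
  final int blockBits;  // log2 of the block size
  final int blockMask;  // blockSize - 1, masks out the in-block offset

  PagedAddress(int blockBits) {
    this.blockBits = blockBits;
    this.blockMask = (1 << blockBits) - 1;
  }

  int blockIndex(long start)    { return (int) (start >> blockBits); }
  int inBlockOffset(long start) { return (int) (start & blockMask); }
}

For example, with blockBits = 15 (32 KB blocks), global offset 70000 resolves to block 2 at offset 4464.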
@@ -123,6 +157,7 @@ public final class PagedBytes {
if (left == 0) {
if (currentBlock != null) {
blocks.add(currentBlock);
blockEnd.add(upto);
}
currentBlock = new byte[blockSize];
upto = 0;
@@ -149,6 +184,7 @@ public final class PagedBytes {
if (left == 0) {
if (currentBlock != null) {
blocks.add(currentBlock);
blockEnd.add(upto);
}
currentBlock = new byte[blockSize];
upto = 0;
@@ -178,6 +214,7 @@ public final class PagedBytes {
currentBlock = EMPTY_BYTES;
}
blocks.add(currentBlock);
blockEnd.add(upto);
currentBlock = null;
return new Reader(this);
}
@@ -200,6 +237,7 @@ public final class PagedBytes {
}
if (currentBlock != null) {
blocks.add(currentBlock);
blockEnd.add(upto);
}
currentBlock = new byte[blockSize];
upto = 0;
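Why blockEnd is recorded at each of these points: a value and its length prefix are always read from a single block (fillUsingLengthPrefix2 fills b.bytes from exactly one block), so a block can end with unused tail bytes. An iterator like DocTermsIndexEnum needs the per-block end to know when to hop blocks (its next() checks start >= end). A hedged sketch of a consumer, assuming only the getBlocks()/getBlockEnds() accessors added above:

/** Sums the valid bytes across all blocks (illustrative helper). */
static long validBytes(byte[][] blocks, int[] blockEnds) {
  long total = 0;
  for (int i = 0; i < blocks.length; i++) {
    total += blockEnds[i];  // bytes actually written into block i; any tail is unused
  }
  return total;
}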

@@ -22,6 +22,7 @@ import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.store.MockRAMDirectory;
import org.apache.lucene.store.Directory;
@@ -167,6 +168,20 @@ public class TestFieldCache extends LuceneTestCase {
final String s = term == null ? null : term.utf8ToString();
assertTrue("for doc " + i + ": " + s + " does not equal: " + unicodeStrings[i], unicodeStrings[i] == null || unicodeStrings[i].equals(s));
}
int nTerms = termsIndex.numOrd();
// System.out.println("nTerms="+nTerms);
TermsEnum tenum = termsIndex.getTermsEnum();
BytesRef val = new BytesRef();
for (int i=1; i<nTerms; i++) {
BytesRef val1 = tenum.next();
BytesRef val2 = termsIndex.lookup(i,val);
// System.out.println("i="+i);
assertEquals(val2, val1);
}
// test bad field
termsIndex = cache.getTermsIndex(reader, "bogusfield");
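The loop above checks next() against lookup(); the new seek-by-ord path could be spot-checked the same way. A hypothetical extension of the test, relying only on seek(long) as implemented in this commit:

TermsEnum tenum2 = termsIndex.getTermsEnum();
BytesRef expected = new BytesRef();
for (int i = 1; i < nTerms; i += 7) {  // sample a few ords rather than all
  tenum2.seek(i);                      // positions directly on ord i
  assertEquals(termsIndex.lookup(i, expected), tenum2.term());
}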

@@ -1,5 +1,6 @@
package org.apache.solr.request;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.DocIdSet;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.FieldCache;
@@ -137,7 +138,9 @@ class PerSegmentSingleValuedFaceting {
seg.pos = seg.startTermIndex;
}
if (seg.pos < seg.endTermIndex) {
-seg.si.lookup(seg.pos, seg.tempBR);
+seg.tenum = seg.si.getTermsEnum();
+seg.tenum.seek(seg.pos);
+seg.tempBR = seg.tenum.term();
queue.add(seg);
}
}
@@ -156,7 +159,6 @@
SegFacet seg = queue.top();
// make a shallow copy
// Is this always safe? Or could the byte[] be changed?
val.bytes = seg.tempBR.bytes;
val.offset = seg.tempBR.offset;
val.length = seg.tempBR.length;
@@ -173,7 +175,7 @@
queue.pop();
seg = queue.top();
} else {
-seg.si.lookup(seg.pos, seg.tempBR);
+seg.tempBR = seg.tenum.next();
seg = queue.updateTop();
}
} while (seg != null && val.compareTo(seg.tempBR) == 0);
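The control flow above is a classic k-way merge: each segment's TermsEnum is positioned on its first term, a priority queue keyed on the current term pops the smallest, and equal terms from other segments are drained before advancing. A simplified, hypothetical sketch of that shape (count accumulation elided; none of these names are the Solr ones):

import java.io.IOException;
import java.util.Comparator;
import java.util.PriorityQueue;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.BytesRef;

class SegCursor {
  TermsEnum tenum;
  BytesRef current;  // term this cursor is positioned on

  boolean advance() throws IOException {
    current = tenum.next();  // null when the segment is exhausted
    return current != null;
  }
}

class TermMerger {
  final PriorityQueue<SegCursor> queue = new PriorityQueue<SegCursor>(16,
      new Comparator<SegCursor>() {
        public int compare(SegCursor a, SegCursor b) {
          return a.current.compareTo(b.current);
        }
      });

  /** Returns the next distinct term across all segments, or null when done. */
  BytesRef next() throws IOException {
    SegCursor cur = queue.poll();
    if (cur == null) return null;
    // Deep-copy the term: the enum reuses its internal bytes on advance
    // (the same hazard the shallow-copy comment above asks about).
    BytesRef term = new BytesRef();
    term.bytes = new byte[cur.current.length];
    System.arraycopy(cur.current.bytes, cur.current.offset, term.bytes, 0, cur.current.length);
    term.length = cur.current.length;
    while (true) {
      if (cur.advance()) {
        queue.add(cur);  // reinsert at its next (strictly greater) term
      }
      SegCursor top = queue.peek();
      if (top == null || term.compareTo(top.current) != 0) {
        return term;
      }
      cur = queue.poll();  // another segment holds the same term; drain it too
    }
  }
}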
@@ -215,9 +217,10 @@
int endTermIndex;
int[] counts;
-int pos; // only used during merge with other segments
+int pos; // only used when merging
+TermsEnum tenum; // only used when merging
-final BytesRef tempBR = new BytesRef();
+BytesRef tempBR = new BytesRef();
void countTerms() throws IOException {
si = FieldCache.DEFAULT.getTermsIndex(reader, fieldName);