mirror of https://github.com/apache/lucene.git
SOLR-475: multi-valued faceting via un-inverted field
git-svn-id: https://svn.apache.org/repos/asf/lucene/solr/trunk@720403 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent b3a2445d6b
commit 156491848a
@@ -96,6 +96,11 @@ Optimizations

 2. SOLR-808: Write string keys in Maps as extern strings in the javabin format. (Noble Paul via shalin)

+3. SOLR-475: New faceting method with better performance and smaller memory usage for
+   multi-valued fields with many unique values but relatively few values per document.
+   Controllable via the facet.method parameter - "fc" is the new default method and "enum"
+   is the original method. (yonik)
+
 Bug Fixes
 ----------------------
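As a usage sketch (an editorial aside, assuming the SolrJ client of this era; the field name "cat" is hypothetical), the method can be chosen per request; "fc" is now the default, so setting the parameter explicitly only matters to force the old behavior:

    import org.apache.solr.client.solrj.SolrQuery;
    import org.apache.solr.common.params.FacetParams;

    public class FacetMethodExample {
      public static SolrQuery buildFacetQuery() {
        SolrQuery q = new SolrQuery("*:*");
        q.setFacet(true);
        q.addFacetField("cat");   // hypothetical multi-valued field
        // request the original enumeration-based method instead of the new "fc" default
        q.set(FacetParams.FACET_METHOD, FacetParams.FACET_METHOD_enum);
        return q;
      }
    }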
@@ -29,6 +29,20 @@ public interface FacetParams {
    * Should facet counts be calculated?
    */
   public static final String FACET = "facet";
+
+  /** What method should be used to do the faceting */
+  public static final String FACET_METHOD = FACET + ".method";
+
+  /** Value for FACET_METHOD param to indicate that Solr should enumerate over terms
+   * in a field to calculate the facet counts.
+   */
+  public static final String FACET_METHOD_enum = "enum";
+
+  /** Value for FACET_METHOD param to indicate that Solr should enumerate over documents
+   * and count up terms by consulting an uninverted representation of the field values
+   * (such as the FieldCache used for sorting).
+   */
+  public static final String FACET_METHOD_fc = "fc";
 
   /**
    * Any lucene formatted queries the user would like to use for
@@ -148,16 +148,32 @@ public class SimpleFacets {
     boolean sort = params.getFieldBool(field, FacetParams.FACET_SORT, limit>0);
     String prefix = params.getFieldParam(field,FacetParams.FACET_PREFIX);


     NamedList counts;
     SchemaField sf = searcher.getSchema().getField(field);
     FieldType ft = sf.getType();
-    if (sf.multiValued() || ft.isTokenized() || ft instanceof BoolField) {
+
+    // determine what type of faceting method to use
+    String method = params.getFieldParam(field, FacetParams.FACET_METHOD);
+    boolean enumMethod = FacetParams.FACET_METHOD_enum.equals(method);
+    if (method == null && ft instanceof BoolField) {
       // Always use filters for booleans... we know the number of values is very small.
+      enumMethod = true;
+    }
+    boolean multiToken = sf.multiValued() || ft.isTokenized();
+
+    // unless the enum method is explicitly specified, use a counting method.
+    if (enumMethod) {
       counts = getFacetTermEnumCounts(searcher, docs, field, offset, limit, mincount,missing,sort,prefix);
     } else {
-      // TODO: future logic could use filters instead of the fieldcache if
-      // the number of terms in the field is small enough.
-      counts = getFieldCacheCounts(searcher, docs, field, offset,limit, mincount, missing, sort, prefix);
+      if (multiToken) {
+        UnInvertedField uif = UnInvertedField.getUnInvertedField(field, searcher);
+        counts = uif.getCounts(searcher, docs, offset, limit, mincount,missing,sort,prefix);
+      } else {
+        // TODO: future logic could use filters instead of the fieldcache if
+        // the number of terms in the field is small enough.
+        counts = getFieldCacheCounts(searcher, docs, field, offset,limit, mincount, missing, sort, prefix);
+      }
     }

     return counts;
@@ -0,0 +1,908 @@
/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.solr.request;

import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermDocs;
import org.apache.lucene.index.TermEnum;
import org.apache.lucene.search.TermQuery;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.common.SolrException;
import org.apache.solr.core.SolrCore;
import org.apache.solr.request.SimpleFacets;
import org.apache.solr.schema.FieldType;
import org.apache.solr.search.BitDocSet;
import org.apache.solr.search.DocIterator;
import org.apache.solr.search.DocSet;
import org.apache.solr.search.SolrIndexSearcher;
import org.apache.solr.util.BoundedTreeSet;
import org.apache.solr.util.OpenBitSet;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.Map;
import java.util.WeakHashMap;

/**
 *
 * Final form of the un-inverted field:
 *   Each document points to a list of term numbers that are contained in that document.
 *
 *   Term numbers are in sorted order, and are encoded as variable-length deltas from the
 *   previous term number.  Real term numbers start at 2 since 0 and 1 are reserved.  A
 *   term number of 0 signals the end of the termNumber list.
 *
 *   There is a single int[maxDoc()] which either contains a pointer into a byte[] for
 *   the termNumber lists, or directly contains the termNumber list if it fits in the 4
 *   bytes of an integer.  If the first byte in the integer is 1, the next 3 bytes
 *   are a pointer into a byte[] where the termNumber list starts.
 *
 *   There are actually 256 byte arrays, to compensate for the fact that the pointers
 *   into the byte arrays are only 3 bytes long.  The correct byte array for a document
 *   is a function of its id.
 *
 *   To save space and speed up faceting, any term that matches enough documents will
 *   not be un-inverted... it will be skipped while building the un-inverted field structure,
 *   and will use a set intersection method during faceting.
 *
 *   To further save memory, the terms (the actual string values) are not all stored in
 *   memory, but a TermIndex is used to convert term numbers to term values only
 *   for the terms needed after faceting has completed.  Only every 128th term value
 *   is stored, along with its corresponding term number, and this is used as an
 *   index to find the closest term and iterate until the desired number is hit (very
 *   much like Lucene's own internal term index).
 *
 */
class UnInvertedField {
  private static int TNUM_OFFSET=2;
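
  // For illustration: with TNUM_OFFSET=2, a document containing term numbers {5, 10, 25}
  // stores deltas {7, 7, 17}; each delta fits in a single vInt byte, so index[doc] holds
  // them directly as 0x00110707 (low byte first), the high zero byte acting as the list
  // terminator.  The low byte is 7, not 1, so it is read as inline data, not as a pointer.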

  static class TopTerm {
    Term term;
    int termNum;

    long memSize() {
      return 8 +   // obj header
             8 + 8 + (term.text().length()<<1) +  // term
             4;    // int
    }
  }

  String field;
  int numTermsInField;
  int termsInverted;  // number of unique terms that were un-inverted
  long termInstances; // total number of references to term numbers
  final TermIndex ti;

  int[] index;
  byte[][] tnums = new byte[256][];
  int[] maxTermCounts;
  final Map<Integer,TopTerm> bigTerms = new LinkedHashMap<Integer,TopTerm>();


  public long memSize() {
    long sz = 6*8 + 12; // local fields
    sz += bigTerms.size() * 64;
    for (TopTerm tt : bigTerms.values()) {
      sz += tt.memSize();
    }
    if (index != null) sz += index.length * 4;
    if (tnums!=null) {
      for (byte[] arr : tnums)
        if (arr != null) sz += arr.length;
    }
    if (maxTermCounts != null)
      sz += maxTermCounts.length * 4;
    sz += ti.memSize();
    return sz;
  }


  /** Number of bytes to represent an unsigned int as a vint. */
  static int vIntSize(int x) {
    if ((x & (0xffffffff << (7*1))) == 0 ) {
      return 1;
    }
    if ((x & (0xffffffff << (7*2))) == 0 ) {
      return 2;
    }
    if ((x & (0xffffffff << (7*3))) == 0 ) {
      return 3;
    }
    if ((x & (0xffffffff << (7*4))) == 0 ) {
      return 4;
    }
    return 5;
  }


  // todo: if we know the size of the vInt already, we could do
  // a single switch on the size
  static int writeInt(int x, byte[] arr, int pos) {
    int a;
    a = (x >>> (7*4));
    if (a != 0) {
      arr[pos++] = (byte)(a | 0x80);
    }
    a = (x >>> (7*3));
    if (a != 0) {
      arr[pos++] = (byte)(a | 0x80);
    }
    a = (x >>> (7*2));
    if (a != 0) {
      arr[pos++] = (byte)(a | 0x80);
    }
    a = (x >>> (7*1));
    if (a != 0) {
      arr[pos++] = (byte)(a | 0x80);
    }
    arr[pos++] = (byte)(x & 0x7f);
    return pos;
  }
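
  // Worked example for illustration: writeInt(200, arr, 0) emits 0x81 0x48 and returns 2:
  // the high 7-bit group (200 >>> 7 == 1) is written with the continuation bit 0x80 set,
  // then the low 7 bits (200 & 0x7f == 0x48) end the value.  vIntSize(200) == 2 agrees.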


  public UnInvertedField(String field, SolrIndexSearcher searcher) throws IOException {
    this.field = field;
    this.ti = new TermIndex(field);
    uninvert(searcher);
  }


  private void uninvert(SolrIndexSearcher searcher) throws IOException {
    long startTime = System.currentTimeMillis();

    IndexReader reader = searcher.getReader();
    int maxDoc = reader.maxDoc();

    int[] index = new int[maxDoc];       // immediate term numbers, or the index into the byte[] representing the last number
    this.index = index;
    final int[] lastTerm = new int[maxDoc];    // last term we saw for this document
    final byte[][] bytes = new byte[maxDoc][]; // list of term numbers for the doc (delta encoded vInts)
    maxTermCounts = new int[1024];

    NumberedTermEnum te = ti.getEnumerator(reader);

    // threshold, over which we use set intersections instead of counting
    // to (1) save memory, and (2) speed up faceting.
    // Add 2 for testing purposes so that there will always be some terms under
    // the threshold even when the index is very small.
    int threshold = maxDoc / 20 + 2;
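    // For example: with maxDoc=1000 the threshold is 52, so any term matching 52 or more
    // documents is treated as a "big term" and later counted via a set intersection
    // instead of being added to the un-inverted lists.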
    // threshold = 2000000000;   //////////////////////////////// USE FOR TESTING
    int[] docs = new int[1000];
    int[] freqs = new int[1000];

    // we need a minimum of 9 bytes, but round up to 12 since the space would
    // be wasted with most allocators anyway.
    byte[] tempArr = new byte[12];

    //
    // enumerate all terms, and build an intermediate form of the un-inverted field.
    //
    // During this intermediate form, every document has a (potential) byte[]
    // and the int[maxDoc()] array either contains the termNumber list directly
    // or the *end* offset of the termNumber list in its byte array (for faster
    // appending and faster creation of the final form).
    //
    // idea... if things are too large while building, we could do a range of docs
    // at a time (but it would be a fair amount slower to build)
    // could also do ranges in parallel to take advantage of multiple CPUs

    // OPTIONAL: remap the largest df terms to the lowest 128 (single byte)
    // values.  This requires going over the field first to find the most
    // frequent terms ahead of time.

    for (;;) {
      Term t = te.term();
      if (t==null) break;

      int termNum = te.getTermNumber();

      if (termNum >= maxTermCounts.length) {
        // resize, but conserve memory by not doubling
        // resize at end??? we waste a maximum of 16K (average of 8K)
        int[] newMaxTermCounts = new int[maxTermCounts.length+4096];
        System.arraycopy(maxTermCounts, 0, newMaxTermCounts, 0, termNum);
        maxTermCounts = newMaxTermCounts;
      }

      int df = te.docFreq();
      if (df >= threshold) {
        TopTerm topTerm = new TopTerm();
        topTerm.term = t;
        topTerm.termNum = termNum;
        bigTerms.put(topTerm.termNum, topTerm);

        DocSet set = searcher.getDocSet(new TermQuery(topTerm.term));
        maxTermCounts[termNum] = set.size();

        te.next();
        continue;
      }

      termsInverted++;

      TermDocs td = te.getTermDocs();
      td.seek(te);
      for(;;) {
        int n = td.read(docs,freqs);
        if (n <= 0) break;

        maxTermCounts[termNum] += n;

        for (int i=0; i<n; i++) {
          termInstances++;
          int doc = docs[i];
          // add 2 to the term number to make room for special reserved values:
          // 0 (end term) and 1 (index into byte array follows)
          int delta = termNum - lastTerm[doc] + TNUM_OFFSET;
          lastTerm[doc] = termNum;
          int val = index[doc];

          if ((val & 0xff)==1) {
            // index into byte array (actually the end of
            // the doc-specific byte[] when building)
            int pos = val >>> 8;
            int ilen = vIntSize(delta);
            byte[] arr = bytes[doc];
            int newend = pos+ilen;
            if (newend > arr.length) {
              // We avoid a doubling strategy to lower memory usage.
              // this faceting method isn't for docs with many terms.
              // In hotspot, objects have 2 words of overhead, then fields, rounded up to a 64-bit boundary.
              // TODO: figure out what array lengths we can round up to w/o actually using more memory
              // (how much space does a byte[] take up?  Is data preceded by a 32 bit length only?
              // It should be safe to round up to the nearest 32 bits in any case.
              int newLen = (newend + 3) & 0xfffffffc;  // 4 byte alignment
              byte[] newarr = new byte[newLen];
              System.arraycopy(arr, 0, newarr, 0, pos);
              arr = newarr;
              bytes[doc] = newarr;
            }
            pos = writeInt(delta, arr, pos);
            index[doc] = (pos<<8) | 1;  // update pointer to end index in byte[]
          } else {
            // OK, this int has data in it... find the end (a zero starting byte - not
            // part of another number, hence not following a byte with the high bit set).
            int ipos;
            if (val==0) {
              ipos=0;
            } else if ((val & 0x0000ff80)==0) {
              ipos=1;
            } else if ((val & 0x00ff8000)==0) {
              ipos=2;
            } else if ((val & 0xff800000)==0) {
              ipos=3;
            } else {
              ipos=4;
            }

            int endPos = writeInt(delta, tempArr, ipos);
            if (endPos <= 4) {
              // value will fit in the integer... move bytes back
              for (int j=ipos; j<endPos; j++) {
                val |= (tempArr[j] & 0xff) << (j<<3);
              }
              index[doc] = val;
            } else {
              // value won't fit... move integer into byte[]
              for (int j=0; j<ipos; j++) {
                tempArr[j] = (byte)val;
                val >>>=8;
              }
              // point at the end index in the byte[]
              index[doc] = (endPos<<8) | 1;
              bytes[doc] = tempArr;
              tempArr = new byte[12];
            }

          }

        }

      }

      te.next();
    }

    numTermsInField = te.getTermNumber();
    te.close();

    long midPoint = System.currentTimeMillis();

    if (termInstances == 0) {
      // we didn't invert anything
      // lower memory consumption.
      index = this.index = null;
      tnums = null;
    } else {

      //
      // transform intermediate form into the final form, building a single byte[]
      // at a time, and releasing the intermediate byte[]s as we go to avoid
      // increasing the memory footprint.
      //
      for (int pass = 0; pass<256; pass++) {
        byte[] target = tnums[pass];
        int pos=0;  // end in target;
        if (target != null) {
          pos = target.length;
        } else {
          target = new byte[4096];
        }

        // loop over documents, 0x00ppxxxx, 0x01ppxxxx, 0x02ppxxxx
        // where pp is the pass (which array we are building), and xx is all values.
        // each pass shares the same byte[] for termNumber lists.
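        // For instance, doc ids 0x00010000-0x0001ffff, 0x01010000-0x0101ffff, etc. all
        // satisfy (doc >>> 16) & 0xff == 1 and therefore share tnums[1].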
        for (int docbase = pass<<16; docbase<maxDoc; docbase+=(1<<24)) {
          int lim = Math.min(docbase + (1<<16), maxDoc);
          for (int doc=docbase; doc<lim; doc++) {
            int val = index[doc];
            if ((val&0xff) == 1) {
              int len = val >>> 8;
              index[doc] = (pos<<8)|1; // change index to point to start of array
              if ((pos & 0xff000000) != 0) {
                // we only have 24 bits for the array index
                throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "Too many values for UnInvertedField faceting on field "+field);
              }
              byte[] arr = bytes[doc];
              bytes[doc] = null;        // IMPORTANT: allow GC to avoid OOM
              if (target.length <= pos + len) {
                int newlen = target.length;
                /*** we don't have to worry about the array getting too large
                 * since the "pos" param will overflow first (only 24 bits available)
                if ((newlen<<1) <= 0) {
                  // overflow...
                  newlen = Integer.MAX_VALUE;
                  if (newlen <= pos + len) {
                    throw new SolrException(400,"Too many terms to uninvert field!");
                  }
                } else {
                  while (newlen <= pos + len) newlen<<=1;  // doubling strategy
                }
                ****/
                while (newlen <= pos + len) newlen<<=1;  // doubling strategy
                byte[] newtarget = new byte[newlen];
                System.arraycopy(target, 0, newtarget, 0, pos);
                target = newtarget;
              }
              System.arraycopy(arr, 0, target, pos, len);
              pos += len + 1;  // skip single byte at end and leave it 0 for terminator
            }
          }
        }

        // shrink array
        if (pos < target.length) {
          byte[] newtarget = new byte[pos];
          System.arraycopy(target, 0, newtarget, 0, pos);
          target = newtarget;
        }

        tnums[pass] = target;

        if ((pass << 16) > maxDoc)
          break;
      }
    }

    long endTime = System.currentTimeMillis();

    SolrCore.log.info("UnInverted multi-valued field " + field + ", memSize=" + memSize()
            + ", time="+(endTime-startTime)+", phase1="+(midPoint-startTime)
            + ", nTerms=" + numTermsInField + ", bigTerms=" + bigTerms.size()
            + ", termInstances=" + termInstances
    );
  }



  public NamedList getCounts(SolrIndexSearcher searcher, DocSet baseDocs, int offset, int limit, int mincount, boolean missing, boolean sort, String prefix) throws IOException {
    FieldType ft = searcher.getSchema().getFieldType(field);

    NamedList res = new NamedList();  // order is important

    DocSet docs = baseDocs;
    int baseSize = docs.size();
    int maxDoc = searcher.maxDoc();

    if (baseSize >= mincount) {

      final int[] index = this.index;
      final int[] counts = new int[numTermsInField];

      //
      // If there is a prefix, find its start and end term numbers
      //
      int startTerm = 0;
      int endTerm = numTermsInField;  // one past the end

      NumberedTermEnum te = ti.getEnumerator(searcher.getReader());
      if (prefix != null && prefix.length() > 0) {
        te.skipTo(prefix);
        startTerm = te.getTermNumber();
        te.skipTo(prefix + "\uffff\uffff\uffff\uffff");
        endTerm = te.getTermNumber();
      }

      /***********
      // Alternative 2: get the docSet of the prefix (could take a while) and
      // then do the intersection with the baseDocSet first.
      if (prefix != null && prefix.length() > 0) {
        docs = searcher.getDocSet(new ConstantScorePrefixQuery(new Term(field, ft.toInternal(prefix))), docs);
        // The issue with this method are problems of returning 0 counts for terms w/o
        // the prefix.  We can't just filter out those terms later because it may
        // mean that we didn't collect enough terms in the queue (in the sorted case).
      }
      ***********/

      boolean doNegative = baseSize > maxDoc >> 1 && termInstances > 0
              && startTerm==0 && endTerm==numTermsInField
              && docs instanceof BitDocSet;

      if (doNegative) {
        OpenBitSet bs = (OpenBitSet)((BitDocSet)docs).getBits().clone();
        bs.flip(0, maxDoc);
        // TODO: when iterator across negative elements is available, use that
        // instead of creating a new bitset and inverting.
        docs = new BitDocSet(bs, maxDoc - baseSize);
        // simply negating will mean that we have deleted docs in the set.
        // that should be OK, as their entries in our table should be empty.
      }
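      // For illustration: if the base set covers 900 of 1000 docs, it is cheaper to
      // iterate the 100-doc complement; the true count for each term i is then recovered
      // below as maxTermCounts[i] - counts[i].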

      // For the biggest terms, do straight set intersections
      for (TopTerm tt : bigTerms.values()) {
        // TODO: counts could be deferred if sorted==false
        if (tt.termNum >= startTerm && tt.termNum < endTerm) {
          counts[tt.termNum] = searcher.numDocs(new TermQuery(tt.term), docs);
        }
      }

      // TODO: we could short-circuit counting altogether for sorted faceting
      // where we already have enough terms from the bigTerms

      // TODO: we could shrink the size of the collection array, and
      // additionally break when the termNumber got above endTerm, but
      // it would require two extra conditionals in the inner loop (although
      // they would be predictable for the non-prefix case).
      // Perhaps a different copy of the code would be warranted.

      if (termInstances > 0) {
        DocIterator iter = docs.iterator();
        while (iter.hasNext()) {
          int doc = iter.nextDoc();
          int code = index[doc];

          if ((code & 0xff)==1) {
            int pos = code>>>8;
            int whichArray = (doc >>> 16) & 0xff;
            byte[] arr = tnums[whichArray];
            int tnum = 0;
            for(;;) {
              int delta = 0;
              for(;;) {
                byte b = arr[pos++];
                delta = (delta << 7) | (b & 0x7f);
                if ((b & 0x80) == 0) break;
              }
              if (delta == 0) break;
              tnum += delta - TNUM_OFFSET;
              counts[tnum]++;
            }
          } else {
            int tnum = 0;
            int delta = 0;
            for (;;) {
              delta = (delta << 7) | (code & 0x7f);
              if ((code & 0x80)==0) {
                if (delta==0) break;
                tnum += delta - TNUM_OFFSET;
                counts[tnum]++;
                delta = 0;
              }
              code >>>= 8;
            }
          }
        }
      }

      int off=offset;
      int lim=limit>=0 ? limit : Integer.MAX_VALUE;

      if (sort) {
        int maxsize = limit>0 ? offset+limit : Integer.MAX_VALUE-1;
        maxsize = Math.min(maxsize, numTermsInField);
        final BoundedTreeSet<Long> queue = new BoundedTreeSet<Long>(maxsize);
        int min=mincount-1;  // the smallest value in the top 'N' values
        for (int i=startTerm; i<endTerm; i++) {
          int c = doNegative ? maxTermCounts[i] - counts[i] : counts[i];
          if (c>min) {
            // NOTE: we use c>min rather than c>=min as an optimization because we are going in
            // index order, so we already know that the keys are ordered.  This can be very
            // important if a lot of the counts are repeated (like zero counts would be).

            // minimize object creation and speed comparison by creating a long that
            // encompasses both count and term number.
            // Since smaller values are kept in the TreeSet, make higher counts smaller.
            //
            //   for equal counts, lower term numbers
            //   should come first and hence be "greater"

            //long pair = (((long)c)<<32) | (0x7fffffff-i) ;   // use if priority queue
            long pair = (((long)-c)<<32) | i;
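            // For illustration: c=3, i=7 gives pair 0xFFFFFFFD00000007; negating the count
            // sorts higher counts first in the ascending TreeSet, and the low 32 bits break
            // ties by term number (decoded later via -(int)(pair>>>32) and (int)pair).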
            queue.add(new Long(pair));
            if (queue.size()>=maxsize) min=-(int)(queue.last().longValue() >>> 32);
          }
        }
        // now select the right page from the results
        for (Long p : queue) {
          if (--off>=0) continue;
          if (--lim<0) break;
          int c = -(int)(p.longValue() >>> 32);
          //int tnum = 0x7fffffff - (int)p.longValue();  // use if priority queue
          int tnum = (int)p.longValue();
          String label = ft.indexedToReadable(getTermText(te, tnum));
          res.add(label, c);
        }
      } else {
        // add results in index order
        int i=startTerm;
        if (mincount<=0) {
          // if mincount<=0, then we won't discard any terms and we know exactly
          // where to start.
          i=startTerm+off;
          off=0;
        }

        for (; i<endTerm; i++) {
          int c = doNegative ? maxTermCounts[i] - counts[i] : counts[i];
          if (c==0) {

          }
          if (c<mincount || --off>=0) continue;
          if (--lim<0) break;

          String label = ft.indexedToReadable(getTermText(te, i));
          res.add(label, c);
        }
      }

      te.close();
    }


    if (missing) {
      // TODO: a faster solution for this?
      res.add(null, SimpleFacets.getFieldMissingCount(searcher, baseDocs, field));
    }

    return res;
  }


  String getTermText(NumberedTermEnum te, int termNum) throws IOException {
    if (bigTerms.size() > 0) {
      // see if the term is one of our big terms.
      TopTerm tt = bigTerms.get(termNum);
      if (tt != null) {
        return tt.term.text();
      }
    }

    te.skipTo(termNum);
    return te.term().text();
  }


  //////////////////////////////////////////////////////////////////
  //////////////////////////// caching /////////////////////////////
  //////////////////////////////////////////////////////////////////
  static final class CreationPlaceholder {
    Object value;
  }

  public static UnInvertedField getUnInvertedField(String field, SolrIndexSearcher searcher) throws IOException {
    return (UnInvertedField)multiValuedFieldCache.get(searcher, field);
  }

  static Cache multiValuedFieldCache = new Cache() {
    protected Object createValue(SolrIndexSearcher searcher, Object key) throws IOException {
      return new UnInvertedField((String)key, searcher);
    }
  };

  /** Internal cache. (from lucene FieldCache) */
  abstract static class Cache {
    private final Map readerCache = new WeakHashMap();

    protected abstract Object createValue(SolrIndexSearcher searcher, Object key) throws IOException;

    public Object get(SolrIndexSearcher searcher, Object key) throws IOException {
      Map innerCache;
      Object value;
      synchronized (readerCache) {
        innerCache = (Map) readerCache.get(searcher);
        if (innerCache == null) {
          innerCache = new HashMap();
          readerCache.put(searcher, innerCache);
          value = null;
        } else {
          value = innerCache.get(key);
        }
        if (value == null) {
          value = new CreationPlaceholder();
          innerCache.put(key, value);
        }
      }
      if (value instanceof CreationPlaceholder) {
        synchronized (value) {
          CreationPlaceholder progress = (CreationPlaceholder) value;
          if (progress.value == null) {
            progress.value = createValue(searcher, key);
            synchronized (readerCache) {
              innerCache.put(key, progress.value);
            }
          }
          return progress.value;
        }
      }

      return value;
    }
  }
}


// How to share TermDocs (int[] score[])???
// How to share TermPositions?
/***
class TermEnumListener {
  void doTerm(Term t) {
  }
  void done() {
  }
}
***/


class NumberedTermEnum extends TermEnum {
  protected final IndexReader reader;
  protected final TermIndex tindex;
  protected TermEnum tenum;
  protected int pos=-1;
  protected Term t;
  protected TermDocs termDocs;


  NumberedTermEnum(IndexReader reader, TermIndex tindex) throws IOException {
    this.reader = reader;
    this.tindex = tindex;
  }


  NumberedTermEnum(IndexReader reader, TermIndex tindex, String termValue, int pos) throws IOException {
    this.reader = reader;
    this.tindex = tindex;
    this.pos = pos;
    tenum = reader.terms(tindex.createTerm(termValue));
    setTerm();
  }

  public TermDocs getTermDocs() throws IOException {
    if (termDocs==null) termDocs = reader.termDocs(t);
    else termDocs.seek(t);
    return termDocs;
  }

  protected boolean setTerm() {
    t = tenum.term();
    if (t==null || t.field() != tindex.fterm.field()) {  // intern'd compare
      t = null;
      return false;
    }
    return true;
  }


  public boolean next() throws IOException {
    pos++;
    boolean b = tenum.next();
    if (!b) {
      t = null;
      return false;
    }
    return setTerm();  // this is extra work if we know we are in bounds...
  }

  public Term term() {
    return t;
  }

  public int docFreq() {
    return tenum.docFreq();
  }

  public void close() throws IOException {
    tenum.close();
  }

  public boolean skipTo(String target) throws IOException {
    return skipTo(tindex.fterm.createTerm(target));
  }

  public boolean skipTo(Term target) throws IOException {
    // already here
    if (t != null && t.equals(target)) return true;

    int startIdx = Arrays.binarySearch(tindex.index,target.text());

    if (startIdx >= 0) {
      // we hit the term exactly... lucky us!
      tenum = reader.terms(target);
      pos = startIdx << tindex.intervalBits;
      return setTerm();
    }

    // we didn't hit the term exactly
    startIdx=-startIdx-1;

    if (startIdx == 0) {
      // our target occurs *before* the first term
      tenum = reader.terms(target);
      pos = 0;
      return setTerm();
    }

    // back up to the start of the block
    startIdx--;

    if ((pos >> tindex.intervalBits) == startIdx && t != null && t.text().compareTo(target.text())<=0) {
      // we are already in the right block and the current term is before the term we want,
      // so we don't need to seek.
    } else {
      // seek to the right block
      tenum = reader.terms(target.createTerm(tindex.index[startIdx]));
      pos = startIdx << tindex.intervalBits;
      setTerm();  // should be true since it's in the index
    }

    while (t != null && t.text().compareTo(target.text()) < 0) {
      next();
    }

    return t != null;
  }


  public boolean skipTo(int termNumber) throws IOException {
    int delta = termNumber - pos;
    if (delta < 0 || delta > tindex.interval || tenum==null) {
      int idx = termNumber >>> tindex.intervalBits;
      String base = tindex.index[idx];
      pos = idx << tindex.intervalBits;
      delta = termNumber - pos;
      tenum = reader.terms(tindex.createTerm(base));
    }
    while (--delta >= 0) {
      boolean b = tenum.next();
      if (!b) {
        t = null;
        return false;
      }
      ++pos;
    }
    return setTerm();
  }

  /** The current term number, starting at 0.
   * Only valid if the previous call to next() or skipTo() returned true.
   */
  public int getTermNumber() {
    return pos;
  }
}


/**
 * Class to save memory by only storing every nth term (for random access), while
 * numbering the terms, allowing them to be retrieved later by number.
 * This is only valid when used with the IndexReader it was created with.
 * The IndexReader is not actually stored to facilitate caching by using it as a key in
 * a weak hash map.
 */
class TermIndex {
  final static int intervalBits = 7;  // decrease to a low number like 2 for testing
  final static int intervalMask = 0xffffffff >>> (32-intervalBits);
  final static int interval = 1 << intervalBits;
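  // For illustration: with intervalBits=7 the interval is 128, so term number 300 maps
  // to index[300 >>> 7] == index[2] (term number 256); skipTo(300) seeks to that base
  // term and then calls next() 44 times to reach the exact term.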

  final Term fterm; // prototype to be used in term construction w/o String.intern overhead
  String[] index;
  int nTerms;
  long sizeOfStrings;

  TermIndex(String field) {
    this.fterm = new Term(field, "");
  }

  Term createTerm(String termVal) {
    return fterm.createTerm(termVal);
  }

  NumberedTermEnum getEnumerator(IndexReader reader, int termNumber) throws IOException {
    NumberedTermEnum te = new NumberedTermEnum(reader, this);
    te.skipTo(termNumber);
    return te;
  }

  /* The first time an enumerator is requested, it should be used
     with next() to fully traverse all of the terms so the index
     will be built.
   */
  NumberedTermEnum getEnumerator(IndexReader reader) throws IOException {
    if (index==null) return new NumberedTermEnum(reader,this,"",0) {
      ArrayList<String> lst;

      protected boolean setTerm() {
        boolean b = super.setTerm();
        if (b && (pos & intervalMask)==0) {
          String text = term().text();
          sizeOfStrings += text.length() << 1;
          if (lst==null) {
            lst = new ArrayList<String>();
          }
          lst.add(text);
        }
        return b;
      }

      public boolean skipTo(Term target) throws IOException {
        throw new UnsupportedOperationException();
      }

      public boolean skipTo(int termNumber) throws IOException {
        throw new UnsupportedOperationException();
      }

      public void close() throws IOException {
        nTerms=pos;
        super.close();
        index = lst!=null ? lst.toArray(new String[lst.size()]) : new String[0];
      }
    };
    else return new NumberedTermEnum(reader,this,"",0);
  }


  /**
   * Returns the approximate amount of memory taken by this TermIndex.
   * This is only an approximation and doesn't take into account java object overhead.
   *
   * @return the approximate memory consumption in bytes
   */
  public long memSize() {
    // assume 8 byte references?
    return 8+8+8+8+(index.length<<3)+sizeOfStrings;
  }
}
@@ -303,9 +303,10 @@ public class SimpleFacetsTest extends AbstractSolrTestCase {
   }

   public void testFacetMultiValued() {
-    doFacets("t_s");
-    doFacets("t_s", "facet.enum.cache.minDf", "2");
-    doFacets("t_s", "facet.enum.cache.minDf", "100");
+    doFacetPrefix("t_s", "facet.method","enum");
+    doFacetPrefix("t_s", "facet.method", "enum", "facet.enum.cache.minDf", "2");
+    doFacetPrefix("t_s", "facet.method", "enum", "facet.enum.cache.minDf", "100");
+    doFacetPrefix("t_s", "facet.method", "fc");
   }

   public void testFacetSingleValued() {
@@ -476,9 +477,10 @@ public class SimpleFacetsTest extends AbstractSolrTestCase {


   public void testFacetPrefixMultiValued() {
-    doFacetPrefix("t_s");
-    doFacetPrefix("t_s", "facet.enum.cache.minDf", "3");
-    doFacetPrefix("t_s", "facet.enum.cache.minDf", "100");
+    doFacetPrefix("t_s", "facet.method","enum");
+    doFacetPrefix("t_s", "facet.method", "enum", "facet.enum.cache.minDf", "3");
+    doFacetPrefix("t_s", "facet.method", "enum", "facet.enum.cache.minDf", "100");
+    doFacetPrefix("t_s", "facet.method", "fc");
   }

   public void testFacetPrefixSingleValued() {
@@ -0,0 +1,242 @@
/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.solr.request;

import org.apache.lucene.index.Term;
import org.apache.solr.util.AbstractSolrTestCase;

import java.util.Random;

/**
 * @version $Id$
 */
public class TestFaceting extends AbstractSolrTestCase {

  public String getSchemaFile() { return "schema11.xml"; }
  public String getSolrConfigFile() { return "solrconfig.xml"; }

  public void setUp() throws Exception {
    super.setUp();
  }
  public void tearDown() throws Exception {
    close();
    super.tearDown();
  }

  String t(int tnum) {
    return String.format("%08d", tnum);
  }

  void createIndex(int nTerms) {
    assertU(delQ("*:*"));
    for (int i=0; i<nTerms; i++) {
      assertU(adoc("id", Float.toString(i), proto.field(), t(i) ));
    }
    assertU(optimize()); // squeeze out any possible deleted docs
  }

  Term proto = new Term("field_s","");
  SolrQueryRequest req; // used to get a searcher
  void close() {
    if (req!=null) req.close();
    req = null;
  }

  void doTermEnum(int size) throws Exception {
    close();
    createIndex(size);
    req = lrf.makeRequest("q","*:*");

    TermIndex ti = new TermIndex(proto.field());
    NumberedTermEnum te = ti.getEnumerator(req.getSearcher().getReader());

    // iterate through first
    while(te.term() != null) te.next();
    assertEquals(size, te.getTermNumber());
    te.close();

    te = ti.getEnumerator(req.getSearcher().getReader());

    Random r = new Random(size);
    // test seeking by term string
    for (int i=0; i<size*2+10; i++) {
      int rnum = r.nextInt(size+2);
      String s = t(rnum);
      boolean b = te.skipTo(proto.createTerm(s));
      assertEquals(b, rnum < size);
      if (rnum < size) {
        assertEquals(rnum, te.pos);
        assertEquals(s, te.term().text());
      } else {
        assertEquals(null, te.term());
        assertEquals(size, te.getTermNumber());
      }
    }

    // test seeking before term
    assertEquals(size>0, te.skipTo(proto.createTerm("000")));
    assertEquals(0, te.getTermNumber());
    if (size>0) {
      assertEquals(t(0), te.term().text());
    } else {
      assertEquals(null, te.term());
    }

    if (size>0) {
      // test seeking by term number
      for (int i=0; i<size*2+10; i++) {
        int rnum = r.nextInt(size);
        String s = t(rnum);
        boolean b = te.skipTo(rnum);
        assertEquals(true, b);
        assertEquals(rnum, te.pos);
        assertEquals(s, te.term().text());
      }
    }
  }

  public void testTermEnum() throws Exception {
    doTermEnum(0);
    doTermEnum(1);
    doTermEnum(TermIndex.interval - 1);  // test boundaries around the block size
    doTermEnum(TermIndex.interval);
    doTermEnum(TermIndex.interval + 1);
    doTermEnum(TermIndex.interval * 2 + 2);
    // doTermEnum(TermIndex.interval * 3 + 3);
  }

  public void testFacets() throws Exception {
    StringBuilder sb = new StringBuilder();

    // go over 4096 to test some of the buffer resizing
    for (int i=0; i<5000; i++) {
      sb.append(t(i));
      sb.append(' ');
    }

    assertU(adoc("id", "1", "many_ws", sb.toString()));
    assertU(commit());

    assertQ("check many tokens",
            req("q", "id:1","indent","true"
                ,"facet", "true", "facet.method","fc"
                ,"facet.field", "many_ws"
                ,"facet.limit", "-1"
                )
            ,"*[count(//lst[@name='many_ws']/int)=5000]"
            ,"//lst[@name='many_ws']/int[@name='" + t(0) + "'][.='1']"
            ,"//lst[@name='many_ws']/int[@name='" + t(1) + "'][.='1']"
            ,"//lst[@name='many_ws']/int[@name='" + t(2) + "'][.='1']"
            ,"//lst[@name='many_ws']/int[@name='" + t(3) + "'][.='1']"
            ,"//lst[@name='many_ws']/int[@name='" + t(4) + "'][.='1']"
            ,"//lst[@name='many_ws']/int[@name='" + t(5) + "'][.='1']"
            ,"//lst[@name='many_ws']/int[@name='" + t(4092) + "'][.='1']"
            ,"//lst[@name='many_ws']/int[@name='" + t(4093) + "'][.='1']"
            ,"//lst[@name='many_ws']/int[@name='" + t(4094) + "'][.='1']"
            ,"//lst[@name='many_ws']/int[@name='" + t(4095) + "'][.='1']"
            ,"//lst[@name='many_ws']/int[@name='" + t(4096) + "'][.='1']"
            ,"//lst[@name='many_ws']/int[@name='" + t(4097) + "'][.='1']"
            ,"//lst[@name='many_ws']/int[@name='" + t(4098) + "'][.='1']"
            ,"//lst[@name='many_ws']/int[@name='" + t(4090) + "'][.='1']"
            ,"//lst[@name='many_ws']/int[@name='" + t(4999) + "'][.='1']"
            );

    // test gaps that take more than one byte
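    // (for illustration: a gap like 0 -> 150 yields a delta of 152, which no longer fits
    // in a single 7-bit vInt byte, exercising the multi-byte decode path)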
    sb = new StringBuilder();
    sb.append(t(0)).append(' ');
    sb.append(t(150)).append(' ');
    sb.append(t(301)).append(' ');
    sb.append(t(453)).append(' ');
    sb.append(t(606)).append(' ');
    sb.append(t(1000)).append(' ');
    sb.append(t(2010)).append(' ');
    sb.append(t(3050)).append(' ');
    sb.append(t(4999)).append(' ');
    assertU(adoc("id", "2", "many_ws", sb.toString()));
    assertQ("check many tokens",
            req("q", "id:1","indent","true"
                ,"facet", "true", "facet.method","fc"
                ,"facet.field", "many_ws"
                ,"facet.limit", "-1"
                )
            ,"*[count(//lst[@name='many_ws']/int)=5000]"
            ,"//lst[@name='many_ws']/int[@name='" + t(0) + "'][.='1']"
            ,"//lst[@name='many_ws']/int[@name='" + t(150) + "'][.='1']"
            ,"//lst[@name='many_ws']/int[@name='" + t(301) + "'][.='1']"
            ,"//lst[@name='many_ws']/int[@name='" + t(453) + "'][.='1']"
            ,"//lst[@name='many_ws']/int[@name='" + t(606) + "'][.='1']"
            ,"//lst[@name='many_ws']/int[@name='" + t(1000) + "'][.='1']"
            ,"//lst[@name='many_ws']/int[@name='" + t(2010) + "'][.='1']"
            ,"//lst[@name='many_ws']/int[@name='" + t(3050) + "'][.='1']"
            ,"//lst[@name='many_ws']/int[@name='" + t(4999) + "'][.='1']"
            );
  }

  public void testRegularBig() throws Exception {
    StringBuilder sb = new StringBuilder();

    // go over 4096 to test some of the buffer resizing
    int nTerms=7;
    for (int i=0; i<nTerms; i++) {
      sb.append(t(i));
      sb.append(' ');
    }
    String many_ws = sb.toString();

    int i1=1000000;

    // int iter=65536+10;
    int iter=1000;

    for (int i=0; i<iter; i++) {
      // assertU(adoc("id", t(i), "many_ws", many_ws + t(i1+i) + " " + t(i1*2+i)));
      assertU(adoc("id", t(i), "many_ws", t(i1+i) + " " + t(i1*2+i)));
    }
    assertU(commit());

    for (int i=0; i<iter; i+=iter/10) {
      assertQ("check many tokens",
              req("q", "id:"+t(i),"indent","true"
                  ,"facet", "true", "facet.method","fc"
                  ,"facet.field", "many_ws"
                  ,"facet.limit", "-1"
                  ,"facet.mincount", "1"
                  )
              ,"*[count(//lst[@name='many_ws']/int)=" + 2 + "]"
              ,"//lst[@name='many_ws']/int[@name='" + t(i1+i) + "'][.='1']"
              ,"//lst[@name='many_ws']/int[@name='" + t(i1*2+i) + "'][.='1']"
              );
    }

    int i=iter-1;
    assertQ("check many tokens",
            req("q", "id:"+t(i),"indent","true"
                ,"facet", "true", "facet.method","fc"
                ,"facet.field", "many_ws"
                ,"facet.limit", "-1"
                ,"facet.mincount", "1"
                )
            ,"*[count(//lst[@name='many_ws']/int)=" + 2 + "]"
            ,"//lst[@name='many_ws']/int[@name='" + t(i1+i) + "'][.='1']"
            ,"//lst[@name='many_ws']/int[@name='" + t(i1*2+i) + "'][.='1']"
            );
  }

}
@@ -285,6 +285,7 @@
    <dynamicField name="*_t" type="text" indexed="true" stored="true"/>
    <dynamicField name="*_b" type="boolean" indexed="true" stored="true"/>
    <dynamicField name="*_dt" type="date" indexed="true" stored="true"/>
+   <dynamicField name="*_ws" type="text_ws" indexed="true" stored="true"/>

    <dynamicField name="*_extf" type="file"/>