From 156491848a15a8dfdd51f5a51934f383a2f1c8fb Mon Sep 17 00:00:00 2001 From: Yonik Seeley Date: Tue, 25 Nov 2008 04:02:09 +0000 Subject: [PATCH] SOLR-475:multi-valued faceting via un-inverted field git-svn-id: https://svn.apache.org/repos/asf/lucene/solr/trunk@720403 13f79535-47bb-0310-9956-ffa450edef68 --- CHANGES.txt | 5 + .../solr/common/params/FacetParams.java | 14 + .../org/apache/solr/request/SimpleFacets.java | 24 +- .../apache/solr/request/UnInvertedField.java | 908 ++++++++++++++++++ .../apache/solr/request/SimpleFacetsTest.java | 14 +- .../org/apache/solr/request/TestFaceting.java | 242 +++++ src/test/test-files/solr/conf/schema11.xml | 1 + 7 files changed, 1198 insertions(+), 10 deletions(-) create mode 100755 src/java/org/apache/solr/request/UnInvertedField.java create mode 100755 src/test/org/apache/solr/request/TestFaceting.java diff --git a/CHANGES.txt b/CHANGES.txt index 165ace00d0d..8f6fa094750 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -96,6 +96,11 @@ Optimizations 2. SOLR-808: Write string keys in Maps as extern strings in the javabin format. (Noble Paul via shalin) + 3. SOLR-475: New faceting method with better performance and smaller memory usage for + multi-valued fields with many unique values but relatively few values per document. + Controllable via the facet.method parameter - "fc" is the new default method and "enum" + is the original method. (yonik) + Bug Fixes ---------------------- diff --git a/src/java/org/apache/solr/common/params/FacetParams.java b/src/java/org/apache/solr/common/params/FacetParams.java index 79e5fb3cc15..a86f9c888df 100644 --- a/src/java/org/apache/solr/common/params/FacetParams.java +++ b/src/java/org/apache/solr/common/params/FacetParams.java @@ -29,6 +29,20 @@ public interface FacetParams { * Should facet counts be calculated? */ public static final String FACET = "facet"; + + /** What method should be used to do the faceting */ + public static final String FACET_METHOD = FACET + ".method"; + + /** Value for FACET_METHOD param to indicate that Solr should enumerate over terms + * in a field to calculate the facet counts. + */ + public static final String FACET_METHOD_enum = "enum"; + + /** Value for FACET_METHOD param to indicate that Solr should enumerate over documents + * and count up terms by consulting an uninverted representation of the field values + * (such as the FieldCache used for sorting). + */ + public static final String FACET_METHOD_fc = "fc"; /** * Any lucene formated queries the user would like to use for diff --git a/src/java/org/apache/solr/request/SimpleFacets.java b/src/java/org/apache/solr/request/SimpleFacets.java index f33975a2ffc..5e330c0793d 100644 --- a/src/java/org/apache/solr/request/SimpleFacets.java +++ b/src/java/org/apache/solr/request/SimpleFacets.java @@ -148,16 +148,32 @@ public class SimpleFacets { boolean sort = params.getFieldBool(field, FacetParams.FACET_SORT, limit>0); String prefix = params.getFieldParam(field,FacetParams.FACET_PREFIX); + NamedList counts; SchemaField sf = searcher.getSchema().getField(field); FieldType ft = sf.getType(); - if (sf.multiValued() || ft.isTokenized() || ft instanceof BoolField) { + + // determine what type of faceting method to use + String method = params.getFieldParam(field, FacetParams.FACET_METHOD); + boolean enumMethod = FacetParams.FACET_METHOD_enum.equals(method); + if (method == null && ft instanceof BoolField) { // Always use filters for booleans... we know the number of values is very small. 
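+      // (the enum method walks the field's few terms and intersects cached filters,
+      //  which is cheaper here than building an un-inverted structure)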
+ enumMethod = true; + } + boolean multiToken = sf.multiValued() || ft.isTokenized(); + + // unless the enum method is explicitly specified, use a counting method. + if (enumMethod) { counts = getFacetTermEnumCounts(searcher, docs, field, offset, limit, mincount,missing,sort,prefix); } else { - // TODO: future logic could use filters instead of the fieldcache if - // the number of terms in the field is small enough. - counts = getFieldCacheCounts(searcher, docs, field, offset,limit, mincount, missing, sort, prefix); + if (multiToken) { + UnInvertedField uif = UnInvertedField.getUnInvertedField(field, searcher); + counts = uif.getCounts(searcher, docs, offset, limit, mincount,missing,sort,prefix); + } else { + // TODO: future logic could use filters instead of the fieldcache if + // the number of terms in the field is small enough. + counts = getFieldCacheCounts(searcher, docs, field, offset,limit, mincount, missing, sort, prefix); + } } return counts; diff --git a/src/java/org/apache/solr/request/UnInvertedField.java b/src/java/org/apache/solr/request/UnInvertedField.java new file mode 100755 index 00000000000..54af6e27622 --- /dev/null +++ b/src/java/org/apache/solr/request/UnInvertedField.java @@ -0,0 +1,908 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.solr.request; + +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.Term; +import org.apache.lucene.index.TermDocs; +import org.apache.lucene.index.TermEnum; +import org.apache.lucene.search.TermQuery; +import org.apache.solr.common.util.NamedList; +import org.apache.solr.common.SolrException; +import org.apache.solr.core.SolrCore; +import org.apache.solr.request.SimpleFacets; +import org.apache.solr.schema.FieldType; +import org.apache.solr.search.BitDocSet; +import org.apache.solr.search.DocIterator; +import org.apache.solr.search.DocSet; +import org.apache.solr.search.SolrIndexSearcher; +import org.apache.solr.util.BoundedTreeSet; +import org.apache.solr.util.OpenBitSet; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.HashMap; +import java.util.LinkedHashMap; +import java.util.Map; +import java.util.WeakHashMap; + +/** + * + * Final form of the un-inverted field: + * Each document points to a list of term numbers that are contained in that document. + * + * Term numbers are in sorted order, and are encoded as variable-length deltas from the + * previous term number. Real term numbers start at 2 since 0 and 1 are reserved. A + * term number of 0 signals the end of the termNumber list. 
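+ * For example, a document containing the term numbers {3, 10, 12} stores the deltas
+ * {3, 7, 2}; each delta is offset by TNUM_OFFSET (2) and vInt encoded, giving the
+ * byte values {5, 9, 4}, followed by a terminating 0.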
+ * + * There is a singe int[maxDoc()] which either contains a pointer into a byte[] for + * the termNumber lists, or directly contains the termNumber list if it fits in the 4 + * bytes of an integer. If the first byte in the integer is 1, the next 3 bytes + * are a pointer into a byte[] where the termNumber list starts. + * + * There are actually 256 byte arrays, to compensate for the fact that the pointers + * into the byte arrays are only 3 bytes long. The correct byte array for a document + * is a function of it's id. + * + * To save space and speed up faceting, any term that matches enough documents will + * not be un-inverted... it will be skipped while building the un-inverted field structure, + * and will use a set intersection method during faceting. + * + * To further save memory, the terms (the actual string values) are not all stored in + * memory, but a TermIndex is used to convert term numbers to term values only + * for the terms needed after faceting has completed. Only every 128th term value + * is stored, along with it's corresponding term number, and this is used as an + * index to find the closest term and iterate until the desired number is hit (very + * much like Lucene's own internal term index). + * + */ +class UnInvertedField { + private static int TNUM_OFFSET=2; + + static class TopTerm { + Term term; + int termNum; + + long memSize() { + return 8 + // obj header + 8 + 8 +(term.text().length()<<1) + //term + 4; // int + } + } + + String field; + int numTermsInField; + int termsInverted; // number of unique terms that were un-inverted + long termInstances; // total number of references to term numbers + final TermIndex ti; + + int[] index; + byte[][] tnums = new byte[256][]; + int[] maxTermCounts; + final Map bigTerms = new LinkedHashMap(); + + + public long memSize() { + long sz = 6*8 + 12; // local fields + sz += bigTerms.size() * 64; + for (TopTerm tt : bigTerms.values()) { + sz += tt.memSize(); + } + if (index != null) sz += index.length * 4; + if (tnums!=null) { + for (byte[] arr : tnums) + if (arr != null) sz += arr.length; + } + if (maxTermCounts != null) + sz += maxTermCounts.length * 4; + sz += ti.memSize(); + return sz; + } + + + /** Number of bytes to represent an unsigned int as a vint. 
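(7 payload bits per byte: values 0-127 fit in one byte, 128-16383 in two, and so on, up to five bytes for the largest ints.)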
*/ + static int vIntSize(int x) { + if ((x & (0xffffffff << (7*1))) == 0 ) { + return 1; + } + if ((x & (0xffffffff << (7*2))) == 0 ) { + return 2; + } + if ((x & (0xffffffff << (7*3))) == 0 ) { + return 3; + } + if ((x & (0xffffffff << (7*4))) == 0 ) { + return 4; + } + return 5; + } + + + // todo: if we know the size of the vInt already, we could do + // a single switch on the size + static int writeInt(int x, byte[] arr, int pos) { + int a; + a = (x >>> (7*4)); + if (a != 0) { + arr[pos++] = (byte)(a | 0x80); + } + a = (x >>> (7*3)); + if (a != 0) { + arr[pos++] = (byte)(a | 0x80); + } + a = (x >>> (7*2)); + if (a != 0) { + arr[pos++] = (byte)(a | 0x80); + } + a = (x >>> (7*1)); + if (a != 0) { + arr[pos++] = (byte)(a | 0x80); + } + arr[pos++] = (byte)(x & 0x7f); + return pos; + } + + + + public UnInvertedField(String field, SolrIndexSearcher searcher) throws IOException { + this.field = field; + this.ti = new TermIndex(field); + uninvert(searcher); + } + + + private void uninvert(SolrIndexSearcher searcher) throws IOException { + long startTime = System.currentTimeMillis(); + + IndexReader reader = searcher.getReader(); + int maxDoc = reader.maxDoc(); + + int[] index = new int[maxDoc]; // immediate term numbers, or the index into the byte[] representing the last number + this.index = index; + final int[] lastTerm = new int[maxDoc]; // last term we saw for this document + final byte[][] bytes = new byte[maxDoc][]; // list of term numbers for the doc (delta encoded vInts) + maxTermCounts = new int[1024]; + + NumberedTermEnum te = ti.getEnumerator(reader); + + // threshold, over which we use set intersections instead of counting + // to (1) save memory, and (2) speed up faceting. + // Add 2 for testing purposes so that there will always be some terms under + // the threshold even when the index is very small. + int threshold = maxDoc / 20 + 2; + // threshold = 2000000000; //////////////////////////////// USE FOR TESTING + int[] docs = new int[1000]; + int[] freqs = new int[1000]; + + // we need a minimum of 9 bytes, but round up to 12 since the space would + // be wasted with most allocators anyway. + byte[] tempArr = new byte[12]; + + // + // enumerate all terms, and build an intermediate form of the un-inverted field. + // + // During this intermediate form, every document has a (potential) byte[] + // and the int[maxDoc()] array either contains the termNumber list directly + // or the *end* offset of the termNumber list in it's byte array (for faster + // appending and faster creation of the final form). + // + // idea... if things are too large while building, we could do a range of docs + // at a time (but it would be a fair amount slower to build) + // could also do ranges in parallel to take advantage of multiple CPUs + + // OPTIONAL: remap the largest df terms to the lowest 128 (single byte) + // values. This requires going over the field first to find the most + // frequent terms ahead of time. + + for (;;) { + Term t = te.term(); + if (t==null) break; + + int termNum = te.getTermNumber(); + + if (termNum >= maxTermCounts.length) { + // resize, but conserve memory by not doubling + // resize at end??? 
we waste a maximum of 16K (average of 8K) + int[] newMaxTermCounts = new int[maxTermCounts.length+4096]; + System.arraycopy(maxTermCounts, 0, newMaxTermCounts, 0, termNum); + maxTermCounts = newMaxTermCounts; + } + + int df = te.docFreq(); + if (df >= threshold) { + TopTerm topTerm = new TopTerm(); + topTerm.term = t; + topTerm.termNum = termNum; + bigTerms.put(topTerm.termNum, topTerm); + + DocSet set = searcher.getDocSet(new TermQuery(topTerm.term)); + maxTermCounts[termNum] = set.size(); + + te.next(); + continue; + } + + termsInverted++; + + TermDocs td = te.getTermDocs(); + td.seek(te); + for(;;) { + int n = td.read(docs,freqs); + if (n <= 0) break; + + maxTermCounts[termNum] += n; + + for (int i=0; i>> 8; + int ilen = vIntSize(delta); + byte[] arr = bytes[doc]; + int newend = pos+ilen; + if (newend > arr.length) { + // We avoid a doubling strategy to lower memory usage. + // this faceting method isn't for docs with many terms. + // In hotspot, objects have 2 words of overhead, then fields, rounded up to a 64-bit boundary. + // TODO: figure out what array lengths we can round up to w/o actually using more memory + // (how much space does a byte[] take up? Is data preceded by a 32 bit length only? + // It should be safe to round up to the nearest 32 bits in any case. + int newLen = (newend + 3) & 0xfffffffc; // 4 byte alignment + byte[] newarr = new byte[newLen]; + System.arraycopy(arr, 0, newarr, 0, pos); + arr = newarr; + bytes[doc] = newarr; + } + pos = writeInt(delta, arr, pos); + index[doc] = (pos<<8) | 1; // update pointer to end index in byte[] + } else { + // OK, this int has data in it... find the end (a zero starting byte - not + // part of another number, hence not following a byte with the high bit set). + int ipos; + if (val==0) { + ipos=0; + } else if ((val & 0x0000ff80)==0) { + ipos=1; + } else if ((val & 0x00ff8000)==0) { + ipos=2; + } else if ((val & 0xff800000)==0) { + ipos=3; + } else { + ipos=4; + } + + int endPos = writeInt(delta, tempArr, ipos); + if (endPos <= 4) { + // value will fit in the integer... move bytes back + for (int j=ipos; j>>=8; + } + // point at the end index in the byte[] + index[doc] = (endPos<<8) | 1; + bytes[doc] = tempArr; + tempArr = new byte[12]; + } + + } + + } + + } + + te.next(); + } + + numTermsInField = te.getTermNumber(); + te.close(); + + long midPoint = System.currentTimeMillis(); + + if (termInstances == 0) { + // we didn't invert anything + // lower memory consumption. + index = this.index = null; + tnums = null; + } else { + + // + // transform intermediate form into the final form, building a single byte[] + // at a time, and releasing the intermediate byte[]s as we go to avoid + // increasing the memory footprint. + // + for (int pass = 0; pass<256; pass++) { + byte[] target = tnums[pass]; + int pos=0; // end in target; + if (target != null) { + pos = target.length; + } else { + target = new byte[4096]; + } + + // loop over documents, 0x00ppxxxx, 0x01ppxxxx, 0x02ppxxxx + // where pp is the pass (which array we are building), and xx is all values. + // each pass shares the same byte[] for termNumber lists. 
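+      // (a doc's term lists land in tnums[(doc >>> 16) & 0xff], the same index used
+      //  later by getCounts() to locate them)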
+ for (int docbase = pass<<16; docbase>> 8; + index[doc] = (pos<<8)|1; // change index to point to start of array + if ((pos & 0xff000000) != 0) { + // we only have 24 bits for the array index + throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "Too many values for UnInvertedField faceting on field "+field); + } + byte[] arr = bytes[doc]; + bytes[doc] = null; // IMPORTANT: allow GC to avoid OOM + if (target.length <= pos + len) { + int newlen = target.length; + /*** we don't have to worry about the array getting too large + * since the "pos" param will overflow first (only 24 bits available) + if ((newlen<<1) <= 0) { + // overflow... + newlen = Integer.MAX_VALUE; + if (newlen <= pos + len) { + throw new SolrException(400,"Too many terms to uninvert field!"); + } + } else { + while (newlen <= pos + len) newlen<<=1; // doubling strategy + } + ****/ + while (newlen <= pos + len) newlen<<=1; // doubling strategy + byte[] newtarget = new byte[newlen]; + System.arraycopy(target, 0, newtarget, 0, pos); + target = newtarget; + } + System.arraycopy(arr, 0, target, pos, len); + pos += len + 1; // skip single byte at end and leave it 0 for terminator + } + } + } + + // shrink array + if (pos < target.length) { + byte[] newtarget = new byte[pos]; + System.arraycopy(target, 0, newtarget, 0, pos); + target = newtarget; + } + + tnums[pass] = target; + + if ((pass << 16) > maxDoc) + break; + } + } + + long endTime = System.currentTimeMillis(); + + SolrCore.log.info("UnInverted multi-valued field " + field + ", memSize=" + memSize() + + ", time="+(endTime-startTime)+", phase1="+(midPoint-startTime) + + ", nTerms=" + numTermsInField + ", bigTerms=" + bigTerms.size() + + ", termInstances=" + termInstances + ); + } + + + + + public NamedList getCounts(SolrIndexSearcher searcher, DocSet baseDocs, int offset, int limit, int mincount, boolean missing, boolean sort, String prefix) throws IOException { + FieldType ft = searcher.getSchema().getFieldType(field); + + NamedList res = new NamedList(); // order is important + + DocSet docs = baseDocs; + int baseSize = docs.size(); + int maxDoc = searcher.maxDoc(); + + if (baseSize >= mincount) { + + final int[] index = this.index; + final int[] counts = new int[numTermsInField]; + + // + // If there is prefix, find it's start and end term numbers + // + int startTerm = 0; + int endTerm = numTermsInField; // one past the end + + NumberedTermEnum te = ti.getEnumerator(searcher.getReader()); + if (prefix != null && prefix.length() > 0) { + te.skipTo(prefix); + startTerm = te.getTermNumber(); + te.skipTo(prefix + "\uffff\uffff\uffff\uffff"); + endTerm = te.getTermNumber(); + } + + /*********** + // Alternative 2: get the docSet of the prefix (could take a while) and + // then do the intersection with the baseDocSet first. + if (prefix != null && prefix.length() > 0) { + docs = searcher.getDocSet(new ConstantScorePrefixQuery(new Term(field, ft.toInternal(prefix))), docs); + // The issue with this method are problems of returning 0 counts for terms w/o + // the prefix. We can't just filter out those terms later because it may + // mean that we didn't collect enough terms in the queue (in the sorted case). 
+ } + ***********/ + + boolean doNegative = baseSize > maxDoc >> 1 && termInstances > 0 + && startTerm==0 && endTerm==numTermsInField + && docs instanceof BitDocSet; + + if (doNegative) { + OpenBitSet bs = (OpenBitSet)((BitDocSet)docs).getBits().clone(); + bs.flip(0, maxDoc); + // TODO: when iterator across negative elements is available, use that + // instead of creating a new bitset and inverting. + docs = new BitDocSet(bs, maxDoc - baseSize); + // simply negating will mean that we have deleted docs in the set. + // that should be OK, as their entries in our table should be empty. + } + + // For the biggest terms, do straight set intersections + for (TopTerm tt : bigTerms.values()) { + // TODO: counts could be deferred if sorted==false + if (tt.termNum >= startTerm && tt.termNum < endTerm) { + counts[tt.termNum] = searcher.numDocs(new TermQuery(tt.term), docs); + } + } + + // TODO: we could short-circuit counting altogether for sorted faceting + // where we already have enough terms from the bigTerms + + // TODO: we could shrink the size of the collection array, and + // additionally break when the termNumber got above endTerm, but + // it would require two extra conditionals in the inner loop (although + // they would be predictable for the non-prefix case). + // Perhaps a different copy of the code would be warranted. + + if (termInstances > 0) { + DocIterator iter = docs.iterator(); + while (iter.hasNext()) { + int doc = iter.nextDoc(); + int code = index[doc]; + + if ((code & 0xff)==1) { + int pos = code>>>8; + int whichArray = (doc >>> 16) & 0xff; + byte[] arr = tnums[whichArray]; + int tnum = 0; + for(;;) { + int delta = 0; + for(;;) { + byte b = arr[pos++]; + delta = (delta << 7) | (b & 0x7f); + if ((b & 0x80) == 0) break; + } + if (delta == 0) break; + tnum += delta - TNUM_OFFSET; + counts[tnum]++; + } + } else { + int tnum = 0; + int delta = 0; + for (;;) { + delta = (delta << 7) | (code & 0x7f); + if ((code & 0x80)==0) { + if (delta==0) break; + tnum += delta - TNUM_OFFSET; + counts[tnum]++; + delta = 0; + } + code >>>= 8; + } + } + } + } + + int off=offset; + int lim=limit>=0 ? limit : Integer.MAX_VALUE; + + if (sort) { + int maxsize = limit>0 ? offset+limit : Integer.MAX_VALUE-1; + maxsize = Math.min(maxsize, numTermsInField); + final BoundedTreeSet queue = new BoundedTreeSet(maxsize); + int min=mincount-1; // the smallest value in the top 'N' values + for (int i=startTerm; imin) { + // NOTE: we use c>min rather than c>=min as an optimization because we are going in + // index order, so we already know that the keys are ordered. This can be very + // important if a lot of the counts are repeated (like zero counts would be). + + // minimize object creation and speed comparison by creating a long that + // encompases both count and term number. + // Since smaller values are kept in the TreeSet, make higher counts smaller. 
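+            // (the negated count is packed into the upper 32 bits of the long and the
+            //  term number into the lower 32 bits; see "pair" below)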
+ // + // for equal counts, lower term numbers + // should come first and hence be "greater" + + //long pair = (((long)c)<<32) | (0x7fffffff-i) ; // use if priority queue + long pair = (((long)-c)<<32) | i; + queue.add(new Long(pair)); + if (queue.size()>=maxsize) min=-(int)(queue.last().longValue() >>> 32); + } + } + // now select the right page from the results + for (Long p : queue) { + if (--off>=0) continue; + if (--lim<0) break; + int c = -(int)(p.longValue() >>> 32); + //int tnum = 0x7fffffff - (int)p.longValue(); // use if priority queue + int tnum = (int)p.longValue(); + String label = ft.indexedToReadable(getTermText(te, tnum)); + res.add(label, c); + } + } else { + // add results in index order + int i=startTerm; + if (mincount<=0) { + // if mincount<=0, then we won't discard any terms and we know exactly + // where to start. + i=startTerm+off; + off=0; + } + + for (; i=0) continue; + if (--lim<0) break; + + String label = ft.indexedToReadable(getTermText(te, i)); + res.add(label, c); + } + } + + te.close(); + } + + + if (missing) { + // TODO: a faster solution for this? + res.add(null, SimpleFacets.getFieldMissingCount(searcher, baseDocs, field)); + } + + return res; + } + + + String getTermText(NumberedTermEnum te, int termNum) throws IOException { + if (bigTerms.size() > 0) { + // see if the term is one of our big terms. + TopTerm tt = bigTerms.get(termNum); + if (tt != null) { + return tt.term.text(); + } + } + + te.skipTo(termNum); + return te.term().text(); + } + + + ////////////////////////////////////////////////////////////////// + //////////////////////////// caching ///////////////////////////// + ////////////////////////////////////////////////////////////////// + static final class CreationPlaceholder { + Object value; + } + + public static UnInvertedField getUnInvertedField(String field, SolrIndexSearcher searcher) throws IOException { + return (UnInvertedField)multiValuedFieldCache.get(searcher, field); + } + + static Cache multiValuedFieldCache = new Cache() { + protected Object createValue(SolrIndexSearcher searcher, Object key) throws IOException { + return new UnInvertedField((String)key, searcher); + } + }; + + /** Internal cache. (from lucene FieldCache) */ + abstract static class Cache { + private final Map readerCache = new WeakHashMap(); + + protected abstract Object createValue(SolrIndexSearcher searcher, Object key) throws IOException; + + public Object get(SolrIndexSearcher searcher, Object key) throws IOException { + Map innerCache; + Object value; + synchronized (readerCache) { + innerCache = (Map) readerCache.get(searcher); + if (innerCache == null) { + innerCache = new HashMap(); + readerCache.put(searcher, innerCache); + value = null; + } else { + value = innerCache.get(key); + } + if (value == null) { + value = new CreationPlaceholder(); + innerCache.put(key, value); + } + } + if (value instanceof CreationPlaceholder) { + synchronized (value) { + CreationPlaceholder progress = (CreationPlaceholder) value; + if (progress.value == null) { + progress.value = createValue(searcher, key); + synchronized (readerCache) { + innerCache.put(key, progress.value); + } + } + return progress.value; + } + } + + return value; + } + } +} + + +// How to share TermDocs (int[] score[])??? +// Hot to share TermPositions? 
+/*** +class TermEnumListener { + void doTerm(Term t) { + } + void done() { + } +} +***/ + + +class NumberedTermEnum extends TermEnum { + protected final IndexReader reader; + protected final TermIndex tindex; + protected TermEnum tenum; + protected int pos=-1; + protected Term t; + protected TermDocs termDocs; + + + NumberedTermEnum(IndexReader reader, TermIndex tindex) throws IOException { + this.reader = reader; + this.tindex = tindex; + } + + + NumberedTermEnum(IndexReader reader, TermIndex tindex, String termValue, int pos) throws IOException { + this.reader = reader; + this.tindex = tindex; + this.pos = pos; + tenum = reader.terms(tindex.createTerm(termValue)); + setTerm(); + } + + public TermDocs getTermDocs() throws IOException { + if (termDocs==null) termDocs = reader.termDocs(t); + else termDocs.seek(t); + return termDocs; + } + + protected boolean setTerm() { + t = tenum.term(); + if (t==null || t.field() != tindex.fterm.field()) { // intern'd compare + t = null; + return false; + } + return true; + } + + + public boolean next() throws IOException { + pos++; + boolean b = tenum.next(); + if (!b) { + t = null; + return false; + } + return setTerm(); // this is extra work if we know we are in bounds... + } + + public Term term() { + return t; + } + + public int docFreq() { + return tenum.docFreq(); + } + + public void close() throws IOException { + tenum.close(); + } + + public boolean skipTo(String target) throws IOException { + return skipTo(tindex.fterm.createTerm(target)); + } + + public boolean skipTo(Term target) throws IOException { + // already here + if (t != null && t.equals(target)) return true; + + int startIdx = Arrays.binarySearch(tindex.index,target.text()); + + if (startIdx >= 0) { + // we hit the term exactly... lucky us! + tenum = reader.terms(target); + pos = startIdx << tindex.intervalBits; + return setTerm(); + } + + // we didn't hit the term exactly + startIdx=-startIdx-1; + + if (startIdx == 0) { + // our target occurs *before* the first term + tenum = reader.terms(target); + pos = 0; + return setTerm(); + } + + // back up to the start of the block + startIdx--; + + if ((pos >> tindex.intervalBits) == startIdx && t != null && t.text().compareTo(target.text())<=0) { + // we are already in the right block and the current term is before the term we want, + // so we don't need to seek. + } else { + // seek to the right block + tenum = reader.terms(target.createTerm(tindex.index[startIdx])); + pos = startIdx << tindex.intervalBits; + setTerm(); // should be true since it's in the index + } + + + while (t != null && t.text().compareTo(target.text()) < 0) { + next(); + } + + return t != null; + } + + + public boolean skipTo(int termNumber) throws IOException { + int delta = termNumber - pos; + if (delta < 0 || delta > tindex.interval || tenum==null) { + int idx = termNumber >>> tindex.intervalBits; + String base = tindex.index[idx]; + pos = idx << tindex.intervalBits; + delta = termNumber - pos; + tenum = reader.terms(tindex.createTerm(base)); + } + while (--delta >= 0) { + boolean b = tenum.next(); + if (b==false) { + t = null; + return false; + } + ++pos; + } + return setTerm(); + } + + /** The current term number, starting at 0. + * Only valid if the previous call to next() or skipTo() returned true. + */ + public int getTermNumber() { + return pos; + } +} + + +/** + * Class to save memory by only storing every nth term (for random access), while + * numbering the terms, allowing them to be retrieved later by number. 
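+ * (With the default intervalBits of 7, every 128th term's text is kept; a lookup by term
+ * number seeks to the nearest stored entry and steps forward at most one interval.)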
+ * This is only valid when used with the IndexReader it was created with.
+ * The IndexReader is not actually stored to facilitate caching by using it as a key in
+ * a weak hash map.
+ */
+class TermIndex {
+  final static int intervalBits = 7;  // decrease to a low number like 2 for testing
+  final static int intervalMask = 0xffffffff >>> (32-intervalBits);
+  final static int interval = 1 << intervalBits;
+
+  final Term fterm; // prototype to be used in term construction w/o String.intern overhead
+  String[] index;
+  int nTerms;
+  long sizeOfStrings;
+
+  TermIndex(String field) {
+    this.fterm = new Term(field, "");
+  }
+
+  Term createTerm(String termVal) {
+    return fterm.createTerm(termVal);
+  }
+
+  NumberedTermEnum getEnumerator(IndexReader reader, int termNumber) throws IOException {
+    NumberedTermEnum te = new NumberedTermEnum(reader, this);
+    te.skipTo(termNumber);
+    return te;
+  }
+
+  /* The first time an enumerator is requested, it should be used
+     with next() to fully traverse all of the terms so the index
+     will be built.
+   */
+  NumberedTermEnum getEnumerator(IndexReader reader) throws IOException {
+    if (index==null) return new NumberedTermEnum(reader,this,"",0) {
+      ArrayList lst;
+
+      protected boolean setTerm() {
+        boolean b = super.setTerm();
+        if (b && (pos & intervalMask)==0) {
+          String text = term().text();
+          sizeOfStrings += text.length() << 1;
+          if (lst==null) {
+            lst = new ArrayList();
+          }
+          lst.add(text);
+        }
+        return b;
+      }
+
+      public boolean skipTo(Term target) throws IOException {
+        throw new UnsupportedOperationException();
+      }
+
+      public boolean skipTo(int termNumber) throws IOException {
+        throw new UnsupportedOperationException();
+      }
+
+      public void close() throws IOException {
+        nTerms=pos;
+        super.close();
+        index = lst!=null ? lst.toArray(new String[lst.size()]) : new String[0];
+      }
+    };
+    else return new NumberedTermEnum(reader,this,"",0);
+  }
+
+
+  /**
+   * Returns the approximate amount of memory taken by this TermIndex.
+   * This is only an approximation and doesn't take into account Java object overhead.
+   *
+   * @return
+   * the approximate memory consumption in bytes
+   */
+  public long memSize() {
+    // assume 8 byte references?
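+    // (roughly: four 8-byte fields/references, plus 8 bytes per entry in index[],
+    //  plus the characters of the cached term strings)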
+ return 8+8+8+8+(index.length<<3)+sizeOfStrings; + } +} + diff --git a/src/test/org/apache/solr/request/SimpleFacetsTest.java b/src/test/org/apache/solr/request/SimpleFacetsTest.java index ad3d8110de2..8bccacf4e9f 100644 --- a/src/test/org/apache/solr/request/SimpleFacetsTest.java +++ b/src/test/org/apache/solr/request/SimpleFacetsTest.java @@ -303,9 +303,10 @@ public class SimpleFacetsTest extends AbstractSolrTestCase { } public void testFacetMultiValued() { - doFacets("t_s"); - doFacets("t_s", "facet.enum.cache.minDf", "2"); - doFacets("t_s", "facet.enum.cache.minDf", "100"); + doFacetPrefix("t_s", "facet.method","enum"); + doFacetPrefix("t_s", "facet.method", "enum", "facet.enum.cache.minDf", "2"); + doFacetPrefix("t_s", "facet.method", "enum", "facet.enum.cache.minDf", "100"); + doFacetPrefix("t_s", "facet.method", "fc"); } public void testFacetSingleValued() { @@ -476,9 +477,10 @@ public class SimpleFacetsTest extends AbstractSolrTestCase { public void testFacetPrefixMultiValued() { - doFacetPrefix("t_s"); - doFacetPrefix("t_s", "facet.enum.cache.minDf", "3"); - doFacetPrefix("t_s", "facet.enum.cache.minDf", "100"); + doFacetPrefix("t_s", "facet.method","enum"); + doFacetPrefix("t_s", "facet.method", "enum", "facet.enum.cache.minDf", "3"); + doFacetPrefix("t_s", "facet.method", "enum", "facet.enum.cache.minDf", "100"); + doFacetPrefix("t_s", "facet.method", "fc"); } public void testFacetPrefixSingleValued() { diff --git a/src/test/org/apache/solr/request/TestFaceting.java b/src/test/org/apache/solr/request/TestFaceting.java new file mode 100755 index 00000000000..d1101489d54 --- /dev/null +++ b/src/test/org/apache/solr/request/TestFaceting.java @@ -0,0 +1,242 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.solr.request; + +import org.apache.lucene.index.Term; +import org.apache.solr.util.AbstractSolrTestCase; + +import java.util.Random; + +/** + * @version $Id$ + */ +public class TestFaceting extends AbstractSolrTestCase { + + public String getSchemaFile() { return "schema11.xml"; } + public String getSolrConfigFile() { return "solrconfig.xml"; } + + public void setUp() throws Exception { + super.setUp(); + } + public void tearDown() throws Exception { + close(); + super.tearDown(); + } + + String t(int tnum) { + return String.format("%08d", tnum); + } + + void createIndex(int nTerms) { + assertU(delQ("*:*")); + for (int i=0; i0, te.skipTo(proto.createTerm("000"))); + assertEquals(0, te.getTermNumber()); + if (size>0) { + assertEquals(t(0), te.term().text()); + } else { + assertEquals(null, te.term()); + } + + if (size>0) { + // test seeking by term number + for (int i=0; i +