mirror of https://github.com/apache/lucene.git
SOLR-475: multi-valued faceting via un-inverted field
git-svn-id: https://svn.apache.org/repos/asf/lucene/solr/trunk@720403 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent b3a2445d6b
commit 156491848a
@@ -96,6 +96,11 @@ Optimizations

 2. SOLR-808: Write string keys in Maps as extern strings in the javabin format. (Noble Paul via shalin)

+3. SOLR-475: New faceting method with better performance and smaller memory usage for
+   multi-valued fields with many unique values but relatively few values per document.
+   Controllable via the facet.method parameter - "fc" is the new default method and "enum"
+   is the original method. (yonik)
+
 Bug Fixes
 ----------------------
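As a usage sketch (an editorial aside, assuming the SolrJ client of this era; the field name "cat" is hypothetical), the method can be chosen per request; "fc" is now the default, so setting the parameter explicitly only matters to force the old behavior:

    import org.apache.solr.client.solrj.SolrQuery;
    import org.apache.solr.common.params.FacetParams;

    public class FacetMethodExample {
      public static SolrQuery buildFacetQuery() {
        SolrQuery q = new SolrQuery("*:*");
        q.setFacet(true);
        q.addFacetField("cat");   // hypothetical multi-valued field
        // request the original enumeration-based method instead of the new "fc" default
        q.set(FacetParams.FACET_METHOD, FacetParams.FACET_METHOD_enum);
        return q;
      }
    }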
@@ -29,6 +29,20 @@ public interface FacetParams {
    * Should facet counts be calculated?
    */
   public static final String FACET = "facet";
+
+  /** What method should be used to do the faceting */
+  public static final String FACET_METHOD = FACET + ".method";
+
+  /** Value for FACET_METHOD param to indicate that Solr should enumerate over terms
+   * in a field to calculate the facet counts.
+   */
+  public static final String FACET_METHOD_enum = "enum";
+
+  /** Value for FACET_METHOD param to indicate that Solr should enumerate over documents
+   * and count up terms by consulting an uninverted representation of the field values
+   * (such as the FieldCache used for sorting).
+   */
+  public static final String FACET_METHOD_fc = "fc";
 
   /**
    * Any lucene formatted queries the user would like to use for
@@ -148,16 +148,32 @@ public class SimpleFacets {
     boolean sort = params.getFieldBool(field, FacetParams.FACET_SORT, limit>0);
     String prefix = params.getFieldParam(field,FacetParams.FACET_PREFIX);


     NamedList counts;
     SchemaField sf = searcher.getSchema().getField(field);
     FieldType ft = sf.getType();
-    if (sf.multiValued() || ft.isTokenized() || ft instanceof BoolField) {
+
+    // determine what type of faceting method to use
+    String method = params.getFieldParam(field, FacetParams.FACET_METHOD);
+    boolean enumMethod = FacetParams.FACET_METHOD_enum.equals(method);
+    if (method == null && ft instanceof BoolField) {
       // Always use filters for booleans... we know the number of values is very small.
+      enumMethod = true;
+    }
+    boolean multiToken = sf.multiValued() || ft.isTokenized();
+
+    // unless the enum method is explicitly specified, use a counting method.
+    if (enumMethod) {
       counts = getFacetTermEnumCounts(searcher, docs, field, offset, limit, mincount,missing,sort,prefix);
     } else {
-      // TODO: future logic could use filters instead of the fieldcache if
-      // the number of terms in the field is small enough.
-      counts = getFieldCacheCounts(searcher, docs, field, offset,limit, mincount, missing, sort, prefix);
+      if (multiToken) {
+        UnInvertedField uif = UnInvertedField.getUnInvertedField(field, searcher);
+        counts = uif.getCounts(searcher, docs, offset, limit, mincount,missing,sort,prefix);
+      } else {
+        // TODO: future logic could use filters instead of the fieldcache if
+        // the number of terms in the field is small enough.
+        counts = getFieldCacheCounts(searcher, docs, field, offset,limit, mincount, missing, sort, prefix);
+      }
     }

     return counts;
@@ -0,0 +1,908 @@
/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.solr.request;

import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermDocs;
import org.apache.lucene.index.TermEnum;
import org.apache.lucene.search.TermQuery;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.common.SolrException;
import org.apache.solr.core.SolrCore;
import org.apache.solr.request.SimpleFacets;
import org.apache.solr.schema.FieldType;
import org.apache.solr.search.BitDocSet;
import org.apache.solr.search.DocIterator;
import org.apache.solr.search.DocSet;
import org.apache.solr.search.SolrIndexSearcher;
import org.apache.solr.util.BoundedTreeSet;
import org.apache.solr.util.OpenBitSet;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.Map;
import java.util.WeakHashMap;

/**
 *
 * Final form of the un-inverted field:
 *   Each document points to a list of term numbers that are contained in that document.
 *
 *   Term numbers are in sorted order, and are encoded as variable-length deltas from the
 *   previous term number.  Real term numbers start at 2 since 0 and 1 are reserved.  A
 *   term number of 0 signals the end of the termNumber list.
 *
 *   There is a single int[maxDoc()] which either contains a pointer into a byte[] for
 *   the termNumber lists, or directly contains the termNumber list if it fits in the 4
 *   bytes of an integer.  If the first byte in the integer is 1, the next 3 bytes
 *   are a pointer into a byte[] where the termNumber list starts.
 *
 *   There are actually 256 byte arrays, to compensate for the fact that the pointers
 *   into the byte arrays are only 3 bytes long.  The correct byte array for a document
 *   is a function of its id.
 *
 *   To save space and speed up faceting, any term that matches enough documents will
 *   not be un-inverted... it will be skipped while building the un-inverted field structure,
 *   and will use a set intersection method during faceting.
 *
 *   To further save memory, the terms (the actual string values) are not all stored in
 *   memory, but a TermIndex is used to convert term numbers to term values only
 *   for the terms needed after faceting has completed.  Only every 128th term value
 *   is stored, along with its corresponding term number, and this is used as an
 *   index to find the closest term and iterate until the desired number is hit (very
 *   much like Lucene's own internal term index).
 *
 */
class UnInvertedField {
  private static int TNUM_OFFSET=2;
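
  // For illustration: with TNUM_OFFSET=2, a document containing term numbers {5, 10, 25}
  // stores deltas {7, 7, 17}; each delta fits in a single vInt byte, so index[doc] holds
  // them directly as 0x00110707 (low byte first), the high zero byte acting as the list
  // terminator.  The low byte is 7, not 1, so it is read as inline data, not as a pointer.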

  static class TopTerm {
    Term term;
    int termNum;

    long memSize() {
      return 8 +   // obj header
             8 + 8 + (term.text().length()<<1) +  // term
             4;    // int
    }
  }

  String field;
  int numTermsInField;
  int termsInverted;  // number of unique terms that were un-inverted
  long termInstances; // total number of references to term numbers
  final TermIndex ti;

  int[] index;
  byte[][] tnums = new byte[256][];
  int[] maxTermCounts;
  final Map<Integer,TopTerm> bigTerms = new LinkedHashMap<Integer,TopTerm>();


  public long memSize() {
    long sz = 6*8 + 12; // local fields
    sz += bigTerms.size() * 64;
    for (TopTerm tt : bigTerms.values()) {
      sz += tt.memSize();
    }
    if (index != null) sz += index.length * 4;
    if (tnums!=null) {
      for (byte[] arr : tnums)
        if (arr != null) sz += arr.length;
    }
    if (maxTermCounts != null)
      sz += maxTermCounts.length * 4;
    sz += ti.memSize();
    return sz;
  }


  /** Number of bytes to represent an unsigned int as a vint. */
  static int vIntSize(int x) {
    if ((x & (0xffffffff << (7*1))) == 0 ) {
      return 1;
    }
    if ((x & (0xffffffff << (7*2))) == 0 ) {
      return 2;
    }
    if ((x & (0xffffffff << (7*3))) == 0 ) {
      return 3;
    }
    if ((x & (0xffffffff << (7*4))) == 0 ) {
      return 4;
    }
    return 5;
  }


  // todo: if we know the size of the vInt already, we could do
  // a single switch on the size
  static int writeInt(int x, byte[] arr, int pos) {
    int a;
    a = (x >>> (7*4));
    if (a != 0) {
      arr[pos++] = (byte)(a | 0x80);
    }
    a = (x >>> (7*3));
    if (a != 0) {
      arr[pos++] = (byte)(a | 0x80);
    }
    a = (x >>> (7*2));
    if (a != 0) {
      arr[pos++] = (byte)(a | 0x80);
    }
    a = (x >>> (7*1));
    if (a != 0) {
      arr[pos++] = (byte)(a | 0x80);
    }
    arr[pos++] = (byte)(x & 0x7f);
    return pos;
  }
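
  // Worked example for illustration: writeInt(200, arr, 0) emits 0x81 0x48 and returns 2:
  // the high 7-bit group (200 >>> 7 == 1) is written with the continuation bit 0x80 set,
  // then the low 7 bits (200 & 0x7f == 0x48) end the value.  vIntSize(200) == 2 agrees.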


  public UnInvertedField(String field, SolrIndexSearcher searcher) throws IOException {
    this.field = field;
    this.ti = new TermIndex(field);
    uninvert(searcher);
  }


  private void uninvert(SolrIndexSearcher searcher) throws IOException {
    long startTime = System.currentTimeMillis();

    IndexReader reader = searcher.getReader();
    int maxDoc = reader.maxDoc();

    int[] index = new int[maxDoc];       // immediate term numbers, or the index into the byte[] representing the last number
    this.index = index;
    final int[] lastTerm = new int[maxDoc];    // last term we saw for this document
    final byte[][] bytes = new byte[maxDoc][]; // list of term numbers for the doc (delta encoded vInts)
    maxTermCounts = new int[1024];

    NumberedTermEnum te = ti.getEnumerator(reader);

    // threshold, over which we use set intersections instead of counting
    // to (1) save memory, and (2) speed up faceting.
    // Add 2 for testing purposes so that there will always be some terms under
    // the threshold even when the index is very small.
    int threshold = maxDoc / 20 + 2;
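    // For example: with maxDoc=1000 the threshold is 52, so any term matching 52 or more
    // documents is treated as a "big term" and later counted via a set intersection
    // instead of being added to the un-inverted lists.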
    // threshold = 2000000000;   //////////////////////////////// USE FOR TESTING
    int[] docs = new int[1000];
    int[] freqs = new int[1000];

    // we need a minimum of 9 bytes, but round up to 12 since the space would
    // be wasted with most allocators anyway.
    byte[] tempArr = new byte[12];

    //
    // enumerate all terms, and build an intermediate form of the un-inverted field.
    //
    // During this intermediate form, every document has a (potential) byte[]
    // and the int[maxDoc()] array either contains the termNumber list directly
    // or the *end* offset of the termNumber list in its byte array (for faster
    // appending and faster creation of the final form).
    //
    // idea... if things are too large while building, we could do a range of docs
    // at a time (but it would be a fair amount slower to build)
    // could also do ranges in parallel to take advantage of multiple CPUs

    // OPTIONAL: remap the largest df terms to the lowest 128 (single byte)
    // values.  This requires going over the field first to find the most
    // frequent terms ahead of time.

    for (;;) {
      Term t = te.term();
      if (t==null) break;

      int termNum = te.getTermNumber();

      if (termNum >= maxTermCounts.length) {
        // resize, but conserve memory by not doubling
        // resize at end??? we waste a maximum of 16K (average of 8K)
        int[] newMaxTermCounts = new int[maxTermCounts.length+4096];
        System.arraycopy(maxTermCounts, 0, newMaxTermCounts, 0, termNum);
        maxTermCounts = newMaxTermCounts;
      }

      int df = te.docFreq();
      if (df >= threshold) {
        TopTerm topTerm = new TopTerm();
        topTerm.term = t;
        topTerm.termNum = termNum;
        bigTerms.put(topTerm.termNum, topTerm);

        DocSet set = searcher.getDocSet(new TermQuery(topTerm.term));
        maxTermCounts[termNum] = set.size();

        te.next();
        continue;
      }

      termsInverted++;

      TermDocs td = te.getTermDocs();
      td.seek(te);
      for(;;) {
        int n = td.read(docs,freqs);
        if (n <= 0) break;

        maxTermCounts[termNum] += n;

        for (int i=0; i<n; i++) {
          termInstances++;
          int doc = docs[i];
          // add 2 to the term number to make room for special reserved values:
          // 0 (end term) and 1 (index into byte array follows)
          int delta = termNum - lastTerm[doc] + TNUM_OFFSET;
          lastTerm[doc] = termNum;
          int val = index[doc];

          if ((val & 0xff)==1) {
            // index into byte array (actually the end of
            // the doc-specific byte[] when building)
            int pos = val >>> 8;
            int ilen = vIntSize(delta);
            byte[] arr = bytes[doc];
            int newend = pos+ilen;
            if (newend > arr.length) {
              // We avoid a doubling strategy to lower memory usage.
              // this faceting method isn't for docs with many terms.
              // In hotspot, objects have 2 words of overhead, then fields, rounded up to a 64-bit boundary.
              // TODO: figure out what array lengths we can round up to w/o actually using more memory
              // (how much space does a byte[] take up?  Is data preceded by a 32 bit length only?
              // It should be safe to round up to the nearest 32 bits in any case.
              int newLen = (newend + 3) & 0xfffffffc;  // 4 byte alignment
              byte[] newarr = new byte[newLen];
              System.arraycopy(arr, 0, newarr, 0, pos);
              arr = newarr;
              bytes[doc] = newarr;
            }
            pos = writeInt(delta, arr, pos);
            index[doc] = (pos<<8) | 1;  // update pointer to end index in byte[]
          } else {
            // OK, this int has data in it... find the end (a zero starting byte - not
            // part of another number, hence not following a byte with the high bit set).
            int ipos;
            if (val==0) {
              ipos=0;
            } else if ((val & 0x0000ff80)==0) {
              ipos=1;
            } else if ((val & 0x00ff8000)==0) {
              ipos=2;
            } else if ((val & 0xff800000)==0) {
              ipos=3;
            } else {
              ipos=4;
            }

            int endPos = writeInt(delta, tempArr, ipos);
            if (endPos <= 4) {
              // value will fit in the integer... move bytes back
              for (int j=ipos; j<endPos; j++) {
                val |= (tempArr[j] & 0xff) << (j<<3);
              }
              index[doc] = val;
            } else {
              // value won't fit... move integer into byte[]
              for (int j=0; j<ipos; j++) {
                tempArr[j] = (byte)val;
                val >>>=8;
              }
              // point at the end index in the byte[]
              index[doc] = (endPos<<8) | 1;
              bytes[doc] = tempArr;
              tempArr = new byte[12];
            }

          }

        }

      }

      te.next();
    }

    numTermsInField = te.getTermNumber();
    te.close();

    long midPoint = System.currentTimeMillis();

    if (termInstances == 0) {
      // we didn't invert anything
      // lower memory consumption.
      index = this.index = null;
      tnums = null;
    } else {

      //
      // transform intermediate form into the final form, building a single byte[]
      // at a time, and releasing the intermediate byte[]s as we go to avoid
      // increasing the memory footprint.
      //
      for (int pass = 0; pass<256; pass++) {
        byte[] target = tnums[pass];
        int pos=0;  // end in target;
        if (target != null) {
          pos = target.length;
        } else {
          target = new byte[4096];
        }

        // loop over documents, 0x00ppxxxx, 0x01ppxxxx, 0x02ppxxxx
        // where pp is the pass (which array we are building), and xx is all values.
        // each pass shares the same byte[] for termNumber lists.
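        // For instance, doc ids 0x00010000-0x0001ffff, 0x01010000-0x0101ffff, etc. all
        // satisfy (doc >>> 16) & 0xff == 1 and therefore share tnums[1].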
        for (int docbase = pass<<16; docbase<maxDoc; docbase+=(1<<24)) {
          int lim = Math.min(docbase + (1<<16), maxDoc);
          for (int doc=docbase; doc<lim; doc++) {
            int val = index[doc];
            if ((val&0xff) == 1) {
              int len = val >>> 8;
              index[doc] = (pos<<8)|1; // change index to point to start of array
              if ((pos & 0xff000000) != 0) {
                // we only have 24 bits for the array index
                throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "Too many values for UnInvertedField faceting on field "+field);
              }
              byte[] arr = bytes[doc];
              bytes[doc] = null;        // IMPORTANT: allow GC to avoid OOM
              if (target.length <= pos + len) {
                int newlen = target.length;
                /*** we don't have to worry about the array getting too large
                 * since the "pos" param will overflow first (only 24 bits available)
                if ((newlen<<1) <= 0) {
                  // overflow...
                  newlen = Integer.MAX_VALUE;
                  if (newlen <= pos + len) {
                    throw new SolrException(400,"Too many terms to uninvert field!");
                  }
                } else {
                  while (newlen <= pos + len) newlen<<=1;  // doubling strategy
                }
                ****/
                while (newlen <= pos + len) newlen<<=1;  // doubling strategy
                byte[] newtarget = new byte[newlen];
                System.arraycopy(target, 0, newtarget, 0, pos);
                target = newtarget;
              }
              System.arraycopy(arr, 0, target, pos, len);
              pos += len + 1;  // skip single byte at end and leave it 0 for terminator
            }
          }
        }

        // shrink array
        if (pos < target.length) {
          byte[] newtarget = new byte[pos];
          System.arraycopy(target, 0, newtarget, 0, pos);
          target = newtarget;
        }

        tnums[pass] = target;

        if ((pass << 16) > maxDoc)
          break;
      }
    }

    long endTime = System.currentTimeMillis();

    SolrCore.log.info("UnInverted multi-valued field " + field + ", memSize=" + memSize()
            + ", time="+(endTime-startTime)+", phase1="+(midPoint-startTime)
            + ", nTerms=" + numTermsInField + ", bigTerms=" + bigTerms.size()
            + ", termInstances=" + termInstances
    );
  }



  public NamedList getCounts(SolrIndexSearcher searcher, DocSet baseDocs, int offset, int limit, int mincount, boolean missing, boolean sort, String prefix) throws IOException {
    FieldType ft = searcher.getSchema().getFieldType(field);

    NamedList res = new NamedList();  // order is important

    DocSet docs = baseDocs;
    int baseSize = docs.size();
    int maxDoc = searcher.maxDoc();

    if (baseSize >= mincount) {

      final int[] index = this.index;
      final int[] counts = new int[numTermsInField];

      //
      // If there is a prefix, find its start and end term numbers
      //
      int startTerm = 0;
      int endTerm = numTermsInField;  // one past the end

      NumberedTermEnum te = ti.getEnumerator(searcher.getReader());
      if (prefix != null && prefix.length() > 0) {
        te.skipTo(prefix);
        startTerm = te.getTermNumber();
        te.skipTo(prefix + "\uffff\uffff\uffff\uffff");
        endTerm = te.getTermNumber();
      }

      /***********
      // Alternative 2: get the docSet of the prefix (could take a while) and
      // then do the intersection with the baseDocSet first.
      if (prefix != null && prefix.length() > 0) {
        docs = searcher.getDocSet(new ConstantScorePrefixQuery(new Term(field, ft.toInternal(prefix))), docs);
        // The issue with this method are problems of returning 0 counts for terms w/o
        // the prefix.  We can't just filter out those terms later because it may
        // mean that we didn't collect enough terms in the queue (in the sorted case).
      }
      ***********/

      boolean doNegative = baseSize > maxDoc >> 1 && termInstances > 0
              && startTerm==0 && endTerm==numTermsInField
              && docs instanceof BitDocSet;

      if (doNegative) {
        OpenBitSet bs = (OpenBitSet)((BitDocSet)docs).getBits().clone();
        bs.flip(0, maxDoc);
        // TODO: when iterator across negative elements is available, use that
        // instead of creating a new bitset and inverting.
        docs = new BitDocSet(bs, maxDoc - baseSize);
        // simply negating will mean that we have deleted docs in the set.
        // that should be OK, as their entries in our table should be empty.
      }
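      // For illustration: if the base set covers 900 of 1000 docs, it is cheaper to
      // iterate the 100-doc complement; the true count for each term i is then recovered
      // below as maxTermCounts[i] - counts[i].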

      // For the biggest terms, do straight set intersections
      for (TopTerm tt : bigTerms.values()) {
        // TODO: counts could be deferred if sorted==false
        if (tt.termNum >= startTerm && tt.termNum < endTerm) {
          counts[tt.termNum] = searcher.numDocs(new TermQuery(tt.term), docs);
        }
      }

      // TODO: we could short-circuit counting altogether for sorted faceting
      // where we already have enough terms from the bigTerms

      // TODO: we could shrink the size of the collection array, and
      // additionally break when the termNumber got above endTerm, but
      // it would require two extra conditionals in the inner loop (although
      // they would be predictable for the non-prefix case).
      // Perhaps a different copy of the code would be warranted.

      if (termInstances > 0) {
        DocIterator iter = docs.iterator();
        while (iter.hasNext()) {
          int doc = iter.nextDoc();
          int code = index[doc];

          if ((code & 0xff)==1) {
            int pos = code>>>8;
            int whichArray = (doc >>> 16) & 0xff;
            byte[] arr = tnums[whichArray];
            int tnum = 0;
            for(;;) {
              int delta = 0;
              for(;;) {
                byte b = arr[pos++];
                delta = (delta << 7) | (b & 0x7f);
                if ((b & 0x80) == 0) break;
              }
              if (delta == 0) break;
              tnum += delta - TNUM_OFFSET;
              counts[tnum]++;
            }
          } else {
            int tnum = 0;
            int delta = 0;
            for (;;) {
              delta = (delta << 7) | (code & 0x7f);
              if ((code & 0x80)==0) {
                if (delta==0) break;
                tnum += delta - TNUM_OFFSET;
                counts[tnum]++;
                delta = 0;
              }
              code >>>= 8;
            }
          }
        }
      }

      int off=offset;
      int lim=limit>=0 ? limit : Integer.MAX_VALUE;

      if (sort) {
        int maxsize = limit>0 ? offset+limit : Integer.MAX_VALUE-1;
        maxsize = Math.min(maxsize, numTermsInField);
        final BoundedTreeSet<Long> queue = new BoundedTreeSet<Long>(maxsize);
        int min=mincount-1;  // the smallest value in the top 'N' values
        for (int i=startTerm; i<endTerm; i++) {
          int c = doNegative ? maxTermCounts[i] - counts[i] : counts[i];
          if (c>min) {
            // NOTE: we use c>min rather than c>=min as an optimization because we are going in
            // index order, so we already know that the keys are ordered.  This can be very
            // important if a lot of the counts are repeated (like zero counts would be).

            // minimize object creation and speed comparison by creating a long that
            // encompasses both count and term number.
            // Since smaller values are kept in the TreeSet, make higher counts smaller.
            //
            //   for equal counts, lower term numbers
            //   should come first and hence be "greater"

            //long pair = (((long)c)<<32) | (0x7fffffff-i) ;   // use if priority queue
            long pair = (((long)-c)<<32) | i;
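            // For illustration: c=3, i=7 gives pair 0xFFFFFFFD00000007; negating the count
            // sorts higher counts first in the ascending TreeSet, and the low 32 bits break
            // ties by term number (decoded later via -(int)(pair>>>32) and (int)pair).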
            queue.add(new Long(pair));
            if (queue.size()>=maxsize) min=-(int)(queue.last().longValue() >>> 32);
          }
        }
        // now select the right page from the results
        for (Long p : queue) {
          if (--off>=0) continue;
          if (--lim<0) break;
          int c = -(int)(p.longValue() >>> 32);
          //int tnum = 0x7fffffff - (int)p.longValue();  // use if priority queue
          int tnum = (int)p.longValue();
          String label = ft.indexedToReadable(getTermText(te, tnum));
          res.add(label, c);
        }
      } else {
        // add results in index order
        int i=startTerm;
        if (mincount<=0) {
          // if mincount<=0, then we won't discard any terms and we know exactly
          // where to start.
          i=startTerm+off;
          off=0;
        }

        for (; i<endTerm; i++) {
          int c = doNegative ? maxTermCounts[i] - counts[i] : counts[i];
          if (c==0) {

          }
          if (c<mincount || --off>=0) continue;
          if (--lim<0) break;

          String label = ft.indexedToReadable(getTermText(te, i));
          res.add(label, c);
        }
      }

      te.close();
    }


    if (missing) {
      // TODO: a faster solution for this?
      res.add(null, SimpleFacets.getFieldMissingCount(searcher, baseDocs, field));
    }

    return res;
  }


  String getTermText(NumberedTermEnum te, int termNum) throws IOException {
    if (bigTerms.size() > 0) {
      // see if the term is one of our big terms.
      TopTerm tt = bigTerms.get(termNum);
      if (tt != null) {
        return tt.term.text();
      }
    }

    te.skipTo(termNum);
    return te.term().text();
  }


  //////////////////////////////////////////////////////////////////
  //////////////////////////// caching /////////////////////////////
  //////////////////////////////////////////////////////////////////
  static final class CreationPlaceholder {
    Object value;
  }

  public static UnInvertedField getUnInvertedField(String field, SolrIndexSearcher searcher) throws IOException {
    return (UnInvertedField)multiValuedFieldCache.get(searcher, field);
  }

  static Cache multiValuedFieldCache = new Cache() {
    protected Object createValue(SolrIndexSearcher searcher, Object key) throws IOException {
      return new UnInvertedField((String)key, searcher);
    }
  };

  /** Internal cache. (from lucene FieldCache) */
  abstract static class Cache {
    private final Map readerCache = new WeakHashMap();

    protected abstract Object createValue(SolrIndexSearcher searcher, Object key) throws IOException;

    public Object get(SolrIndexSearcher searcher, Object key) throws IOException {
      Map innerCache;
      Object value;
      synchronized (readerCache) {
        innerCache = (Map) readerCache.get(searcher);
        if (innerCache == null) {
          innerCache = new HashMap();
          readerCache.put(searcher, innerCache);
          value = null;
        } else {
          value = innerCache.get(key);
        }
        if (value == null) {
          value = new CreationPlaceholder();
          innerCache.put(key, value);
        }
      }
      if (value instanceof CreationPlaceholder) {
        synchronized (value) {
          CreationPlaceholder progress = (CreationPlaceholder) value;
          if (progress.value == null) {
            progress.value = createValue(searcher, key);
            synchronized (readerCache) {
              innerCache.put(key, progress.value);
            }
          }
          return progress.value;
        }
      }

      return value;
    }
  }
}


// How to share TermDocs (int[] score[])???
// How to share TermPositions?
/***
class TermEnumListener {
  void doTerm(Term t) {
  }
  void done() {
  }
}
***/


class NumberedTermEnum extends TermEnum {
  protected final IndexReader reader;
  protected final TermIndex tindex;
  protected TermEnum tenum;
  protected int pos=-1;
  protected Term t;
  protected TermDocs termDocs;


  NumberedTermEnum(IndexReader reader, TermIndex tindex) throws IOException {
    this.reader = reader;
    this.tindex = tindex;
  }


  NumberedTermEnum(IndexReader reader, TermIndex tindex, String termValue, int pos) throws IOException {
    this.reader = reader;
    this.tindex = tindex;
    this.pos = pos;
    tenum = reader.terms(tindex.createTerm(termValue));
    setTerm();
  }

  public TermDocs getTermDocs() throws IOException {
    if (termDocs==null) termDocs = reader.termDocs(t);
    else termDocs.seek(t);
    return termDocs;
  }

  protected boolean setTerm() {
    t = tenum.term();
    if (t==null || t.field() != tindex.fterm.field()) {  // intern'd compare
      t = null;
      return false;
    }
    return true;
  }


  public boolean next() throws IOException {
    pos++;
    boolean b = tenum.next();
    if (!b) {
      t = null;
      return false;
    }
    return setTerm();  // this is extra work if we know we are in bounds...
  }

  public Term term() {
    return t;
  }

  public int docFreq() {
    return tenum.docFreq();
  }

  public void close() throws IOException {
    tenum.close();
  }

  public boolean skipTo(String target) throws IOException {
    return skipTo(tindex.fterm.createTerm(target));
  }

  public boolean skipTo(Term target) throws IOException {
    // already here
    if (t != null && t.equals(target)) return true;

    int startIdx = Arrays.binarySearch(tindex.index,target.text());

    if (startIdx >= 0) {
      // we hit the term exactly... lucky us!
      tenum = reader.terms(target);
      pos = startIdx << tindex.intervalBits;
      return setTerm();
    }

    // we didn't hit the term exactly
    startIdx=-startIdx-1;

    if (startIdx == 0) {
      // our target occurs *before* the first term
      tenum = reader.terms(target);
      pos = 0;
      return setTerm();
    }

    // back up to the start of the block
    startIdx--;

    if ((pos >> tindex.intervalBits) == startIdx && t != null && t.text().compareTo(target.text())<=0) {
      // we are already in the right block and the current term is before the term we want,
      // so we don't need to seek.
    } else {
      // seek to the right block
      tenum = reader.terms(target.createTerm(tindex.index[startIdx]));
      pos = startIdx << tindex.intervalBits;
      setTerm();  // should be true since it's in the index
    }

    while (t != null && t.text().compareTo(target.text()) < 0) {
      next();
    }

    return t != null;
  }


  public boolean skipTo(int termNumber) throws IOException {
    int delta = termNumber - pos;
    if (delta < 0 || delta > tindex.interval || tenum==null) {
      int idx = termNumber >>> tindex.intervalBits;
      String base = tindex.index[idx];
      pos = idx << tindex.intervalBits;
      delta = termNumber - pos;
      tenum = reader.terms(tindex.createTerm(base));
    }
    while (--delta >= 0) {
      boolean b = tenum.next();
      if (!b) {
        t = null;
        return false;
      }
      ++pos;
    }
    return setTerm();
  }

  /** The current term number, starting at 0.
   * Only valid if the previous call to next() or skipTo() returned true.
   */
  public int getTermNumber() {
    return pos;
  }
}


/**
 * Class to save memory by only storing every nth term (for random access), while
 * numbering the terms, allowing them to be retrieved later by number.
 * This is only valid when used with the IndexReader it was created with.
 * The IndexReader is not actually stored to facilitate caching by using it as a key in
 * a weak hash map.
 */
class TermIndex {
  final static int intervalBits = 7;  // decrease to a low number like 2 for testing
  final static int intervalMask = 0xffffffff >>> (32-intervalBits);
  final static int interval = 1 << intervalBits;
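  // For illustration: with intervalBits=7 the interval is 128, so term number 300 maps
  // to index[300 >>> 7] == index[2] (term number 256); skipTo(300) seeks to that base
  // term and then calls next() 44 times to reach the exact term.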

  final Term fterm; // prototype to be used in term construction w/o String.intern overhead
  String[] index;
  int nTerms;
  long sizeOfStrings;

  TermIndex(String field) {
    this.fterm = new Term(field, "");
  }

  Term createTerm(String termVal) {
    return fterm.createTerm(termVal);
  }

  NumberedTermEnum getEnumerator(IndexReader reader, int termNumber) throws IOException {
    NumberedTermEnum te = new NumberedTermEnum(reader, this);
    te.skipTo(termNumber);
    return te;
  }

  /* The first time an enumerator is requested, it should be used
     with next() to fully traverse all of the terms so the index
     will be built.
   */
  NumberedTermEnum getEnumerator(IndexReader reader) throws IOException {
    if (index==null) return new NumberedTermEnum(reader,this,"",0) {
      ArrayList<String> lst;

      protected boolean setTerm() {
        boolean b = super.setTerm();
        if (b && (pos & intervalMask)==0) {
          String text = term().text();
          sizeOfStrings += text.length() << 1;
          if (lst==null) {
            lst = new ArrayList<String>();
          }
          lst.add(text);
        }
        return b;
      }

      public boolean skipTo(Term target) throws IOException {
        throw new UnsupportedOperationException();
      }

      public boolean skipTo(int termNumber) throws IOException {
        throw new UnsupportedOperationException();
      }

      public void close() throws IOException {
        nTerms=pos;
        super.close();
        index = lst!=null ? lst.toArray(new String[lst.size()]) : new String[0];
      }
    };
    else return new NumberedTermEnum(reader,this,"",0);
  }


  /**
   * Returns the approximate amount of memory taken by this TermIndex.
   * This is only an approximation and doesn't take into account java object overhead.
   *
   * @return the approximate memory consumption in bytes
   */
  public long memSize() {
    // assume 8 byte references?
    return 8+8+8+8+(index.length<<3)+sizeOfStrings;
  }
}
@@ -303,9 +303,10 @@ public class SimpleFacetsTest extends AbstractSolrTestCase {
   }

   public void testFacetMultiValued() {
-    doFacets("t_s");
-    doFacets("t_s", "facet.enum.cache.minDf", "2");
-    doFacets("t_s", "facet.enum.cache.minDf", "100");
+    doFacetPrefix("t_s", "facet.method","enum");
+    doFacetPrefix("t_s", "facet.method", "enum", "facet.enum.cache.minDf", "2");
+    doFacetPrefix("t_s", "facet.method", "enum", "facet.enum.cache.minDf", "100");
+    doFacetPrefix("t_s", "facet.method", "fc");
   }

   public void testFacetSingleValued() {
@@ -476,9 +477,10 @@ public class SimpleFacetsTest extends AbstractSolrTestCase {


   public void testFacetPrefixMultiValued() {
-    doFacetPrefix("t_s");
-    doFacetPrefix("t_s", "facet.enum.cache.minDf", "3");
-    doFacetPrefix("t_s", "facet.enum.cache.minDf", "100");
+    doFacetPrefix("t_s", "facet.method","enum");
+    doFacetPrefix("t_s", "facet.method", "enum", "facet.enum.cache.minDf", "3");
+    doFacetPrefix("t_s", "facet.method", "enum", "facet.enum.cache.minDf", "100");
+    doFacetPrefix("t_s", "facet.method", "fc");
   }

   public void testFacetPrefixSingleValued() {
@@ -0,0 +1,242 @@
/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.solr.request;

import org.apache.lucene.index.Term;
import org.apache.solr.util.AbstractSolrTestCase;

import java.util.Random;

/**
 * @version $Id$
 */
public class TestFaceting extends AbstractSolrTestCase {

  public String getSchemaFile() { return "schema11.xml"; }
  public String getSolrConfigFile() { return "solrconfig.xml"; }

  public void setUp() throws Exception {
    super.setUp();
  }
  public void tearDown() throws Exception {
    close();
    super.tearDown();
  }

  String t(int tnum) {
    return String.format("%08d", tnum);
  }

  void createIndex(int nTerms) {
    assertU(delQ("*:*"));
    for (int i=0; i<nTerms; i++) {
      assertU(adoc("id", Float.toString(i), proto.field(), t(i) ));
    }
    assertU(optimize()); // squeeze out any possible deleted docs
  }

  Term proto = new Term("field_s","");
  SolrQueryRequest req; // used to get a searcher
  void close() {
    if (req!=null) req.close();
    req = null;
  }

  void doTermEnum(int size) throws Exception {
    close();
    createIndex(size);
    req = lrf.makeRequest("q","*:*");

    TermIndex ti = new TermIndex(proto.field());
    NumberedTermEnum te = ti.getEnumerator(req.getSearcher().getReader());

    // iterate through first
    while(te.term() != null) te.next();
    assertEquals(size, te.getTermNumber());
    te.close();

    te = ti.getEnumerator(req.getSearcher().getReader());

    Random r = new Random(size);
    // test seeking by term string
    for (int i=0; i<size*2+10; i++) {
      int rnum = r.nextInt(size+2);
      String s = t(rnum);
      boolean b = te.skipTo(proto.createTerm(s));
      assertEquals(b, rnum < size);
      if (rnum < size) {
        assertEquals(rnum, te.pos);
        assertEquals(s, te.term().text());
      } else {
        assertEquals(null, te.term());
        assertEquals(size, te.getTermNumber());
      }
    }

    // test seeking before term
    assertEquals(size>0, te.skipTo(proto.createTerm("000")));
    assertEquals(0, te.getTermNumber());
    if (size>0) {
      assertEquals(t(0), te.term().text());
    } else {
      assertEquals(null, te.term());
    }

    if (size>0) {
      // test seeking by term number
      for (int i=0; i<size*2+10; i++) {
        int rnum = r.nextInt(size);
        String s = t(rnum);
        boolean b = te.skipTo(rnum);
        assertEquals(true, b);
        assertEquals(rnum, te.pos);
        assertEquals(s, te.term().text());
      }
    }
  }

  public void testTermEnum() throws Exception {
    doTermEnum(0);
    doTermEnum(1);
    doTermEnum(TermIndex.interval - 1);  // test boundaries around the block size
    doTermEnum(TermIndex.interval);
    doTermEnum(TermIndex.interval + 1);
    doTermEnum(TermIndex.interval * 2 + 2);
    // doTermEnum(TermIndex.interval * 3 + 3);
  }

  public void testFacets() throws Exception {
    StringBuilder sb = new StringBuilder();

    // go over 4096 to test some of the buffer resizing
    for (int i=0; i<5000; i++) {
      sb.append(t(i));
      sb.append(' ');
    }

    assertU(adoc("id", "1", "many_ws", sb.toString()));
    assertU(commit());

    assertQ("check many tokens",
            req("q", "id:1","indent","true"
                ,"facet", "true", "facet.method","fc"
                ,"facet.field", "many_ws"
                ,"facet.limit", "-1"
                )
            ,"*[count(//lst[@name='many_ws']/int)=5000]"
            ,"//lst[@name='many_ws']/int[@name='" + t(0) + "'][.='1']"
            ,"//lst[@name='many_ws']/int[@name='" + t(1) + "'][.='1']"
            ,"//lst[@name='many_ws']/int[@name='" + t(2) + "'][.='1']"
            ,"//lst[@name='many_ws']/int[@name='" + t(3) + "'][.='1']"
            ,"//lst[@name='many_ws']/int[@name='" + t(4) + "'][.='1']"
            ,"//lst[@name='many_ws']/int[@name='" + t(5) + "'][.='1']"
            ,"//lst[@name='many_ws']/int[@name='" + t(4092) + "'][.='1']"
            ,"//lst[@name='many_ws']/int[@name='" + t(4093) + "'][.='1']"
            ,"//lst[@name='many_ws']/int[@name='" + t(4094) + "'][.='1']"
            ,"//lst[@name='many_ws']/int[@name='" + t(4095) + "'][.='1']"
            ,"//lst[@name='many_ws']/int[@name='" + t(4096) + "'][.='1']"
            ,"//lst[@name='many_ws']/int[@name='" + t(4097) + "'][.='1']"
            ,"//lst[@name='many_ws']/int[@name='" + t(4098) + "'][.='1']"
            ,"//lst[@name='many_ws']/int[@name='" + t(4090) + "'][.='1']"
            ,"//lst[@name='many_ws']/int[@name='" + t(4999) + "'][.='1']"
            );

    // test gaps that take more than one byte
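    // (for illustration: a gap like 0 -> 150 yields a delta of 152, which no longer fits
    // in a single 7-bit vInt byte, exercising the multi-byte decode path)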
    sb = new StringBuilder();
    sb.append(t(0)).append(' ');
    sb.append(t(150)).append(' ');
    sb.append(t(301)).append(' ');
    sb.append(t(453)).append(' ');
    sb.append(t(606)).append(' ');
    sb.append(t(1000)).append(' ');
    sb.append(t(2010)).append(' ');
    sb.append(t(3050)).append(' ');
    sb.append(t(4999)).append(' ');
    assertU(adoc("id", "2", "many_ws", sb.toString()));
    assertQ("check many tokens",
            req("q", "id:1","indent","true"
                ,"facet", "true", "facet.method","fc"
                ,"facet.field", "many_ws"
                ,"facet.limit", "-1"
                )
            ,"*[count(//lst[@name='many_ws']/int)=5000]"
            ,"//lst[@name='many_ws']/int[@name='" + t(0) + "'][.='1']"
            ,"//lst[@name='many_ws']/int[@name='" + t(150) + "'][.='1']"
            ,"//lst[@name='many_ws']/int[@name='" + t(301) + "'][.='1']"
            ,"//lst[@name='many_ws']/int[@name='" + t(453) + "'][.='1']"
            ,"//lst[@name='many_ws']/int[@name='" + t(606) + "'][.='1']"
            ,"//lst[@name='many_ws']/int[@name='" + t(1000) + "'][.='1']"
            ,"//lst[@name='many_ws']/int[@name='" + t(2010) + "'][.='1']"
            ,"//lst[@name='many_ws']/int[@name='" + t(3050) + "'][.='1']"
            ,"//lst[@name='many_ws']/int[@name='" + t(4999) + "'][.='1']"
            );
  }

  public void testRegularBig() throws Exception {
    StringBuilder sb = new StringBuilder();

    // go over 4096 to test some of the buffer resizing
    int nTerms=7;
    for (int i=0; i<nTerms; i++) {
      sb.append(t(i));
      sb.append(' ');
    }
    String many_ws = sb.toString();

    int i1=1000000;

    // int iter=65536+10;
    int iter=1000;

    for (int i=0; i<iter; i++) {
      // assertU(adoc("id", t(i), "many_ws", many_ws + t(i1+i) + " " + t(i1*2+i)));
      assertU(adoc("id", t(i), "many_ws", t(i1+i) + " " + t(i1*2+i)));
    }
    assertU(commit());

    for (int i=0; i<iter; i+=iter/10) {
      assertQ("check many tokens",
              req("q", "id:"+t(i),"indent","true"
                  ,"facet", "true", "facet.method","fc"
                  ,"facet.field", "many_ws"
                  ,"facet.limit", "-1"
                  ,"facet.mincount", "1"
                  )
              ,"*[count(//lst[@name='many_ws']/int)=" + 2 + "]"
              ,"//lst[@name='many_ws']/int[@name='" + t(i1+i) + "'][.='1']"
              ,"//lst[@name='many_ws']/int[@name='" + t(i1*2+i) + "'][.='1']"
              );
    }

    int i=iter-1;
    assertQ("check many tokens",
            req("q", "id:"+t(i),"indent","true"
                ,"facet", "true", "facet.method","fc"
                ,"facet.field", "many_ws"
                ,"facet.limit", "-1"
                ,"facet.mincount", "1"
                )
            ,"*[count(//lst[@name='many_ws']/int)=" + 2 + "]"
            ,"//lst[@name='many_ws']/int[@name='" + t(i1+i) + "'][.='1']"
            ,"//lst[@name='many_ws']/int[@name='" + t(i1*2+i) + "'][.='1']"
            );
  }

}
@@ -285,6 +285,7 @@
    <dynamicField name="*_t" type="text" indexed="true" stored="true"/>
    <dynamicField name="*_b" type="boolean" indexed="true" stored="true"/>
    <dynamicField name="*_dt" type="date" indexed="true" stored="true"/>
+   <dynamicField name="*_ws" type="text_ws" indexed="true" stored="true"/>

    <dynamicField name="*_extf" type="file"/>