mirror of https://github.com/apache/lucene.git
SOLR-1177 -- Distributed Search support for TermsComponent
git-svn-id: https://svn.apache.org/repos/asf/lucene/solr/trunk@890199 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
6e9af5f74d
commit
df3fe93d71
|
@ -62,6 +62,8 @@ New Features
|
|||
|
||||
* SOLR-1139: Add TermsComponent Query and Response Support in SolrJ (Matt Weber via shalin)
|
||||
|
||||
* SOLR-1177: Distributed Search support for TermsComponent (Matt Weber via shalin)
|
||||
|
||||
Optimizations
|
||||
----------------------
|
||||
|
||||
|
|
|
@ -45,6 +45,7 @@ public class ResponseBuilder
|
|||
public boolean doHighlights;
|
||||
public boolean doFacets;
|
||||
public boolean doStats;
|
||||
public boolean doTerms;
|
||||
|
||||
private boolean needDocList = false;
|
||||
private boolean needDocSet = false;
|
||||
|
@ -132,6 +133,7 @@ public class ResponseBuilder
|
|||
/* private... components that don't own these shouldn't use them */
|
||||
SolrDocumentList _responseDocs;
|
||||
StatsInfo _statsInfo;
|
||||
TermsComponent.TermsHelper _termsHelper;
|
||||
|
||||
/**
|
||||
* Utility function to add debugging info. This will make sure a valid
|
||||
|
|
|
@ -36,6 +36,7 @@ public class ShardRequest {
|
|||
public final static int PURPOSE_GET_HIGHLIGHTS = 0x80;
|
||||
public final static int PURPOSE_GET_DEBUG =0x100;
|
||||
public final static int PURPOSE_GET_STATS =0x200;
|
||||
public final static int PURPOSE_GET_TERMS =0x400;
|
||||
|
||||
public int purpose; // the purpose of this request
|
||||
|
||||
|
|
|
@ -20,15 +20,22 @@ import org.apache.lucene.index.Term;
|
|||
import org.apache.lucene.index.TermEnum;
|
||||
import org.apache.lucene.util.StringHelper;
|
||||
import org.apache.solr.common.SolrException;
|
||||
import org.apache.solr.common.params.SolrParams;
|
||||
import org.apache.solr.common.params.TermsParams;
|
||||
import org.apache.solr.common.params.*;
|
||||
import org.apache.solr.common.util.NamedList;
|
||||
import org.apache.solr.common.util.SimpleOrderedMap;
|
||||
import org.apache.solr.common.util.StrUtils;
|
||||
import org.apache.solr.schema.FieldType;
|
||||
import org.apache.solr.schema.StrField;
|
||||
import org.apache.solr.request.SimpleFacets.CountPair;
|
||||
import org.apache.solr.util.BoundedTreeSet;
|
||||
|
||||
import org.apache.solr.client.solrj.response.TermsResponse;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Arrays;
|
||||
import java.util.Comparator;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
/**
|
||||
|
@ -39,7 +46,25 @@ import java.util.regex.Pattern;
|
|||
*/
|
||||
public class TermsComponent extends SearchComponent {
|
||||
public static final int UNLIMITED_MAX_COUNT = -1;
|
||||
public static final String COMPONENT_NAME = "terms";
|
||||
|
||||
@Override
|
||||
public void prepare(ResponseBuilder rb) throws IOException {
|
||||
SolrParams params = rb.req.getParams();
|
||||
if (params.getBool(TermsParams.TERMS, false)) {
|
||||
rb.doTerms = true;
|
||||
}
|
||||
|
||||
// TODO: temporary... this should go in a different component.
|
||||
String shards = params.get(ShardParams.SHARDS);
|
||||
if (shards != null) {
|
||||
if (params.get(ShardParams.SHARDS_QT) == null) {
|
||||
throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "No shards.qt parameter specified");
|
||||
}
|
||||
List<String> lst = StrUtils.splitSmart(shards, ",", true);
|
||||
rb.shards = lst.toArray(new String[lst.size()]);
|
||||
}
|
||||
}
|
||||
|
||||
public void process(ResponseBuilder rb) throws IOException {
|
||||
SolrParams params = rb.req.getParams();
|
||||
|
@ -169,8 +194,232 @@ public class TermsComponent extends SearchComponent {
|
|||
return flags;
|
||||
}
|
||||
|
||||
public void prepare(ResponseBuilder rb) throws IOException {
|
||||
//nothing to do
|
||||
@Override
|
||||
public int distributedProcess(ResponseBuilder rb) throws IOException {
|
||||
if (!rb.doTerms) {
|
||||
return ResponseBuilder.STAGE_DONE;
|
||||
}
|
||||
|
||||
if (rb.stage == ResponseBuilder.STAGE_EXECUTE_QUERY) {
|
||||
TermsHelper th = rb._termsHelper;
|
||||
if (th == null) {
|
||||
th = rb._termsHelper = new TermsHelper();
|
||||
th.init(rb.req.getParams());
|
||||
}
|
||||
ShardRequest sreq = createShardQuery(rb.req.getParams());
|
||||
rb.addRequest(this, sreq);
|
||||
}
|
||||
|
||||
if (rb.stage < ResponseBuilder.STAGE_EXECUTE_QUERY) {
|
||||
return ResponseBuilder.STAGE_EXECUTE_QUERY;
|
||||
} else {
|
||||
return ResponseBuilder.STAGE_DONE;
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void handleResponses(ResponseBuilder rb, ShardRequest sreq) {
|
||||
if (!rb.doTerms || (sreq.purpose & ShardRequest.PURPOSE_GET_TERMS) == 0) {
|
||||
return;
|
||||
}
|
||||
TermsHelper th = rb._termsHelper;
|
||||
if (th != null) {
|
||||
for (ShardResponse srsp : sreq.responses) {
|
||||
th.parse((NamedList) srsp.getSolrResponse().getResponse().get("terms"));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void finishStage(ResponseBuilder rb) {
|
||||
if (!rb.doTerms || rb.stage != ResponseBuilder.STAGE_EXECUTE_QUERY) {
|
||||
return;
|
||||
}
|
||||
|
||||
TermsHelper ti = rb._termsHelper;
|
||||
NamedList terms = ti.buildResponse();
|
||||
|
||||
rb.rsp.add("terms", terms);
|
||||
rb._termsHelper = null;
|
||||
}
|
||||
|
||||
private ShardRequest createShardQuery(SolrParams params) {
|
||||
ShardRequest sreq = new ShardRequest();
|
||||
sreq.purpose = ShardRequest.PURPOSE_GET_TERMS;
|
||||
|
||||
// base shard request on original parameters
|
||||
sreq.params = new ModifiableSolrParams(params);
|
||||
|
||||
// don't pass through the shards param
|
||||
sreq.params.remove(ShardParams.SHARDS);
|
||||
|
||||
// remove any limits for shards, we want them to return all possible
|
||||
// responses
|
||||
// we want this so we can calculate the correct counts
|
||||
// dont sort by count to avoid that unnecessary overhead on the shards
|
||||
sreq.params.remove(TermsParams.TERMS_MAXCOUNT);
|
||||
sreq.params.remove(TermsParams.TERMS_MINCOUNT);
|
||||
sreq.params.set(TermsParams.TERMS_LIMIT, -1);
|
||||
sreq.params.set(TermsParams.TERMS_SORT, TermsParams.TERMS_SORT_INDEX);
|
||||
|
||||
// TODO: is there a better way to handle this?
|
||||
String qt = params.get(CommonParams.QT);
|
||||
if (qt != null) {
|
||||
sreq.params.add(CommonParams.QT, qt);
|
||||
}
|
||||
return sreq;
|
||||
}
|
||||
|
||||
public class TermsHelper {
|
||||
// map to store returned terms
|
||||
private HashMap<String, HashMap<String, TermsResponse.Term>> fieldmap;
|
||||
private SolrParams params;
|
||||
|
||||
public TermsHelper() {
|
||||
fieldmap = new HashMap<String, HashMap<String, TermsResponse.Term>>(5);
|
||||
}
|
||||
|
||||
public void init(SolrParams params) {
|
||||
this.params = params;
|
||||
String[] fields = params.getParams(TermsParams.TERMS_FIELD);
|
||||
if (fields != null) {
|
||||
for (String field : fields) {
|
||||
// TODO: not sure 128 is the best starting size
|
||||
// It use it because that is what is used for facets
|
||||
fieldmap.put(field, new HashMap<String, TermsResponse.Term>(128));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public void parse(NamedList terms) {
|
||||
// exit if there is no terms
|
||||
if (terms == null) {
|
||||
return;
|
||||
}
|
||||
|
||||
TermsResponse termsResponse = new TermsResponse(terms);
|
||||
|
||||
// loop though each field and add each term+freq to map
|
||||
for (String key : fieldmap.keySet()) {
|
||||
HashMap<String, TermsResponse.Term> termmap = fieldmap.get(key);
|
||||
List<TermsResponse.Term> termlist = termsResponse.getTerms(key);
|
||||
|
||||
// skip this field if there are no terms
|
||||
if (termlist == null) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// loop though each term
|
||||
for (TermsResponse.Term tc : termlist) {
|
||||
String term = tc.getTerm();
|
||||
if (termmap.containsKey(term)) {
|
||||
TermsResponse.Term oldtc = termmap.get(term);
|
||||
oldtc.addFrequency(tc.getFrequency());
|
||||
termmap.put(term, oldtc);
|
||||
} else {
|
||||
termmap.put(term, tc);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public NamedList buildResponse() {
|
||||
NamedList response = new SimpleOrderedMap();
|
||||
|
||||
// determine if we are going index or count sort
|
||||
boolean sort = !TermsParams.TERMS_SORT_INDEX.equals(params.get(
|
||||
TermsParams.TERMS_SORT, TermsParams.TERMS_SORT_COUNT));
|
||||
|
||||
// init minimum frequency
|
||||
long freqmin = 1;
|
||||
String s = params.get(TermsParams.TERMS_MINCOUNT);
|
||||
if (s != null) freqmin = Long.parseLong(s);
|
||||
|
||||
// init maximum frequency, default to max int
|
||||
long freqmax = -1;
|
||||
s = params.get(TermsParams.TERMS_MAXCOUNT);
|
||||
if (s != null) freqmax = Long.parseLong(s);
|
||||
if (freqmax < 0) {
|
||||
freqmax = Long.MAX_VALUE;
|
||||
}
|
||||
|
||||
// init limit, default to max int
|
||||
long limit = 10;
|
||||
s = params.get(TermsParams.TERMS_LIMIT);
|
||||
if (s != null) limit = Long.parseLong(s);
|
||||
if (limit < 0) {
|
||||
limit = Long.MAX_VALUE;
|
||||
}
|
||||
|
||||
// loop though each field we want terms from
|
||||
for (String key : fieldmap.keySet()) {
|
||||
NamedList fieldterms = new SimpleOrderedMap();
|
||||
TermsResponse.Term[] data = null;
|
||||
if (sort) {
|
||||
data = getCountSorted(fieldmap.get(key));
|
||||
} else {
|
||||
data = getLexSorted(fieldmap.get(key));
|
||||
}
|
||||
|
||||
// loop though each term until we hit limit
|
||||
int cnt = 0;
|
||||
for (TermsResponse.Term tc : data) {
|
||||
if (tc.getFrequency() >= freqmin && tc.getFrequency() <= freqmax) {
|
||||
fieldterms.add(tc.getTerm(), num(tc.getFrequency()));
|
||||
cnt++;
|
||||
}
|
||||
|
||||
if (cnt >= limit) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
response.add(key, fieldterms);
|
||||
}
|
||||
|
||||
return response;
|
||||
}
|
||||
|
||||
// use <int> tags for smaller facet counts (better back compatibility)
|
||||
private Number num(long val) {
|
||||
if (val < Integer.MAX_VALUE) return (int) val;
|
||||
else return val;
|
||||
}
|
||||
|
||||
// based on code from facets
|
||||
public TermsResponse.Term[] getLexSorted(HashMap<String, TermsResponse.Term> data) {
|
||||
TermsResponse.Term[] arr = data.values().toArray(new TermsResponse.Term[data.size()]);
|
||||
|
||||
Arrays.sort(arr, new Comparator<TermsResponse.Term>() {
|
||||
public int compare(TermsResponse.Term o1, TermsResponse.Term o2) {
|
||||
return o1.getTerm().compareTo(o2.getTerm());
|
||||
}
|
||||
});
|
||||
|
||||
return arr;
|
||||
}
|
||||
|
||||
// based on code from facets
|
||||
public TermsResponse.Term[] getCountSorted(HashMap<String, TermsResponse.Term> data) {
|
||||
TermsResponse.Term[] arr = data.values().toArray(new TermsResponse.Term[data.size()]);
|
||||
|
||||
Arrays.sort(arr, new Comparator<TermsResponse.Term>() {
|
||||
public int compare(TermsResponse.Term o1, TermsResponse.Term o2) {
|
||||
long freq1 = o1.getFrequency();
|
||||
long freq2 = o2.getFrequency();
|
||||
|
||||
if (freq2 < freq1) {
|
||||
return -1;
|
||||
} else if (freq1 < freq2) {
|
||||
return 1;
|
||||
} else {
|
||||
return o1.getTerm().compareTo(o2.getTerm());
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
return arr;
|
||||
}
|
||||
}
|
||||
|
||||
public String getVersion() {
|
||||
|
|
|
@ -0,0 +1,36 @@
|
|||
package org.apache.solr.handler.component;
|
||||
|
||||
import org.apache.solr.BaseDistributedSearchTestCase;
|
||||
|
||||
/**
|
||||
* Test for TermsComponent distributed querying
|
||||
*
|
||||
* @version $Id$
|
||||
* @since solr 1.5
|
||||
*/
|
||||
public class DistributedTermsComponentTest extends BaseDistributedSearchTestCase {
|
||||
|
||||
@Override
|
||||
public void doTest() throws Exception {
|
||||
index(id, 18, "b_t", "snake spider shark snail slug seal");
|
||||
index(id, 19, "b_t", "snake spider shark snail slug");
|
||||
index(id, 20, "b_t", "snake spider shark snail");
|
||||
index(id, 21, "b_t", "snake spider shark");
|
||||
index(id, 22, "b_t", "snake spider");
|
||||
index(id, 23, "b_t", "snake");
|
||||
index(id, 24, "b_t", "ant zebra");
|
||||
index(id, 25, "b_t", "zebra");
|
||||
commit();
|
||||
|
||||
handle.clear();
|
||||
handle.put("QTime", SKIPVAL);
|
||||
|
||||
query("qt", "/terms", "shards.qt", "/terms", "terms", "true", "terms.fl", "b_t");
|
||||
query("qt", "/terms", "shards.qt", "/terms", "terms.limit", 5, "terms", "true", "terms.fl", "b_t", "terms.lower", "s");
|
||||
query("qt", "/terms", "shards.qt", "/terms", "terms.limit", 5, "terms", "true", "terms.fl", "b_t", "terms.prefix", "sn", "terms.lower", "sn");
|
||||
query("qt", "/terms", "shards.qt", "/terms", "terms.limit", 5, "terms", "true", "terms.fl", "b_t", "terms.prefix", "s", "terms.lower", "s", "terms.upper", "sn");
|
||||
query("qt", "/terms", "shards.qt", "/terms", "terms.limit", 5, "terms", "true", "terms.fl", "b_t", "terms.prefix", "s", "terms.lower", "s", "terms.sort", "index");
|
||||
query("qt", "/terms", "shards.qt", "/terms", "terms.limit", 5, "terms", "true", "terms.fl", "b_t", "terms.prefix", "s", "terms.lower", "s", "terms.upper", "sn", "terms.sort", "index");
|
||||
query("qt", "/terms", "shards.qt", "/terms", "terms", "true", "terms.fl", "b_t", "terms.sort", "index");
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue