SOLR-10349: Add totalTermFreq support to TermsComponent

TermsComponent only returns docFreq information per requested term.
This commit adds a terms.ttf parameter which, if set to true, returns
both docFreq and totalTermFreq statistics for each requested term.
This commit is contained in:
Shai Erera 2017-03-23 08:28:05 +02:00
parent 144091ad29
commit deddc9b5c8
7 changed files with 126 additions and 42 deletions

View File

@ -126,6 +126,8 @@ New Features
* SOLR-6736: Adding support for uploading zipped configsets using ConfigSets API (Varun Rajput, Ishan Chattopadhyaya,
Noble Paul, Anshum Gupta, Gregory Chanan)
* SOLR-10349: Add totalTermFreq support to TermsComponent. (Shai Erera)
Optimizations
----------------------

View File

@ -109,7 +109,8 @@ public class TermsComponent extends SearchComponent {
String termList = params.get(TermsParams.TERMS_LIST);
if (termList != null) {
fetchTerms(rb.req.getSearcher(), fields, termList, termsResult);
boolean includeTotalTermFreq = params.getBool(TermsParams.TERMS_TTF, false);
fetchTerms(rb.req.getSearcher(), fields, termList, includeTotalTermFreq, termsResult);
return;
}
@ -303,7 +304,7 @@ public class TermsComponent extends SearchComponent {
if (th != null) {
for (ShardResponse srsp : sreq.responses) {
@SuppressWarnings("unchecked")
NamedList<NamedList<Number>> terms = (NamedList<NamedList<Number>>) srsp.getSolrResponse().getResponse().get("terms");
NamedList<NamedList<Object>> terms = (NamedList<NamedList<Object>>) srsp.getSolrResponse().getResponse().get("terms");
th.parse(terms);
@ -376,7 +377,7 @@ public class TermsComponent extends SearchComponent {
}
}
public void parse(NamedList<NamedList<Number>> terms) {
public void parse(NamedList<NamedList<Object>> terms) {
// exit if there are no terms
if (terms == null) {
return;
@ -400,6 +401,7 @@ public class TermsComponent extends SearchComponent {
if (termmap.containsKey(term)) {
TermsResponse.Term oldtc = termmap.get(term);
oldtc.addFrequency(tc.getFrequency());
oldtc.addTotalTermFreq(tc.getTotalTermFreq());
termmap.put(term, oldtc);
} else {
termmap.put(term, tc);
@ -442,7 +444,7 @@ public class TermsComponent extends SearchComponent {
// loop through each field we want terms from
for (String key : fieldmap.keySet()) {
NamedList<Number> fieldterms = new SimpleOrderedMap<>();
NamedList<Object> fieldterms = new SimpleOrderedMap<>();
TermsResponse.Term[] data = null;
if (sort) {
data = getCountSorted(fieldmap.get(key));
@ -450,11 +452,19 @@ public class TermsComponent extends SearchComponent {
data = getLexSorted(fieldmap.get(key));
}
boolean includeTotalTermFreq = params.getBool(TermsParams.TERMS_TTF, false);
// loop through each term until we hit limit
int cnt = 0;
for (TermsResponse.Term tc : data) {
if (tc.getFrequency() >= freqmin && tc.getFrequency() <= freqmax) {
if (includeTotalTermFreq) {
NamedList<Number> termStats = new SimpleOrderedMap<>();
termStats.add("docFreq", tc.getFrequency());
termStats.add("totalTermFreq", tc.getTotalTermFreq());
fieldterms.add(tc.getTerm(), termStats);
} else {
fieldterms.add(tc.getTerm(), num(tc.getFrequency()));
}
cnt++;
}
@ -508,10 +518,9 @@ public class TermsComponent extends SearchComponent {
private void fetchTerms(SolrIndexSearcher indexSearcher,
String[] fields,
String termList,
boolean includeTotalTermFreq,
NamedList result) throws IOException {
NamedList termsMap = new SimpleOrderedMap();
List<LeafReaderContext> leaves = indexSearcher.getTopReaderContext().leaves();
String field = fields[0];
FieldType fieldType = indexSearcher.getSchema().getField(field).getType();
String[] splitTerms = termList.split(",");
@ -521,35 +530,43 @@ public class TermsComponent extends SearchComponent {
}
Term[] terms = new Term[splitTerms.length];
TermContext[] termContexts = new TermContext[terms.length];
for(int i=0; i<splitTerms.length; i++) {
terms[i] = new Term(field, fieldType.readableToIndexed(splitTerms[i]));
}
Arrays.sort(terms);
collectTermContext(indexSearcher.getTopReaderContext().reader(), leaves, termContexts, terms);
IndexReaderContext topReaderContext = indexSearcher.getTopReaderContext();
TermContext[] termContexts = new TermContext[terms.length];
collectTermContext(topReaderContext, termContexts, terms);
NamedList termsMap = new SimpleOrderedMap();
for (int i = 0; i < terms.length; i++) {
if (termContexts[i] != null) {
String outTerm = fieldType.indexedToReadable(terms[i].bytes().utf8ToString());
int docFreq = termContexts[i].docFreq();
if (!includeTotalTermFreq) {
termsMap.add(outTerm, docFreq);
} else {
long totalTermFreq = termContexts[i].totalTermFreq();
NamedList<Long> termStats = new SimpleOrderedMap<>();
termStats.add("docFreq", (long) docFreq);
termStats.add("totalTermFreq", totalTermFreq);
termsMap.add(outTerm, termStats);
}
}
}
result.add(field, termsMap);
}
private void collectTermContext(IndexReader reader,
List<LeafReaderContext> leaves, TermContext[] contextArray,
Term[] queryTerms) throws IOException {
private void collectTermContext(IndexReaderContext topReaderContext, TermContext[] contextArray, Term[] queryTerms)
throws IOException {
TermsEnum termsEnum = null;
for (LeafReaderContext context : leaves) {
for (LeafReaderContext context : topReaderContext.leaves()) {
final Fields fields = context.reader().fields();
for (int i = 0; i < queryTerms.length; i++) {
Term term = queryTerms[i];
TermContext termContext = contextArray[i];
final Terms terms = fields.terms(term.field());
if (terms == null) {
// field does not exist
@ -559,18 +576,15 @@ public class TermsComponent extends SearchComponent {
assert termsEnum != null;
if (termsEnum == TermsEnum.EMPTY) continue;
TermContext termContext = contextArray[i];
if (termsEnum.seekExact(term.bytes())) {
if (termContext == null) {
contextArray[i] = new TermContext(reader.getContext(),
termsEnum.termState(), context.ord, termsEnum.docFreq(),
termsEnum.totalTermFreq());
} else {
termContext.register(termsEnum.termState(), context.ord,
termsEnum.docFreq(), termsEnum.totalTermFreq());
termContext = new TermContext(topReaderContext);
contextArray[i] = termContext;
}
termContext.accumulateStatistics(termsEnum.docFreq(), termsEnum.totalTermFreq());
}
}
}
}

View File

@ -52,7 +52,6 @@ public class DistributedTermsComponentTest extends BaseDistributedSearchTestCase
query("qt", "/terms", "shards.qt", "/terms", "terms", "true", "terms.fl", "b_t", "terms.list", "snake, zebra, ant, bad");
query("qt", "/terms", "shards.qt", "/terms", "terms", "true", "terms.fl", "foo_i", "terms.list", "2, 3, 1");
query("qt", "/terms", "shards.qt", "/terms", "terms", "true", "terms.fl", "foo_i", "terms.stats", "true","terms.list", "2, 3, 1");
query("qt", "/terms", "shards.qt", "/terms", "terms", "true", "terms.fl", "b_t", "terms.list", "snake, zebra", "terms.ttf", "true");
}
}

View File

@ -18,6 +18,7 @@ package org.apache.solr.handler.component;
import org.apache.solr.SolrTestCaseJ4;
import org.apache.solr.common.params.ModifiableSolrParams;
import org.apache.solr.common.params.TermsParams;
import org.apache.solr.request.SolrQueryRequest;
import org.junit.BeforeClass;
import org.junit.Test;
@ -313,4 +314,41 @@ public class TermsComponentTest extends SolrTestCaseJ4 {
,"count(//lst[@name='standardfilt']/*)=3"
);
}
/**
 * Requests four terms of the {@code standardfilt} field through {@code terms.list} with
 * {@code terms.ttf=true} and checks that each one is returned as a nested list holding
 * both a {@code docFreq} and a {@code totalTermFreq} value.
 */
@Test
public void testDocFreqAndTotalTermFreq() throws Exception {
  SolrQueryRequest request = req(
      "indent", "true",
      "qt", "/terms",
      "terms", "true",
      "terms.fl", "standardfilt",
      "terms.ttf", "true",
      "terms.list", "snake,spider,shark,ddddd");

  // One child element per requested term, then a docFreq/totalTermFreq pair for each term.
  String[] expectedXpaths = {
      "count(//lst[@name='standardfilt']/*)=4",
      "//lst[@name='standardfilt']/lst[@name='ddddd']/long[@name='docFreq'][.='4']",
      "//lst[@name='standardfilt']/lst[@name='ddddd']/long[@name='totalTermFreq'][.='4']",
      "//lst[@name='standardfilt']/lst[@name='shark']/long[@name='docFreq'][.='2']",
      "//lst[@name='standardfilt']/lst[@name='shark']/long[@name='totalTermFreq'][.='2']",
      "//lst[@name='standardfilt']/lst[@name='snake']/long[@name='docFreq'][.='3']",
      "//lst[@name='standardfilt']/lst[@name='snake']/long[@name='totalTermFreq'][.='3']",
      "//lst[@name='standardfilt']/lst[@name='spider']/long[@name='docFreq'][.='1']",
      "//lst[@name='standardfilt']/lst[@name='spider']/long[@name='totalTermFreq'][.='1']"
  };
  assertQ(request, expectedXpaths);
}
/**
 * Requests the terms {@code boo} and {@code snake} with {@code terms.ttf=true} and checks
 * that the response holds exactly one entry, the statistics for {@code snake}.
 */
@Test
public void testDocFreqAndTotalTermFreqForNonExistingTerm() throws Exception {
  SolrQueryRequest request = req(
      "indent", "true",
      "qt", "/terms",
      "terms", "true",
      "terms.fl", "standardfilt",
      "terms.ttf", "true",
      "terms.list", "boo,snake");

  // Only a single child is expected under the field, and it must be "snake".
  String[] expectedXpaths = {
      "count(//lst[@name='standardfilt']/*)=1",
      "//lst[@name='standardfilt']/lst[@name='snake']/long[@name='docFreq'][.='3']",
      "//lst[@name='standardfilt']/lst[@name='snake']/long[@name='totalTermFreq'][.='3']"
  };
  assertQ(request, expectedXpaths);
}
}

View File

@ -50,7 +50,7 @@ public class QueryResponse extends SolrResponseBase
private List<NamedList<Object>> _clusterInfo = null;
private Map<String,NamedList<Object>> _suggestInfo = null;
private NamedList<Object> _statsInfo = null;
private NamedList<NamedList<Number>> _termsInfo = null;
private NamedList<NamedList<Object>> _termsInfo = null;
private NamedList<SolrDocumentList> _moreLikeThisInfo = null;
private String _cursorMarkNext = null;
@ -166,7 +166,7 @@ public class QueryResponse extends SolrResponseBase
extractStatsInfo( _statsInfo );
}
else if ( "terms".equals( n ) ) {
_termsInfo = (NamedList<NamedList<Number>>) res.getVal( i );
_termsInfo = (NamedList<NamedList<Object>>) res.getVal( i );
extractTermsInfo( _termsInfo );
}
else if ( "moreLikeThis".equals( n ) ) {
@ -191,7 +191,7 @@ public class QueryResponse extends SolrResponseBase
_suggestResponse = new SuggesterResponse(suggestInfo);
}
private void extractTermsInfo(NamedList<NamedList<Number>> termsInfo) {
private void extractTermsInfo(NamedList<NamedList<Object>> termsInfo) {
_termsResponse = new TermsResponse(termsInfo);
}

View File

@ -28,14 +28,23 @@ import java.util.Map;
public class TermsResponse {
private Map<String, List<Term>> termMap = new HashMap<>();
public TermsResponse(NamedList<NamedList<Number>> termsInfo) {
public TermsResponse(NamedList<NamedList<Object>> termsInfo) {
for (int i = 0; i < termsInfo.size(); i++) {
String fieldName = termsInfo.getName(i);
List<Term> itemList = new ArrayList<>();
NamedList<Number> items = termsInfo.getVal(i);
NamedList<Object> items = termsInfo.getVal(i);
for (int j = 0; j < items.size(); j++) {
Term t = new Term(items.getName(j), items.getVal(j).longValue());
String term = items.getName(j);
Object val = items.getVal(j);
Term t;
if (val instanceof NamedList) {
@SuppressWarnings("unchecked")
NamedList<Number> termStats = (NamedList<Number>) val;
t = new Term(term, termStats.get("docFreq").longValue(), termStats.get("totalTermFreq").longValue());
} else {
t = new Term(term, ((Number) val).longValue());
}
itemList.add(t);
}
@ -59,10 +68,16 @@ public class TermsResponse {
public static class Term {
private String term;
private long frequency;
private long totalTermFreq;
/**
 * Creates a term that carries only a frequency; the total term frequency is initialised to 0.
 *
 * @param term the term text
 * @param frequency the frequency to record for the term
 */
public Term(String term, long frequency) {
  this(term, frequency, 0L);
}
/**
 * Creates a term that carries both a frequency and a total term frequency.
 *
 * @param term the term text
 * @param frequency the frequency to record for the term
 * @param totalTermFreq the total term frequency to record for the term
 */
public Term(String term, long frequency, long totalTermFreq) {
  this.totalTermFreq = totalTermFreq;
  this.frequency = frequency;
  this.term = term;
}
public String getTerm() {
@ -84,5 +99,17 @@ public class TermsResponse {
/**
 * Adds the given amount to this term's frequency.
 *
 * @param frequency the amount to add to the current frequency
 */
public void addFrequency(long frequency) {
  this.frequency = this.frequency + frequency;
}
/**
 * Returns the total term frequency currently stored for this term.
 *
 * @return the stored total term frequency
 */
public long getTotalTermFreq() {
  return this.totalTermFreq;
}
/**
 * Replaces the total term frequency stored for this term.
 *
 * @param totalTermFreq the new total term frequency
 */
public void setTotalTermFreq(long totalTermFreq) {
  final long replacement = totalTermFreq;
  this.totalTermFreq = replacement;
}
/**
 * Adds the given amount to this term's total term frequency.
 *
 * @param totalTermFreq the amount to add to the current total term frequency
 */
public void addTotalTermFreq(long totalTermFreq) {
  this.totalTermFreq = this.totalTermFreq + totalTermFreq;
}
}
}

View File

@ -42,16 +42,19 @@ public interface TermsParams {
/**
* Optional. The list of terms to be retrieved.
*
*/
public static final String TERMS_LIST = TERMS_PREFIX + "list";
/**
* Optional. The list of terms to be retrieved.
*
* Optional. If true, also returns index-level statistics, such as numDocs.
*/
public static final String TERMS_STATS = TERMS_PREFIX + "stats";
/**
 * Optional. If true, also returns terms' total term frequency: each term is then reported
 * with both a <code>docFreq</code> and a <code>totalTermFreq</code> value rather than a
 * single count.
 */
public static final String TERMS_TTF = TERMS_PREFIX + "ttf";
/**
* Optional. The lower bound term to start at. The TermEnum will start at the next term after this term in the dictionary.
*
@ -111,6 +114,7 @@ public interface TermsParams {
* Optional. The minimum value of docFreq to be returned. 1 by default
*/
public static final String TERMS_MINCOUNT = TERMS_PREFIX + "mincount";
/**
* Optional. The maximum value of docFreq to be returned. -1 by default means no boundary
*/