SOLR-10349: Add totalTermFreq support to TermsComponent

TermsComponent currently returns only docFreq information for each
requested term. This commit adds a terms.ttf parameter which, when set
to true, returns both docFreq and totalTermFreq statistics for each
requested term.
This commit is contained in:
Shai Erera 2017-03-23 08:28:05 +02:00
parent 144091ad29
commit deddc9b5c8
7 changed files with 126 additions and 42 deletions

View File

@ -126,6 +126,8 @@ New Features
* SOLR-6736: Adding support for uploading zipped configsets using ConfigSets API (Varun Rajput, Ishan Chattopadhyaya, * SOLR-6736: Adding support for uploading zipped configsets using ConfigSets API (Varun Rajput, Ishan Chattopadhyaya,
Noble Paul, Anshum Gupta, Gregory Chanan) Noble Paul, Anshum Gupta, Gregory Chanan)
* SOLR-10349: Add totalTermFreq support to TermsComponent. (Shai Erera)
Optimizations Optimizations
---------------------- ----------------------

View File

@ -109,7 +109,8 @@ public class TermsComponent extends SearchComponent {
String termList = params.get(TermsParams.TERMS_LIST); String termList = params.get(TermsParams.TERMS_LIST);
if (termList != null) { if (termList != null) {
fetchTerms(rb.req.getSearcher(), fields, termList, termsResult); boolean includeTotalTermFreq = params.getBool(TermsParams.TERMS_TTF, false);
fetchTerms(rb.req.getSearcher(), fields, termList, includeTotalTermFreq, termsResult);
return; return;
} }
@ -303,7 +304,7 @@ public class TermsComponent extends SearchComponent {
if (th != null) { if (th != null) {
for (ShardResponse srsp : sreq.responses) { for (ShardResponse srsp : sreq.responses) {
@SuppressWarnings("unchecked") @SuppressWarnings("unchecked")
NamedList<NamedList<Number>> terms = (NamedList<NamedList<Number>>) srsp.getSolrResponse().getResponse().get("terms"); NamedList<NamedList<Object>> terms = (NamedList<NamedList<Object>>) srsp.getSolrResponse().getResponse().get("terms");
th.parse(terms); th.parse(terms);
@ -376,7 +377,7 @@ public class TermsComponent extends SearchComponent {
} }
} }
public void parse(NamedList<NamedList<Number>> terms) { public void parse(NamedList<NamedList<Object>> terms) {
// exit if there is no terms // exit if there is no terms
if (terms == null) { if (terms == null) {
return; return;
@ -400,6 +401,7 @@ public class TermsComponent extends SearchComponent {
if (termmap.containsKey(term)) { if (termmap.containsKey(term)) {
TermsResponse.Term oldtc = termmap.get(term); TermsResponse.Term oldtc = termmap.get(term);
oldtc.addFrequency(tc.getFrequency()); oldtc.addFrequency(tc.getFrequency());
oldtc.addTotalTermFreq(tc.getTotalTermFreq());
termmap.put(term, oldtc); termmap.put(term, oldtc);
} else { } else {
termmap.put(term, tc); termmap.put(term, tc);
@ -442,7 +444,7 @@ public class TermsComponent extends SearchComponent {
// loop though each field we want terms from // loop though each field we want terms from
for (String key : fieldmap.keySet()) { for (String key : fieldmap.keySet()) {
NamedList<Number> fieldterms = new SimpleOrderedMap<>(); NamedList<Object> fieldterms = new SimpleOrderedMap<>();
TermsResponse.Term[] data = null; TermsResponse.Term[] data = null;
if (sort) { if (sort) {
data = getCountSorted(fieldmap.get(key)); data = getCountSorted(fieldmap.get(key));
@ -450,11 +452,19 @@ public class TermsComponent extends SearchComponent {
data = getLexSorted(fieldmap.get(key)); data = getLexSorted(fieldmap.get(key));
} }
boolean includeTotalTermFreq = params.getBool(TermsParams.TERMS_TTF, false);
// loop though each term until we hit limit // loop though each term until we hit limit
int cnt = 0; int cnt = 0;
for (TermsResponse.Term tc : data) { for (TermsResponse.Term tc : data) {
if (tc.getFrequency() >= freqmin && tc.getFrequency() <= freqmax) { if (tc.getFrequency() >= freqmin && tc.getFrequency() <= freqmax) {
if (includeTotalTermFreq) {
NamedList<Number> termStats = new SimpleOrderedMap<>();
termStats.add("docFreq", tc.getFrequency());
termStats.add("totalTermFreq", tc.getTotalTermFreq());
fieldterms.add(tc.getTerm(), termStats);
} else {
fieldterms.add(tc.getTerm(), num(tc.getFrequency())); fieldterms.add(tc.getTerm(), num(tc.getFrequency()));
}
cnt++; cnt++;
} }
@ -508,10 +518,9 @@ public class TermsComponent extends SearchComponent {
private void fetchTerms(SolrIndexSearcher indexSearcher, private void fetchTerms(SolrIndexSearcher indexSearcher,
String[] fields, String[] fields,
String termList, String termList,
boolean includeTotalTermFreq,
NamedList result) throws IOException { NamedList result) throws IOException {
NamedList termsMap = new SimpleOrderedMap();
List<LeafReaderContext> leaves = indexSearcher.getTopReaderContext().leaves();
String field = fields[0]; String field = fields[0];
FieldType fieldType = indexSearcher.getSchema().getField(field).getType(); FieldType fieldType = indexSearcher.getSchema().getField(field).getType();
String[] splitTerms = termList.split(","); String[] splitTerms = termList.split(",");
@ -521,35 +530,43 @@ public class TermsComponent extends SearchComponent {
} }
Term[] terms = new Term[splitTerms.length]; Term[] terms = new Term[splitTerms.length];
TermContext[] termContexts = new TermContext[terms.length];
for(int i=0; i<splitTerms.length; i++) { for(int i=0; i<splitTerms.length; i++) {
terms[i] = new Term(field, fieldType.readableToIndexed(splitTerms[i])); terms[i] = new Term(field, fieldType.readableToIndexed(splitTerms[i]));
} }
Arrays.sort(terms); Arrays.sort(terms);
collectTermContext(indexSearcher.getTopReaderContext().reader(), leaves, termContexts, terms); IndexReaderContext topReaderContext = indexSearcher.getTopReaderContext();
TermContext[] termContexts = new TermContext[terms.length];
collectTermContext(topReaderContext, termContexts, terms);
NamedList termsMap = new SimpleOrderedMap();
for (int i = 0; i < terms.length; i++) { for (int i = 0; i < terms.length; i++) {
if (termContexts[i] != null) { if (termContexts[i] != null) {
String outTerm = fieldType.indexedToReadable(terms[i].bytes().utf8ToString()); String outTerm = fieldType.indexedToReadable(terms[i].bytes().utf8ToString());
int docFreq = termContexts[i].docFreq(); int docFreq = termContexts[i].docFreq();
if (!includeTotalTermFreq) {
termsMap.add(outTerm, docFreq); termsMap.add(outTerm, docFreq);
} else {
long totalTermFreq = termContexts[i].totalTermFreq();
NamedList<Long> termStats = new SimpleOrderedMap<>();
termStats.add("docFreq", (long) docFreq);
termStats.add("totalTermFreq", totalTermFreq);
termsMap.add(outTerm, termStats);
}
} }
} }
result.add(field, termsMap); result.add(field, termsMap);
} }
private void collectTermContext(IndexReader reader, private void collectTermContext(IndexReaderContext topReaderContext, TermContext[] contextArray, Term[] queryTerms)
List<LeafReaderContext> leaves, TermContext[] contextArray, throws IOException {
Term[] queryTerms) throws IOException {
TermsEnum termsEnum = null; TermsEnum termsEnum = null;
for (LeafReaderContext context : leaves) { for (LeafReaderContext context : topReaderContext.leaves()) {
final Fields fields = context.reader().fields(); final Fields fields = context.reader().fields();
for (int i = 0; i < queryTerms.length; i++) { for (int i = 0; i < queryTerms.length; i++) {
Term term = queryTerms[i]; Term term = queryTerms[i];
TermContext termContext = contextArray[i];
final Terms terms = fields.terms(term.field()); final Terms terms = fields.terms(term.field());
if (terms == null) { if (terms == null) {
// field does not exist // field does not exist
@ -559,18 +576,15 @@ public class TermsComponent extends SearchComponent {
assert termsEnum != null; assert termsEnum != null;
if (termsEnum == TermsEnum.EMPTY) continue; if (termsEnum == TermsEnum.EMPTY) continue;
TermContext termContext = contextArray[i];
if (termsEnum.seekExact(term.bytes())) { if (termsEnum.seekExact(term.bytes())) {
if (termContext == null) { if (termContext == null) {
contextArray[i] = new TermContext(reader.getContext(), termContext = new TermContext(topReaderContext);
termsEnum.termState(), context.ord, termsEnum.docFreq(), contextArray[i] = termContext;
termsEnum.totalTermFreq());
} else {
termContext.register(termsEnum.termState(), context.ord,
termsEnum.docFreq(), termsEnum.totalTermFreq());
} }
termContext.accumulateStatistics(termsEnum.docFreq(), termsEnum.totalTermFreq());
} }
} }
} }
} }

View File

@ -52,7 +52,6 @@ public class DistributedTermsComponentTest extends BaseDistributedSearchTestCase
query("qt", "/terms", "shards.qt", "/terms", "terms", "true", "terms.fl", "b_t", "terms.list", "snake, zebra, ant, bad"); query("qt", "/terms", "shards.qt", "/terms", "terms", "true", "terms.fl", "b_t", "terms.list", "snake, zebra, ant, bad");
query("qt", "/terms", "shards.qt", "/terms", "terms", "true", "terms.fl", "foo_i", "terms.list", "2, 3, 1"); query("qt", "/terms", "shards.qt", "/terms", "terms", "true", "terms.fl", "foo_i", "terms.list", "2, 3, 1");
query("qt", "/terms", "shards.qt", "/terms", "terms", "true", "terms.fl", "foo_i", "terms.stats", "true","terms.list", "2, 3, 1"); query("qt", "/terms", "shards.qt", "/terms", "terms", "true", "terms.fl", "foo_i", "terms.stats", "true","terms.list", "2, 3, 1");
query("qt", "/terms", "shards.qt", "/terms", "terms", "true", "terms.fl", "b_t", "terms.list", "snake, zebra", "terms.ttf", "true");
} }
} }

View File

@ -18,6 +18,7 @@ package org.apache.solr.handler.component;
import org.apache.solr.SolrTestCaseJ4; import org.apache.solr.SolrTestCaseJ4;
import org.apache.solr.common.params.ModifiableSolrParams; import org.apache.solr.common.params.ModifiableSolrParams;
import org.apache.solr.common.params.TermsParams; import org.apache.solr.common.params.TermsParams;
import org.apache.solr.request.SolrQueryRequest;
import org.junit.BeforeClass; import org.junit.BeforeClass;
import org.junit.Test; import org.junit.Test;
@ -313,4 +314,41 @@ public class TermsComponentTest extends SolrTestCaseJ4 {
,"count(//lst[@name='standardfilt']/*)=3" ,"count(//lst[@name='standardfilt']/*)=3"
); );
} }
@Test
public void testDocFreqAndTotalTermFreq() throws Exception {
SolrQueryRequest req = req(
"indent","true",
"qt", "/terms",
"terms", "true",
"terms.fl", "standardfilt",
"terms.ttf", "true",
"terms.list", "snake,spider,shark,ddddd");
assertQ(req,
"count(//lst[@name='standardfilt']/*)=4",
"//lst[@name='standardfilt']/lst[@name='ddddd']/long[@name='docFreq'][.='4']",
"//lst[@name='standardfilt']/lst[@name='ddddd']/long[@name='totalTermFreq'][.='4']",
"//lst[@name='standardfilt']/lst[@name='shark']/long[@name='docFreq'][.='2']",
"//lst[@name='standardfilt']/lst[@name='shark']/long[@name='totalTermFreq'][.='2']",
"//lst[@name='standardfilt']/lst[@name='snake']/long[@name='docFreq'][.='3']",
"//lst[@name='standardfilt']/lst[@name='snake']/long[@name='totalTermFreq'][.='3']",
"//lst[@name='standardfilt']/lst[@name='spider']/long[@name='docFreq'][.='1']",
"//lst[@name='standardfilt']/lst[@name='spider']/long[@name='totalTermFreq'][.='1']");
}
@Test
public void testDocFreqAndTotalTermFreqForNonExistingTerm() throws Exception {
SolrQueryRequest req = req(
"indent","true",
"qt", "/terms",
"terms", "true",
"terms.fl", "standardfilt",
"terms.ttf", "true",
"terms.list", "boo,snake");
assertQ(req,
"count(//lst[@name='standardfilt']/*)=1",
"//lst[@name='standardfilt']/lst[@name='snake']/long[@name='docFreq'][.='3']",
"//lst[@name='standardfilt']/lst[@name='snake']/long[@name='totalTermFreq'][.='3']");
}
} }

View File

@ -50,7 +50,7 @@ public class QueryResponse extends SolrResponseBase
private List<NamedList<Object>> _clusterInfo = null; private List<NamedList<Object>> _clusterInfo = null;
private Map<String,NamedList<Object>> _suggestInfo = null; private Map<String,NamedList<Object>> _suggestInfo = null;
private NamedList<Object> _statsInfo = null; private NamedList<Object> _statsInfo = null;
private NamedList<NamedList<Number>> _termsInfo = null; private NamedList<NamedList<Object>> _termsInfo = null;
private NamedList<SolrDocumentList> _moreLikeThisInfo = null; private NamedList<SolrDocumentList> _moreLikeThisInfo = null;
private String _cursorMarkNext = null; private String _cursorMarkNext = null;
@ -166,7 +166,7 @@ public class QueryResponse extends SolrResponseBase
extractStatsInfo( _statsInfo ); extractStatsInfo( _statsInfo );
} }
else if ( "terms".equals( n ) ) { else if ( "terms".equals( n ) ) {
_termsInfo = (NamedList<NamedList<Number>>) res.getVal( i ); _termsInfo = (NamedList<NamedList<Object>>) res.getVal( i );
extractTermsInfo( _termsInfo ); extractTermsInfo( _termsInfo );
} }
else if ( "moreLikeThis".equals( n ) ) { else if ( "moreLikeThis".equals( n ) ) {
@ -191,7 +191,7 @@ public class QueryResponse extends SolrResponseBase
_suggestResponse = new SuggesterResponse(suggestInfo); _suggestResponse = new SuggesterResponse(suggestInfo);
} }
private void extractTermsInfo(NamedList<NamedList<Number>> termsInfo) { private void extractTermsInfo(NamedList<NamedList<Object>> termsInfo) {
_termsResponse = new TermsResponse(termsInfo); _termsResponse = new TermsResponse(termsInfo);
} }

View File

@ -28,14 +28,23 @@ import java.util.Map;
public class TermsResponse { public class TermsResponse {
private Map<String, List<Term>> termMap = new HashMap<>(); private Map<String, List<Term>> termMap = new HashMap<>();
public TermsResponse(NamedList<NamedList<Number>> termsInfo) { public TermsResponse(NamedList<NamedList<Object>> termsInfo) {
for (int i = 0; i < termsInfo.size(); i++) { for (int i = 0; i < termsInfo.size(); i++) {
String fieldName = termsInfo.getName(i); String fieldName = termsInfo.getName(i);
List<Term> itemList = new ArrayList<>(); List<Term> itemList = new ArrayList<>();
NamedList<Number> items = termsInfo.getVal(i); NamedList<Object> items = termsInfo.getVal(i);
for (int j = 0; j < items.size(); j++) { for (int j = 0; j < items.size(); j++) {
Term t = new Term(items.getName(j), items.getVal(j).longValue()); String term = items.getName(j);
Object val = items.getVal(j);
Term t;
if (val instanceof NamedList) {
@SuppressWarnings("unchecked")
NamedList<Number> termStats = (NamedList<Number>) val;
t = new Term(term, termStats.get("docFreq").longValue(), termStats.get("totalTermFreq").longValue());
} else {
t = new Term(term, ((Number) val).longValue());
}
itemList.add(t); itemList.add(t);
} }
@ -59,10 +68,16 @@ public class TermsResponse {
public static class Term { public static class Term {
private String term; private String term;
private long frequency; private long frequency;
private long totalTermFreq;
public Term(String term, long frequency) { public Term(String term, long frequency) {
this(term, frequency, 0);
}
public Term(String term, long frequency, long totalTermFreq) {
this.term = term; this.term = term;
this.frequency = frequency; this.frequency = frequency;
this.totalTermFreq = totalTermFreq;
} }
public String getTerm() { public String getTerm() {
@ -84,5 +99,17 @@ public class TermsResponse {
public void addFrequency(long frequency) { public void addFrequency(long frequency) {
this.frequency += frequency; this.frequency += frequency;
} }
public long getTotalTermFreq() {
return totalTermFreq;
}
public void setTotalTermFreq(long totalTermFreq) {
this.totalTermFreq = totalTermFreq;
}
public void addTotalTermFreq(long totalTermFreq) {
this.totalTermFreq += totalTermFreq;
}
} }
} }

View File

@ -42,16 +42,19 @@ public interface TermsParams {
/** /**
* Optional. The list of terms to be retrieved. * Optional. The list of terms to be retrieved.
*
*/ */
public static final String TERMS_LIST = TERMS_PREFIX + "list"; public static final String TERMS_LIST = TERMS_PREFIX + "list";
/** /**
* Optional. The list of terms to be retrieved. * Optional. If true, also returns index-level statistics, such as numDocs.
*
*/ */
public static final String TERMS_STATS = TERMS_PREFIX + "stats"; public static final String TERMS_STATS = TERMS_PREFIX + "stats";
/**
* Optional. If true, also returns terms' total term frequency.
*/
public static final String TERMS_TTF = TERMS_PREFIX + "ttf";
/** /**
* Optional. The lower bound term to start at. The TermEnum will start at the next term after this term in the dictionary. * Optional. The lower bound term to start at. The TermEnum will start at the next term after this term in the dictionary.
* *
@ -111,6 +114,7 @@ public interface TermsParams {
* Optional. The minimum value of docFreq to be returned. 1 by default * Optional. The minimum value of docFreq to be returned. 1 by default
*/ */
public static final String TERMS_MINCOUNT = TERMS_PREFIX + "mincount"; public static final String TERMS_MINCOUNT = TERMS_PREFIX + "mincount";
/** /**
* Optional. The maximum value of docFreq to be returned. -1 by default means no boundary * Optional. The maximum value of docFreq to be returned. -1 by default means no boundary
*/ */