mirror of https://github.com/apache/lucene.git
SOLR-10349: Add totalTermFreq support to TermsComponent
TermsComponent only returns docFreq information per requested term. This commit adds a terms.ttf parameter which, if set to true, returns both docFreq and totalTermFreq statistics for each requested term.
This commit is contained in:
parent
144091ad29
commit
deddc9b5c8
|
@ -126,6 +126,8 @@ New Features
|
|||
* SOLR-6736: Adding support for uploading zipped configsets using ConfigSets API (Varun Rajput, Ishan Chattopadhyaya,
|
||||
Noble Paul, Anshum Gupta, Gregory Chanan)
|
||||
|
||||
* SOLR-10349: Add totalTermFreq support to TermsComponent. (Shai Erera)
|
||||
|
||||
Optimizations
|
||||
----------------------
|
||||
|
||||
|
|
|
@ -109,7 +109,8 @@ public class TermsComponent extends SearchComponent {
|
|||
|
||||
String termList = params.get(TermsParams.TERMS_LIST);
|
||||
if (termList != null) {
|
||||
fetchTerms(rb.req.getSearcher(), fields, termList, termsResult);
|
||||
boolean includeTotalTermFreq = params.getBool(TermsParams.TERMS_TTF, false);
|
||||
fetchTerms(rb.req.getSearcher(), fields, termList, includeTotalTermFreq, termsResult);
|
||||
return;
|
||||
}
|
||||
|
||||
|
@ -303,7 +304,7 @@ public class TermsComponent extends SearchComponent {
|
|||
if (th != null) {
|
||||
for (ShardResponse srsp : sreq.responses) {
|
||||
@SuppressWarnings("unchecked")
|
||||
NamedList<NamedList<Number>> terms = (NamedList<NamedList<Number>>) srsp.getSolrResponse().getResponse().get("terms");
|
||||
NamedList<NamedList<Object>> terms = (NamedList<NamedList<Object>>) srsp.getSolrResponse().getResponse().get("terms");
|
||||
th.parse(terms);
|
||||
|
||||
|
||||
|
@ -376,7 +377,7 @@ public class TermsComponent extends SearchComponent {
|
|||
}
|
||||
}
|
||||
|
||||
public void parse(NamedList<NamedList<Number>> terms) {
|
||||
public void parse(NamedList<NamedList<Object>> terms) {
|
||||
// exit if there are no terms
|
||||
if (terms == null) {
|
||||
return;
|
||||
|
@ -400,6 +401,7 @@ public class TermsComponent extends SearchComponent {
|
|||
if (termmap.containsKey(term)) {
|
||||
TermsResponse.Term oldtc = termmap.get(term);
|
||||
oldtc.addFrequency(tc.getFrequency());
|
||||
oldtc.addTotalTermFreq(tc.getTotalTermFreq());
|
||||
termmap.put(term, oldtc);
|
||||
} else {
|
||||
termmap.put(term, tc);
|
||||
|
@ -442,7 +444,7 @@ public class TermsComponent extends SearchComponent {
|
|||
|
||||
// loop through each field we want terms from
|
||||
for (String key : fieldmap.keySet()) {
|
||||
NamedList<Number> fieldterms = new SimpleOrderedMap<>();
|
||||
NamedList<Object> fieldterms = new SimpleOrderedMap<>();
|
||||
TermsResponse.Term[] data = null;
|
||||
if (sort) {
|
||||
data = getCountSorted(fieldmap.get(key));
|
||||
|
@ -450,11 +452,19 @@ public class TermsComponent extends SearchComponent {
|
|||
data = getLexSorted(fieldmap.get(key));
|
||||
}
|
||||
|
||||
boolean includeTotalTermFreq = params.getBool(TermsParams.TERMS_TTF, false);
|
||||
// loop through each term until we hit the limit
|
||||
int cnt = 0;
|
||||
for (TermsResponse.Term tc : data) {
|
||||
if (tc.getFrequency() >= freqmin && tc.getFrequency() <= freqmax) {
|
||||
if (includeTotalTermFreq) {
|
||||
NamedList<Number> termStats = new SimpleOrderedMap<>();
|
||||
termStats.add("docFreq", tc.getFrequency());
|
||||
termStats.add("totalTermFreq", tc.getTotalTermFreq());
|
||||
fieldterms.add(tc.getTerm(), termStats);
|
||||
} else {
|
||||
fieldterms.add(tc.getTerm(), num(tc.getFrequency()));
|
||||
}
|
||||
cnt++;
|
||||
}
|
||||
|
||||
|
@ -508,10 +518,9 @@ public class TermsComponent extends SearchComponent {
|
|||
private void fetchTerms(SolrIndexSearcher indexSearcher,
|
||||
String[] fields,
|
||||
String termList,
|
||||
boolean includeTotalTermFreq,
|
||||
NamedList result) throws IOException {
|
||||
|
||||
NamedList termsMap = new SimpleOrderedMap();
|
||||
List<LeafReaderContext> leaves = indexSearcher.getTopReaderContext().leaves();
|
||||
String field = fields[0];
|
||||
FieldType fieldType = indexSearcher.getSchema().getField(field).getType();
|
||||
String[] splitTerms = termList.split(",");
|
||||
|
@ -521,35 +530,43 @@ public class TermsComponent extends SearchComponent {
|
|||
}
|
||||
|
||||
Term[] terms = new Term[splitTerms.length];
|
||||
TermContext[] termContexts = new TermContext[terms.length];
|
||||
for(int i=0; i<splitTerms.length; i++) {
|
||||
terms[i] = new Term(field, fieldType.readableToIndexed(splitTerms[i]));
|
||||
}
|
||||
|
||||
Arrays.sort(terms);
|
||||
|
||||
collectTermContext(indexSearcher.getTopReaderContext().reader(), leaves, termContexts, terms);
|
||||
IndexReaderContext topReaderContext = indexSearcher.getTopReaderContext();
|
||||
TermContext[] termContexts = new TermContext[terms.length];
|
||||
collectTermContext(topReaderContext, termContexts, terms);
|
||||
|
||||
NamedList termsMap = new SimpleOrderedMap();
|
||||
for (int i = 0; i < terms.length; i++) {
|
||||
if (termContexts[i] != null) {
|
||||
String outTerm = fieldType.indexedToReadable(terms[i].bytes().utf8ToString());
|
||||
int docFreq = termContexts[i].docFreq();
|
||||
if (!includeTotalTermFreq) {
|
||||
termsMap.add(outTerm, docFreq);
|
||||
} else {
|
||||
long totalTermFreq = termContexts[i].totalTermFreq();
|
||||
NamedList<Long> termStats = new SimpleOrderedMap<>();
|
||||
termStats.add("docFreq", (long) docFreq);
|
||||
termStats.add("totalTermFreq", totalTermFreq);
|
||||
termsMap.add(outTerm, termStats);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
result.add(field, termsMap);
|
||||
}
|
||||
|
||||
private void collectTermContext(IndexReader reader,
|
||||
List<LeafReaderContext> leaves, TermContext[] contextArray,
|
||||
Term[] queryTerms) throws IOException {
|
||||
private void collectTermContext(IndexReaderContext topReaderContext, TermContext[] contextArray, Term[] queryTerms)
|
||||
throws IOException {
|
||||
TermsEnum termsEnum = null;
|
||||
for (LeafReaderContext context : leaves) {
|
||||
for (LeafReaderContext context : topReaderContext.leaves()) {
|
||||
final Fields fields = context.reader().fields();
|
||||
for (int i = 0; i < queryTerms.length; i++) {
|
||||
Term term = queryTerms[i];
|
||||
TermContext termContext = contextArray[i];
|
||||
final Terms terms = fields.terms(term.field());
|
||||
if (terms == null) {
|
||||
// field does not exist
|
||||
|
@ -559,18 +576,15 @@ public class TermsComponent extends SearchComponent {
|
|||
assert termsEnum != null;
|
||||
|
||||
if (termsEnum == TermsEnum.EMPTY) continue;
|
||||
|
||||
TermContext termContext = contextArray[i];
|
||||
if (termsEnum.seekExact(term.bytes())) {
|
||||
if (termContext == null) {
|
||||
contextArray[i] = new TermContext(reader.getContext(),
|
||||
termsEnum.termState(), context.ord, termsEnum.docFreq(),
|
||||
termsEnum.totalTermFreq());
|
||||
} else {
|
||||
termContext.register(termsEnum.termState(), context.ord,
|
||||
termsEnum.docFreq(), termsEnum.totalTermFreq());
|
||||
termContext = new TermContext(topReaderContext);
|
||||
contextArray[i] = termContext;
|
||||
}
|
||||
|
||||
termContext.accumulateStatistics(termsEnum.docFreq(), termsEnum.totalTermFreq());
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -52,7 +52,6 @@ public class DistributedTermsComponentTest extends BaseDistributedSearchTestCase
|
|||
query("qt", "/terms", "shards.qt", "/terms", "terms", "true", "terms.fl", "b_t", "terms.list", "snake, zebra, ant, bad");
|
||||
query("qt", "/terms", "shards.qt", "/terms", "terms", "true", "terms.fl", "foo_i", "terms.list", "2, 3, 1");
|
||||
query("qt", "/terms", "shards.qt", "/terms", "terms", "true", "terms.fl", "foo_i", "terms.stats", "true","terms.list", "2, 3, 1");
|
||||
|
||||
|
||||
query("qt", "/terms", "shards.qt", "/terms", "terms", "true", "terms.fl", "b_t", "terms.list", "snake, zebra", "terms.ttf", "true");
|
||||
}
|
||||
}
|
||||
|
|
|
@ -18,6 +18,7 @@ package org.apache.solr.handler.component;
|
|||
import org.apache.solr.SolrTestCaseJ4;
|
||||
import org.apache.solr.common.params.ModifiableSolrParams;
|
||||
import org.apache.solr.common.params.TermsParams;
|
||||
import org.apache.solr.request.SolrQueryRequest;
|
||||
import org.junit.BeforeClass;
|
||||
import org.junit.Test;
|
||||
|
||||
|
@ -313,4 +314,41 @@ public class TermsComponentTest extends SolrTestCaseJ4 {
|
|||
,"count(//lst[@name='standardfilt']/*)=3"
|
||||
);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testDocFreqAndTotalTermFreq() throws Exception {
|
||||
SolrQueryRequest req = req(
|
||||
"indent","true",
|
||||
"qt", "/terms",
|
||||
"terms", "true",
|
||||
"terms.fl", "standardfilt",
|
||||
"terms.ttf", "true",
|
||||
"terms.list", "snake,spider,shark,ddddd");
|
||||
assertQ(req,
|
||||
"count(//lst[@name='standardfilt']/*)=4",
|
||||
"//lst[@name='standardfilt']/lst[@name='ddddd']/long[@name='docFreq'][.='4']",
|
||||
"//lst[@name='standardfilt']/lst[@name='ddddd']/long[@name='totalTermFreq'][.='4']",
|
||||
"//lst[@name='standardfilt']/lst[@name='shark']/long[@name='docFreq'][.='2']",
|
||||
"//lst[@name='standardfilt']/lst[@name='shark']/long[@name='totalTermFreq'][.='2']",
|
||||
"//lst[@name='standardfilt']/lst[@name='snake']/long[@name='docFreq'][.='3']",
|
||||
"//lst[@name='standardfilt']/lst[@name='snake']/long[@name='totalTermFreq'][.='3']",
|
||||
"//lst[@name='standardfilt']/lst[@name='spider']/long[@name='docFreq'][.='1']",
|
||||
"//lst[@name='standardfilt']/lst[@name='spider']/long[@name='totalTermFreq'][.='1']");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testDocFreqAndTotalTermFreqForNonExistingTerm() throws Exception {
|
||||
SolrQueryRequest req = req(
|
||||
"indent","true",
|
||||
"qt", "/terms",
|
||||
"terms", "true",
|
||||
"terms.fl", "standardfilt",
|
||||
"terms.ttf", "true",
|
||||
"terms.list", "boo,snake");
|
||||
assertQ(req,
|
||||
"count(//lst[@name='standardfilt']/*)=1",
|
||||
"//lst[@name='standardfilt']/lst[@name='snake']/long[@name='docFreq'][.='3']",
|
||||
"//lst[@name='standardfilt']/lst[@name='snake']/long[@name='totalTermFreq'][.='3']");
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -50,7 +50,7 @@ public class QueryResponse extends SolrResponseBase
|
|||
private List<NamedList<Object>> _clusterInfo = null;
|
||||
private Map<String,NamedList<Object>> _suggestInfo = null;
|
||||
private NamedList<Object> _statsInfo = null;
|
||||
private NamedList<NamedList<Number>> _termsInfo = null;
|
||||
private NamedList<NamedList<Object>> _termsInfo = null;
|
||||
private NamedList<SolrDocumentList> _moreLikeThisInfo = null;
|
||||
private String _cursorMarkNext = null;
|
||||
|
||||
|
@ -166,7 +166,7 @@ public class QueryResponse extends SolrResponseBase
|
|||
extractStatsInfo( _statsInfo );
|
||||
}
|
||||
else if ( "terms".equals( n ) ) {
|
||||
_termsInfo = (NamedList<NamedList<Number>>) res.getVal( i );
|
||||
_termsInfo = (NamedList<NamedList<Object>>) res.getVal( i );
|
||||
extractTermsInfo( _termsInfo );
|
||||
}
|
||||
else if ( "moreLikeThis".equals( n ) ) {
|
||||
|
@ -191,7 +191,7 @@ public class QueryResponse extends SolrResponseBase
|
|||
_suggestResponse = new SuggesterResponse(suggestInfo);
|
||||
}
|
||||
|
||||
private void extractTermsInfo(NamedList<NamedList<Number>> termsInfo) {
|
||||
private void extractTermsInfo(NamedList<NamedList<Object>> termsInfo) {
|
||||
_termsResponse = new TermsResponse(termsInfo);
|
||||
}
|
||||
|
||||
|
|
|
@ -28,14 +28,23 @@ import java.util.Map;
|
|||
public class TermsResponse {
|
||||
private Map<String, List<Term>> termMap = new HashMap<>();
|
||||
|
||||
public TermsResponse(NamedList<NamedList<Number>> termsInfo) {
|
||||
public TermsResponse(NamedList<NamedList<Object>> termsInfo) {
|
||||
for (int i = 0; i < termsInfo.size(); i++) {
|
||||
String fieldName = termsInfo.getName(i);
|
||||
List<Term> itemList = new ArrayList<>();
|
||||
NamedList<Number> items = termsInfo.getVal(i);
|
||||
NamedList<Object> items = termsInfo.getVal(i);
|
||||
|
||||
for (int j = 0; j < items.size(); j++) {
|
||||
Term t = new Term(items.getName(j), items.getVal(j).longValue());
|
||||
String term = items.getName(j);
|
||||
Object val = items.getVal(j);
|
||||
Term t;
|
||||
if (val instanceof NamedList) {
|
||||
@SuppressWarnings("unchecked")
|
||||
NamedList<Number> termStats = (NamedList<Number>) val;
|
||||
t = new Term(term, termStats.get("docFreq").longValue(), termStats.get("totalTermFreq").longValue());
|
||||
} else {
|
||||
t = new Term(term, ((Number) val).longValue());
|
||||
}
|
||||
itemList.add(t);
|
||||
}
|
||||
|
||||
|
@ -59,10 +68,16 @@ public class TermsResponse {
|
|||
public static class Term {
|
||||
private String term;
|
||||
private long frequency;
|
||||
private long totalTermFreq;
|
||||
|
||||
public Term(String term, long frequency) {
|
||||
this(term, frequency, 0);
|
||||
}
|
||||
|
||||
public Term(String term, long frequency, long totalTermFreq) {
|
||||
this.term = term;
|
||||
this.frequency = frequency;
|
||||
this.totalTermFreq = totalTermFreq;
|
||||
}
|
||||
|
||||
public String getTerm() {
|
||||
|
@ -84,5 +99,17 @@ public class TermsResponse {
|
|||
public void addFrequency(long frequency) {
|
||||
this.frequency += frequency;
|
||||
}
|
||||
|
||||
public long getTotalTermFreq() {
|
||||
return totalTermFreq;
|
||||
}
|
||||
|
||||
public void setTotalTermFreq(long totalTermFreq) {
|
||||
this.totalTermFreq = totalTermFreq;
|
||||
}
|
||||
|
||||
public void addTotalTermFreq(long totalTermFreq) {
|
||||
this.totalTermFreq += totalTermFreq;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -42,16 +42,19 @@ public interface TermsParams {
|
|||
|
||||
/**
|
||||
* Optional. The list of terms to be retrieved.
|
||||
*
|
||||
*/
|
||||
public static final String TERMS_LIST = TERMS_PREFIX + "list";
|
||||
|
||||
/**
|
||||
* Optional. The list of terms to be retrieved.
|
||||
*
|
||||
* Optional. If true, also returns index-level statistics, such as numDocs.
|
||||
*/
|
||||
public static final String TERMS_STATS = TERMS_PREFIX + "stats";
|
||||
|
||||
/**
|
||||
* Optional. If true, also returns terms' total term frequency.
|
||||
*/
|
||||
public static final String TERMS_TTF = TERMS_PREFIX + "ttf";
|
||||
|
||||
/**
|
||||
* Optional. The lower bound term to start at. The TermEnum will start at the next term after this term in the dictionary.
|
||||
*
|
||||
|
@ -111,6 +114,7 @@ public interface TermsParams {
|
|||
* Optional. The minimum value of docFreq to be returned. 1 by default
|
||||
*/
|
||||
public static final String TERMS_MINCOUNT = TERMS_PREFIX + "mincount";
|
||||
|
||||
/**
|
||||
* Optional. The maximum value of docFreq to be returned. -1 by default means no boundary
|
||||
*/
|
||||
|
|
Loading…
Reference in New Issue