SOLR-9243:Add terms.list parameter to the TermsComponent to fetch the docFreq for a list of terms

This commit is contained in:
jbernste 2016-06-23 10:04:43 -04:00
parent 1e1dc91bbf
commit 1427f4b2e7
4 changed files with 133 additions and 15 deletions

View File

@ -28,6 +28,7 @@ import org.apache.solr.common.util.StrUtils;
import org.apache.solr.schema.FieldType;
import org.apache.solr.schema.StrField;
import org.apache.solr.request.SimpleFacets.CountPair;
import org.apache.solr.search.SolrIndexSearcher;
import org.apache.solr.util.BoundedTreeSet;
import org.apache.solr.client.solrj.response.TermsResponse;
@ -92,6 +93,12 @@ public class TermsComponent extends SearchComponent {
if (fields == null || fields.length==0) return;
String termList = params.get(TermsParams.TERMS_LIST);
if(termList != null) {
fetchTerms(rb.req.getSearcher(), fields, termList, termsResult);
return;
}
int limit = params.getInt(TermsParams.TERMS_LIMIT, 10);
if (limit < 0) {
limit = Integer.MAX_VALUE;
@ -377,8 +384,12 @@ public class TermsComponent extends SearchComponent {
NamedList<Object> response = new SimpleOrderedMap<>();
// determine if we are going index or count sort
boolean sort = !TermsParams.TERMS_SORT_INDEX.equals(params.get(
TermsParams.TERMS_SORT, TermsParams.TERMS_SORT_COUNT));
boolean sort = !TermsParams.TERMS_SORT_INDEX.equals(params.get(TermsParams.TERMS_SORT,
TermsParams.TERMS_SORT_COUNT));
if(params.get(TermsParams.TERMS_LIST) != null) {
//Always use lexical sort when TERM_LIST is provided
sort = false;
}
// init minimum frequency
long freqmin = 1;
@ -466,6 +477,76 @@ public class TermsComponent extends SearchComponent {
}
}
private void fetchTerms(SolrIndexSearcher indexSearcher,
String[] fields,
String termList,
NamedList result) throws IOException {
NamedList termsMap = new SimpleOrderedMap();
List<LeafReaderContext> leaves = indexSearcher.getTopReaderContext().leaves();
String field = fields[0];
FieldType fieldType = indexSearcher.getSchema().getField(field).getType();
String[] splitTerms = termList.split(",");
for(int i=0; i<splitTerms.length; i++) {
splitTerms[i] = splitTerms[i].trim();
}
Term[] terms = new Term[splitTerms.length];
TermContext[] termContexts = new TermContext[terms.length];
for(int i=0; i<splitTerms.length; i++) {
terms[i] = new Term(field, fieldType.readableToIndexed(splitTerms[i]));
}
Arrays.sort(terms);
collectTermContext(indexSearcher.getTopReaderContext().reader(), leaves, termContexts, terms);
for(int i=0; i<terms.length; i++) {
if(termContexts[i] != null) {
String outTerm = fieldType.indexedToReadable(terms[i].bytes().utf8ToString());
int docFreq = termContexts[i].docFreq();
termsMap.add(outTerm, docFreq);
}
}
result.add(field, termsMap);
}
private void collectTermContext(IndexReader reader,
List<LeafReaderContext> leaves, TermContext[] contextArray,
Term[] queryTerms) throws IOException {
TermsEnum termsEnum = null;
for (LeafReaderContext context : leaves) {
final Fields fields = context.reader().fields();
for (int i = 0; i < queryTerms.length; i++) {
Term term = queryTerms[i];
TermContext termContext = contextArray[i];
final Terms terms = fields.terms(term.field());
if (terms == null) {
// field does not exist
continue;
}
termsEnum = terms.iterator();
assert termsEnum != null;
if (termsEnum == TermsEnum.EMPTY) continue;
if (termsEnum.seekExact(term.bytes())) {
if (termContext == null) {
contextArray[i] = new TermContext(reader.getContext(),
termsEnum.termState(), context.ord, termsEnum.docFreq(),
termsEnum.totalTermFreq());
} else {
termContext.register(termsEnum.termState(), context.ord,
termsEnum.docFreq(), termsEnum.totalTermFreq());
}
}
}
}
}
@Override
public String getDescription() {
return "A Component for working with Term Enumerators";

View File

@ -30,10 +30,10 @@ public class DistributedTermsComponentTest extends BaseDistributedSearchTestCase
@Test
public void test() throws Exception {
del("*:*");
index(id, 18, "b_t", "snake spider shark snail slug seal");
index(id, 19, "b_t", "snake spider shark snail slug");
index(id, 20, "b_t", "snake spider shark snail");
index(id, 21, "b_t", "snake spider shark");
index(id, 18, "b_t", "snake spider shark snail slug seal", "foo_i", "1");
index(id, 19, "b_t", "snake spider shark snail slug", "foo_i", "2");
index(id, 20, "b_t", "snake spider shark snail", "foo_i", "3");
index(id, 21, "b_t", "snake spider shark", "foo_i", "2");
index(id, 22, "b_t", "snake spider");
index(id, 23, "b_t", "snake");
index(id, 24, "b_t", "ant zebra");
@ -49,5 +49,8 @@ public class DistributedTermsComponentTest extends BaseDistributedSearchTestCase
query("qt", "/terms", "shards.qt", "/terms", "terms.limit", 5, "terms", "true", "terms.fl", "b_t", "terms.prefix", "s", "terms.lower", "s", "terms.sort", "index");
query("qt", "/terms", "shards.qt", "/terms", "terms.limit", 5, "terms", "true", "terms.fl", "b_t", "terms.prefix", "s", "terms.lower", "s", "terms.upper", "sn", "terms.sort", "index");
query("qt", "/terms", "shards.qt", "/terms", "terms", "true", "terms.fl", "b_t", "terms.sort", "index");
query("qt", "/terms", "shards.qt", "/terms", "terms", "true", "terms.fl", "b_t", "terms.list", "snake, zebra, ant, bad");
query("qt", "/terms", "shards.qt", "/terms", "terms", "true", "terms.fl", "foo_i", "terms.list", "2, 3, 1");
}
}

View File

@ -32,9 +32,9 @@ public class TermsComponentTest extends SolrTestCaseJ4 {
@BeforeClass
public static void beforeTest() throws Exception {
System.setProperty("enable.update.log", "false"); // schema12 doesn't support _version_
initCore("solrconfig.xml","schema12.xml");
initCore("solrconfig.xml", "schema12.xml");
assertNull(h.validateUpdate(adoc("id", "0", "lowerfilt", "a", "standardfilt", "a", "foo_i","1")));
assertNull(h.validateUpdate(adoc("id", "0", "lowerfilt", "a", "standardfilt", "a", "foo_i", "1")));
assertNull(h.validateUpdate(adoc("id", "1", "lowerfilt", "a", "standardfilt", "aa", "foo_i","1")));
assertNull(h.validateUpdate(adoc("id", "2", "lowerfilt", "aa", "standardfilt", "aaa", "foo_i","2")));
assertNull(h.validateUpdate(adoc("id", "3", "lowerfilt", "aaa", "standardfilt", "abbb")));
@ -45,7 +45,10 @@ public class TermsComponentTest extends SolrTestCaseJ4 {
assertNull(h.validateUpdate(adoc("id", "8", "lowerfilt", "baa", "standardfilt", "cccc")));
assertNull(h.validateUpdate(adoc("id", "9", "lowerfilt", "bbb", "standardfilt", "ccccc")));
assertNull(h.validateUpdate(adoc("id", "10", "standardfilt", "ddddd")));
assertNull(h.validateUpdate(commit()));
assertNull(h.validateUpdate(adoc("id", "11", "standardfilt", "ddddd")));
assertNull(h.validateUpdate(adoc("id", "12", "standardfilt", "ddddd")));
assertNull(h.validateUpdate(adoc("id", "13", "standardfilt", "ddddd")));
@ -53,6 +56,8 @@ public class TermsComponentTest extends SolrTestCaseJ4 {
assertNull(h.validateUpdate(adoc("id", "15", "standardfilt", "d")));
assertNull(h.validateUpdate(adoc("id", "16", "standardfilt", "d")));
assertNull(h.validateUpdate(commit()));
assertNull(h.validateUpdate(adoc("id", "17", "standardfilt", "snake")));
assertNull(h.validateUpdate(adoc("id", "18", "standardfilt", "spider")));
assertNull(h.validateUpdate(adoc("id", "19", "standardfilt", "shark")));
@ -137,13 +142,13 @@ public class TermsComponentTest extends SolrTestCaseJ4 {
@Test
public void testRegexpWithFlags() throws Exception {
// TODO: there are no uppercase or mixed-case terms in the index!
assertQ(req("indent","true", "qt","/terms", "terms","true",
"terms.fl","standardfilt",
"terms.lower","a", "terms.lower.incl","false",
"terms.upper","c", "terms.upper.incl","true",
"terms.regex","B.*",
"terms.regex.flag","case_insensitive")
,"count(//lst[@name='standardfilt']/*)=3"
assertQ(req("indent", "true", "qt", "/terms", "terms", "true",
"terms.fl", "standardfilt",
"terms.lower", "a", "terms.lower.incl", "false",
"terms.upper", "c", "terms.upper.incl", "true",
"terms.regex", "B.*",
"terms.regex.flag", "case_insensitive")
, "count(//lst[@name='standardfilt']/*)=3"
);
}
@ -162,6 +167,29 @@ public class TermsComponentTest extends SolrTestCaseJ4 {
}
@Test
public void testTermsList() throws Exception {
//Terms list always returns in index order
assertQ(req("indent","true", "qt","/terms", "terms","true",
"terms.fl","standardfilt",
"terms.list","spider, snake, shark, ddddd, bad")
,"count(//lst[@name='standardfilt']/*)=4"
,"//lst[@name='standardfilt']/int[1][@name='ddddd'][.='4']"
,"//lst[@name='standardfilt']/int[2][@name='shark'][.='2']"
,"//lst[@name='standardfilt']/int[3][@name='snake'][.='3']"
,"//lst[@name='standardfilt']/int[4][@name='spider'][.='1']"
);
//Test with numeric terms
assertQ(req("indent","true", "qt","/terms", "terms","true",
"terms.fl","foo_i",
"terms.list","2, 1")
,"count(//lst[@name='foo_i']/*)=2"
,"//lst[@name='foo_i']/int[1][@name='1'][.='2']"
,"//lst[@name='foo_i']/int[2][@name='2'][.='1']"
);
}
@Test
public void testSortIndex() throws Exception {
assertQ(req("indent","true", "qt","/terms", "terms","true",

View File

@ -38,6 +38,12 @@ public interface TermsParams {
*/
public static final String TERMS_FIELD = TERMS_PREFIX + "fl";
/**
* Optional. The list of terms to be retrieved.
*
*/
public static final String TERMS_LIST = TERMS_PREFIX + "list";
/**
* Optional. The lower bound term to start at. The TermEnum will start at the next term after this term in the dictionary.
*