SOLR-651: Added in TermVectorComponent

git-svn-id: https://svn.apache.org/repos/asf/lucene/solr/trunk@707399 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Grant Ingersoll 2008-10-23 15:49:18 +00:00
parent 1e5562a910
commit 153a59999e
7 changed files with 601 additions and 2 deletions

View File

@ -70,6 +70,9 @@ New Features
10. SOLR-746: Added "omitHeader" request parameter to omit the header from the response.
(Noble Paul via shalin)
11. SOLR-651: Added TermVectorComponent for serving up term vector information, plus IDF.
See http://wiki.apache.org/solr/TermVectorComponent (gsingers, Vaijanath N. Rao, Noble Paul)
Optimizations
----------------------
1. SOLR-374: Use IndexReader.reopen to save resources by re-using parts of the

View File

@ -281,6 +281,8 @@
termVectors: [false] set to true to store the term vector for a given field.
When using MoreLikeThis, fields used for similarity should be stored for
best performance.
termPositions: Store position information with the term vector. This will increase storage costs.
termOffsets: Store offset information with the term vector. This will increase storage costs.
-->
<field name="id" type="string" indexed="true" stored="true" required="true" />
@ -290,7 +292,7 @@
<field name="alphaNameSort" type="alphaOnlySort" indexed="true" stored="false"/>
<field name="manu" type="text" indexed="true" stored="true" omitNorms="true"/>
<field name="cat" type="text_ws" indexed="true" stored="true" multiValued="true" omitNorms="true" termVectors="true" />
<field name="features" type="text" indexed="true" stored="true" multiValued="true"/>
<field name="features" type="text" indexed="true" stored="true" multiValued="true" termVectors="true" termPositions="true" termOffsets="true"/>
<field name="includes" type="text" indexed="true" stored="true"/>
<field name="weight" type="sfloat" indexed="true" stored="true"/>

View File

@ -574,7 +574,8 @@
</lst>
</searchComponent>
<!-- a request handler utilizing the spellcheck component -->
<!-- a request handler utilizing the spellcheck component. This is purely as an example.
You will likely want to add the component to your already specified request handlers. -->
<requestHandler name="/spellCheckCompRH" class="solr.SearchHandler">
<lst name="defaults">
<!-- omp = Only More Popular -->
@ -589,6 +590,19 @@
</arr>
</requestHandler>
<searchComponent name="tvComponent" class="org.apache.solr.handler.component.TermVectorComponent"/>
<!-- A Req Handler for working with the tvComponent. This is purely as an example.
You will likely want to add the component to your already specified request handlers. -->
<requestHandler name="tvrh" class="org.apache.solr.handler.component.SearchHandler">
<lst name="defaults">
<bool name="tv">true</bool>
</lst>
<arr name="last-components">
<str>tvComponent</str>
</arr>
</requestHandler>
<!-- a search component that enables you to configure the top results for
a given query regardless of the normal lucene scoring.-->
<searchComponent name="elevator" class="solr.QueryElevationComponent" >

View File

@ -0,0 +1,50 @@
package org.apache.solr.common.params;
/**
*
*
**/
public interface TermVectorParams {
public static final String TV_PREFIX = "tv.";
/**
* Return Term Frequency info
* */
public static final String TF = TV_PREFIX + "tf";
/**
* Return Term Vector position information
*
* */
public static final String POSITIONS = TV_PREFIX + "positions";
/**
* Return offset information, if available
* */
public static final String OFFSETS = TV_PREFIX + "offsets";
/**
* Return IDF information. May be expensive
* */
public static final String IDF = TV_PREFIX + "idf";
/**
* Return TF-IDF calculation, i.e. (tf / idf). May be expensive.
*/
public static final String TF_IDF = TV_PREFIX + "tf-idf";
/**
* Return all the options: TF, positions, offsets, idf
*/
public static final String ALL = TV_PREFIX + "all";
/**
* The fields to get term vectors for
*/
public static final String FIELDS = TV_PREFIX + "fl";
/**
* The Doc Ids (Lucene internal ids) of the docs to get the term vectors for
*/
public static final String DOC_IDS = TV_PREFIX + "docIds";
}

View File

@ -0,0 +1,307 @@
package org.apache.solr.handler.component;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.SetBasedFieldSelector;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermEnum;
import org.apache.lucene.index.TermVectorMapper;
import org.apache.lucene.index.TermVectorOffsetInfo;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.params.CommonParams;
import org.apache.solr.common.params.ModifiableSolrParams;
import org.apache.solr.common.params.SolrParams;
import org.apache.solr.common.params.TermVectorParams;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.common.util.StrUtils;
import org.apache.solr.core.SolrCore;
import org.apache.solr.schema.IndexSchema;
import org.apache.solr.search.DocList;
import org.apache.solr.search.DocListAndSet;
import org.apache.solr.search.SolrIndexSearcher;
import org.apache.solr.util.RefCounted;
import org.apache.solr.util.plugin.SolrCoreAware;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.logging.Logger;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
* Return term vectors for the documents in a query result set.
* <p/>
* Info available:
* term, frequency, position, offset, IDF.
* <p/>
* <b>Note</b> Returning IDF can be expensive.
*/
public class TermVectorComponent extends SearchComponent implements SolrCoreAware {
private transient static Logger log = Logger.getLogger(TermVectorComponent.class.getName());
public static final String COMPONENT_NAME = "tv";
protected NamedList initParams;
public static final String TERM_VECTORS = "termVectors";
public void process(ResponseBuilder rb) throws IOException {
SolrParams params = rb.req.getParams();
if (!params.getBool(COMPONENT_NAME, false)) {
return;
}
NamedList termVectors = new NamedList();
rb.rsp.add(TERM_VECTORS, termVectors);
//figure out what options we have, and try to get the appropriate vector
boolean termFreq = params.getBool(TermVectorParams.TF, false);
boolean positions = params.getBool(TermVectorParams.POSITIONS, false);
boolean offsets = params.getBool(TermVectorParams.OFFSETS, false);
boolean idf = params.getBool(TermVectorParams.IDF, false);
boolean tfIdf = params.getBool(TermVectorParams.TF_IDF, false);
//boolean cacheIdf = params.getBool(TermVectorParams.IDF, false);
boolean all = params.getBool(TermVectorParams.ALL, false);
if (all == true){
termFreq = true;
positions = true;
offsets = true;
idf = true;
tfIdf = true;
}
String[] fields = params.getParams(TermVectorParams.FIELDS);
if (fields == null) {
fields = params.getParams(CommonParams.FL);
}
DocListAndSet listAndSet = rb.getResults();
List<Integer> docIds = getInts(params.getParams(TermVectorParams.DOC_IDS));
Iterator<Integer> iter;
if (docIds != null && docIds.isEmpty() == false) {
iter = docIds.iterator();
} else {
DocList list = listAndSet.docList;
iter = list.iterator();
}
SolrCore core = rb.req.getCore();
RefCounted<SolrIndexSearcher> searcher = core.getSearcher();
try {
IndexReader reader = searcher.get().getReader();
TVMapper mapper = new TVMapper(fields, reader, termFreq, positions, offsets, idf, tfIdf);
IndexSchema schema = core.getSchema();
String uniqFieldName = schema.getUniqueKeyField().getName();
SetBasedFieldSelector fieldSelector = new SetBasedFieldSelector(Collections.singleton(uniqFieldName), Collections.emptySet());
while (iter.hasNext()) {
Integer docId = iter.next();
NamedList docNL = new NamedList();
termVectors.add("doc-" + docId, docNL);
mapper.docNL = docNL;
Document document = reader.document(docId, fieldSelector);
String uniqId = document.get(uniqFieldName);
docNL.add("uniqueKey", uniqId);
reader.getTermFreqVector(docId, mapper);
}
termVectors.add("uniqueKeyFieldName", uniqFieldName);
} finally {
searcher.decref();
}
}
private List<Integer> getInts(String[] vals) {
List<Integer> result = null;
if (vals != null && vals.length > 0) {
result = new ArrayList<Integer>(vals.length);
for (int i = 0; i < vals.length; i++) {
try {
result.add(new Integer(vals[i]));
} catch (NumberFormatException e) {
throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, e.getMessage(), e);
}
}
}
return result;
}
@Override
public int distributedProcess(ResponseBuilder rb) throws IOException {
int result = ResponseBuilder.STAGE_DONE;
if (rb.stage == ResponseBuilder.STAGE_GET_FIELDS) {
//Go ask each shard for it's vectors
// for each shard, collect the documents for that shard.
HashMap<String, Collection<ShardDoc>> shardMap = new HashMap<String, Collection<ShardDoc>>();
for (ShardDoc sdoc : rb.resultIds.values()) {
Collection<ShardDoc> shardDocs = shardMap.get(sdoc.shard);
if (shardDocs == null) {
shardDocs = new ArrayList<ShardDoc>();
shardMap.put(sdoc.shard, shardDocs);
}
shardDocs.add(sdoc);
}
// Now create a request for each shard to retrieve the stored fields
for (Collection<ShardDoc> shardDocs : shardMap.values()) {
ShardRequest sreq = new ShardRequest();
sreq.purpose = ShardRequest.PURPOSE_GET_FIELDS;
sreq.shards = new String[]{shardDocs.iterator().next().shard};
sreq.params = new ModifiableSolrParams();
// add original params
sreq.params.add(rb.req.getParams());
sreq.params.remove(CommonParams.Q);//remove the query
ArrayList<String> ids = new ArrayList<String>(shardDocs.size());
for (ShardDoc shardDoc : shardDocs) {
ids.add(shardDoc.id.toString());
}
sreq.params.add(TermVectorParams.DOC_IDS, StrUtils.join(ids, ','));
rb.addRequest(this, sreq);
}
result = ResponseBuilder.STAGE_DONE;
}
return result;
}
private class TVMapper extends TermVectorMapper {
private NamedList docNL;
private IndexReader reader;
private Set<String> fields;
private boolean termFreq, positions, offsets, idf, tfIdf;
//internal vars not passed in by construction
private boolean map, useOffsets, usePositions;
//private Map<String, Integer> idfCache;
private NamedList fieldNL;
private Term currentTerm;
public TVMapper(String[] fields, IndexReader reader, boolean termFreq, boolean positions, boolean offsets, boolean idf, boolean tfIdf) {
this.reader = reader;
this.fields = fields != null ? new HashSet<String>(Arrays.asList(fields)) : Collections.<String>emptySet();
this.termFreq = termFreq;
this.positions = positions;
this.offsets = offsets;
this.idf = idf;
this.tfIdf = tfIdf;
}
public void map(String term, int frequency, TermVectorOffsetInfo[] offsets, int[] positions) {
if (map == true && fieldNL != null) {
NamedList termInfo = new NamedList();
fieldNL.add(term, termInfo);
if (termFreq == true) {
termInfo.add("freq", frequency);
}
if (useOffsets == true) {
NamedList theOffsets = new NamedList();
termInfo.add("offsets", theOffsets);
for (int i = 0; i < offsets.length; i++) {
TermVectorOffsetInfo offset = offsets[i];
theOffsets.add("start", offset.getStartOffset());
theOffsets.add("end", offset.getEndOffset());
}
}
if (usePositions == true) {
NamedList positionsNL = new NamedList();
for (int i = 0; i < positions.length; i++) {
positionsNL.add("position", positions[i]);
}
termInfo.add("positions", positionsNL);
}
if (idf == true) {
termInfo.add("idf", getIdf(term));
}
if (tfIdf == true){
double tfIdfVal = ((double) frequency) / getIdf(term);
termInfo.add("tf-idf", tfIdfVal);
}
}
}
private int getIdf(String term) {
int result = 1;
currentTerm = currentTerm.createTerm(term);
try {
TermEnum termEnum = reader.terms(currentTerm);
if (termEnum != null && termEnum.term().equals(currentTerm)) {
result = termEnum.docFreq();
}
} catch (IOException e) {
throw new RuntimeException(e);
}
return result;
}
public void setExpectations(String field, int numTerms, boolean storeOffsets, boolean storePositions) {
if (idf == true && reader != null) {
this.currentTerm = new Term(field);
}
useOffsets = storeOffsets && offsets;
usePositions = storePositions && positions;
if (fields.isEmpty() || fields.contains(field)) {
map = true;
fieldNL = new NamedList();
docNL.add(field, fieldNL);
} else {
map = false;
fieldNL = null;
}
}
}
public void prepare(ResponseBuilder rb) throws IOException {
}
//////////////////////// NamedListInitializedPlugin methods //////////////////////
@Override
public void init(NamedList args) {
super.init(args);
this.initParams = args;
}
public void inform(SolrCore core) {
}
public String getVersion() {
return "$Revision$";
}
public String getSourceId() {
return "$Id:$";
}
public String getSource() {
return "$Revision:$";
}
public String getDescription() {
return "A Component for working with Term Vectors";
}
}

View File

@ -0,0 +1,211 @@
package org.apache.solr.handler.component;
import org.apache.solr.util.AbstractSolrTestCase;
import org.apache.solr.core.SolrCore;
import org.apache.solr.common.params.ModifiableSolrParams;
import org.apache.solr.common.params.CommonParams;
import org.apache.solr.common.params.TermVectorParams;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.common.util.SimpleOrderedMap;
import org.apache.solr.request.SolrRequestHandler;
import org.apache.solr.request.SolrQueryResponse;
import org.apache.solr.request.LocalSolrQueryRequest;
import java.util.HashMap;
import java.util.ArrayList;
import java.util.Arrays;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
*
*
**/
public class TermVectorComponentTest extends AbstractSolrTestCase {
@Override
public String getSchemaFile() {
return "schema.xml";
}
@Override
public String getSolrConfigFile() {
return "solrconfig.xml";
}
@Override
public void setUp() throws Exception {
super.setUp();
assertU(adoc("id", "0", "test_posofftv", "This is a title and another title"));
assertU(adoc("id", "1", "test_posofftv",
"The quick reb fox jumped over the lazy brown dogs."));
assertU(adoc("id", "2", "test_posofftv", "This is a document"));
assertU(adoc("id", "3", "test_posofftv", "another document"));
//bunch of docs that are variants on blue
assertU(adoc("id", "4", "test_posofftv", "blue"));
assertU(adoc("id", "5", "test_posofftv", "blud"));
assertU(adoc("id", "6", "test_posofftv", "boue"));
assertU(adoc("id", "7", "test_posofftv", "glue"));
assertU(adoc("id", "8", "test_posofftv", "blee"));
assertU(adoc("id", "9", "test_posofftv", "blah"));
assertU("commit", commit());
}
public void testBasics() throws Exception {
SolrCore core = h.getCore();
SearchComponent tvComp = core.getSearchComponent("tvComponent");
assertTrue("tvComp is null and it shouldn't be", tvComp != null);
ModifiableSolrParams params = new ModifiableSolrParams();
params.add(CommonParams.Q, "id:0");
params.add(CommonParams.QT, "tvrh");
params.add(TermVectorParams.TF, "true");
params.add(TermVectorComponent.COMPONENT_NAME, "true");
SolrRequestHandler handler = core.getRequestHandler("tvrh");
SolrQueryResponse rsp;
rsp = new SolrQueryResponse();
rsp.add("responseHeader", new SimpleOrderedMap());
handler.handleRequest(new LocalSolrQueryRequest(core, params), rsp);
NamedList values = rsp.getValues();
NamedList termVectors = (NamedList) values.get(TermVectorComponent.TERM_VECTORS);
assertTrue("termVectors is null and it shouldn't be", termVectors != null);
System.out.println("TVs:" + termVectors);
NamedList doc = (NamedList) termVectors.getVal(0);
assertTrue("doc is null and it shouldn't be", doc != null);
assertTrue(doc.size() + " does not equal: " + 2, doc.size() == 2);
NamedList field = (NamedList) doc.get("test_posofftv");
assertTrue("field is null and it shouldn't be", field != null);
assertTrue(field.size() + " does not equal: " + 2, field.size() == 2);
NamedList titl = (NamedList) field.get("titl");
assertTrue("titl is null and it shouldn't be", titl != null);
assertTrue(titl.get("freq") + " does not equal: " + 2, ((Integer) titl.get("freq")) == 2);
String uniqueKeyFieldName = (String) termVectors.getVal(1);
assertTrue("uniqueKeyFieldName is null and it shouldn't be", uniqueKeyFieldName != null);
assertTrue(uniqueKeyFieldName + " is not equal to " + "id", uniqueKeyFieldName.equals("id") == true);
}
public void testOptions() throws Exception {
SolrCore core = h.getCore();
SearchComponent tvComp = core.getSearchComponent("tvComponent");
assertTrue("tvComp is null and it shouldn't be", tvComp != null);
ModifiableSolrParams params = new ModifiableSolrParams();
params.add(CommonParams.Q, "id:0");
params.add(CommonParams.QT, "tvrh");
params.add(TermVectorParams.TF, "true");
params.add(TermVectorParams.IDF, "true");
params.add(TermVectorParams.OFFSETS, "true");
params.add(TermVectorParams.POSITIONS, "true");
params.add(TermVectorParams.TF_IDF, "true");
params.add(TermVectorComponent.COMPONENT_NAME, "true");
SolrRequestHandler handler = core.getRequestHandler("tvrh");
SolrQueryResponse rsp;
rsp = new SolrQueryResponse();
rsp.add("responseHeader", new SimpleOrderedMap());
handler.handleRequest(new LocalSolrQueryRequest(core, params), rsp);
NamedList values = rsp.getValues();
NamedList termVectors = (NamedList) values.get(TermVectorComponent.TERM_VECTORS);
assertTrue("termVectors is null and it shouldn't be", termVectors != null);
System.out.println("TVs: " + termVectors);
NamedList doc = (NamedList) termVectors.getVal(0);
assertTrue("doc is null and it shouldn't be", doc != null);
assertTrue(doc.size() + " does not equal: " + 2, doc.size() == 2);
}
public void testNoFields() throws Exception {
SolrCore core = h.getCore();
SearchComponent tvComp = core.getSearchComponent("tvComponent");
assertTrue("tvComp is null and it shouldn't be", tvComp != null);
ModifiableSolrParams params = new ModifiableSolrParams();
params.add(CommonParams.Q, "id:0");
params.add(CommonParams.QT, "tvrh");
params.add(TermVectorParams.TF, "true");
//Pass in a field that doesn't exist on the doc, thus, no vectors should be returned
params.add(TermVectorParams.FIELDS, "foo");
params.add(TermVectorComponent.COMPONENT_NAME, "true");
SolrRequestHandler handler = core.getRequestHandler("tvrh");
SolrQueryResponse rsp;
rsp = new SolrQueryResponse();
rsp.add("responseHeader", new SimpleOrderedMap());
handler.handleRequest(new LocalSolrQueryRequest(core, params), rsp);
NamedList values = rsp.getValues();
NamedList termVectors = (NamedList) values.get(TermVectorComponent.TERM_VECTORS);
assertTrue("termVectors is null and it shouldn't be", termVectors != null);
NamedList doc = (NamedList) termVectors.getVal(0);
assertTrue("doc is null and it shouldn't be", doc != null);
assertTrue(doc.size() + " does not equal: " + 1, doc.size() == 1);
}
public void testDistributed() throws Exception {
SolrCore core = h.getCore();
TermVectorComponent tvComp = (TermVectorComponent) core.getSearchComponent("tvComponent");
assertTrue("tvComp is null and it shouldn't be", tvComp != null);
ModifiableSolrParams params = new ModifiableSolrParams();
ResponseBuilder rb = new ResponseBuilder();
rb.stage = ResponseBuilder.STAGE_GET_FIELDS;
rb.shards = new String[]{"localhost:0", "localhost:1", "localhost:2", "localhost:3"};//we don't actually call these, since we are going to invoke distributedProcess directly
rb.resultIds = new HashMap<Object, ShardDoc>();
rb.components = new ArrayList<SearchComponent>();
rb.components.add(tvComp);
params.add(CommonParams.Q, "id:0");
params.add(CommonParams.QT, "tvrh");
params.add(TermVectorParams.TF, "true");
params.add(TermVectorParams.IDF, "true");
params.add(TermVectorParams.OFFSETS, "true");
params.add(TermVectorParams.POSITIONS, "true");
params.add(TermVectorComponent.COMPONENT_NAME, "true");
rb.req = new LocalSolrQueryRequest(core, params);
rb.outgoing = new ArrayList<ShardRequest>();
//one doc per shard, but make sure there are enough docs to go around
for (int i = 0; i < rb.shards.length; i++){
ShardDoc doc = new ShardDoc();
doc.id = i; //must be a valid doc that was indexed.
doc.score = 1 - (i / (float)rb.shards.length);
doc.positionInResponse = i;
doc.shard = rb.shards[i];
doc.orderInShard = 0;
rb.resultIds.put(doc.id, doc);
}
int result = tvComp.distributedProcess(rb);
assertTrue(result + " does not equal: " + ResponseBuilder.STAGE_DONE, result == ResponseBuilder.STAGE_DONE);
//one outgoing per shard
assertTrue("rb.outgoing Size: " + rb.outgoing.size() + " is not: " + rb.shards.length, rb.outgoing.size() == rb.shards.length);
for (ShardRequest request : rb.outgoing) {
ModifiableSolrParams solrParams = request.params;
System.out.println("Shard: " + Arrays.asList(request.shards) + " Params: " + solrParams);
}
}
}
/*
* <field name="test_basictv" type="text" termVectors="true"/>
<field name="test_notv" type="text" termVectors="false"/>
<field name="test_postv" type="text" termVectors="true" termPositions="true"/>
<field name="test_offtv" type="text" termVectors="true" termOffsets="true"/>
<field name="test_posofftv" type="text" termVectors="true"
termPositions="true" termOffsets="true"/>
*
* */

View File

@ -369,6 +369,18 @@
</arr>
</requestHandler>
<searchComponent name="tvComponent" class="org.apache.solr.handler.component.TermVectorComponent"/>
<requestHandler name="tvrh" class="org.apache.solr.handler.component.SearchHandler">
<lst name="defaults">
</lst>
<arr name="last-components">
<str>tvComponent</str>
</arr>
</requestHandler>
<highlighting>
<!-- Configure the standard fragmenter -->
<fragmenter name="gap" class="org.apache.solr.highlight.GapFragmenter" default="true">