mirror of https://github.com/apache/lucene.git
SOLR-651: Added in TermVectorComponent
git-svn-id: https://svn.apache.org/repos/asf/lucene/solr/trunk@707399 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
1e5562a910
commit
153a59999e
|
@ -70,6 +70,9 @@ New Features
|
||||||
10. SOLR-746: Added "omitHeader" request parameter to omit the header from the response.
|
10. SOLR-746: Added "omitHeader" request parameter to omit the header from the response.
|
||||||
(Noble Paul via shalin)
|
(Noble Paul via shalin)
|
||||||
|
|
||||||
|
11. SOLR-651: Added TermVectorComponent for serving up term vector information, plus IDF.
|
||||||
|
See http://wiki.apache.org/solr/TermVectorComponent (gsingers, Vaijanath N. Rao, Noble Paul)
|
||||||
|
|
||||||
Optimizations
|
Optimizations
|
||||||
----------------------
|
----------------------
|
||||||
1. SOLR-374: Use IndexReader.reopen to save resources by re-using parts of the
|
1. SOLR-374: Use IndexReader.reopen to save resources by re-using parts of the
|
||||||
|
|
|
@ -281,6 +281,8 @@
|
||||||
termVectors: [false] set to true to store the term vector for a given field.
|
termVectors: [false] set to true to store the term vector for a given field.
|
||||||
When using MoreLikeThis, fields used for similarity should be stored for
|
When using MoreLikeThis, fields used for similarity should be stored for
|
||||||
best performance.
|
best performance.
|
||||||
|
termPositions: Store position information with the term vector. This will increase storage costs.
|
||||||
|
termOffsets: Store offset information with the term vector. This will increase storage costs.
|
||||||
-->
|
-->
|
||||||
|
|
||||||
<field name="id" type="string" indexed="true" stored="true" required="true" />
|
<field name="id" type="string" indexed="true" stored="true" required="true" />
|
||||||
|
@ -290,7 +292,7 @@
|
||||||
<field name="alphaNameSort" type="alphaOnlySort" indexed="true" stored="false"/>
|
<field name="alphaNameSort" type="alphaOnlySort" indexed="true" stored="false"/>
|
||||||
<field name="manu" type="text" indexed="true" stored="true" omitNorms="true"/>
|
<field name="manu" type="text" indexed="true" stored="true" omitNorms="true"/>
|
||||||
<field name="cat" type="text_ws" indexed="true" stored="true" multiValued="true" omitNorms="true" termVectors="true" />
|
<field name="cat" type="text_ws" indexed="true" stored="true" multiValued="true" omitNorms="true" termVectors="true" />
|
||||||
<field name="features" type="text" indexed="true" stored="true" multiValued="true"/>
|
<field name="features" type="text" indexed="true" stored="true" multiValued="true" termVectors="true" termPositions="true" termOffsets="true"/>
|
||||||
<field name="includes" type="text" indexed="true" stored="true"/>
|
<field name="includes" type="text" indexed="true" stored="true"/>
|
||||||
|
|
||||||
<field name="weight" type="sfloat" indexed="true" stored="true"/>
|
<field name="weight" type="sfloat" indexed="true" stored="true"/>
|
||||||
|
|
|
@ -574,7 +574,8 @@
|
||||||
</lst>
|
</lst>
|
||||||
</searchComponent>
|
</searchComponent>
|
||||||
|
|
||||||
<!-- a request handler utilizing the spellcheck component -->
|
<!-- a request handler utilizing the spellcheck component. This is purely as an example.
|
||||||
|
You will likely want to add the component to your already specified request handlers. -->
|
||||||
<requestHandler name="/spellCheckCompRH" class="solr.SearchHandler">
|
<requestHandler name="/spellCheckCompRH" class="solr.SearchHandler">
|
||||||
<lst name="defaults">
|
<lst name="defaults">
|
||||||
<!-- omp = Only More Popular -->
|
<!-- omp = Only More Popular -->
|
||||||
|
@ -589,6 +590,19 @@
|
||||||
</arr>
|
</arr>
|
||||||
</requestHandler>
|
</requestHandler>
|
||||||
|
|
||||||
|
<searchComponent name="tvComponent" class="org.apache.solr.handler.component.TermVectorComponent"/>
|
||||||
|
<!-- A Req Handler for working with the tvComponent. This is purely as an example.
|
||||||
|
You will likely want to add the component to your already specified request handlers. -->
|
||||||
|
<requestHandler name="tvrh" class="org.apache.solr.handler.component.SearchHandler">
|
||||||
|
<lst name="defaults">
|
||||||
|
<bool name="tv">true</bool>
|
||||||
|
</lst>
|
||||||
|
<arr name="last-components">
|
||||||
|
<str>tvComponent</str>
|
||||||
|
</arr>
|
||||||
|
</requestHandler>
|
||||||
|
|
||||||
|
|
||||||
<!-- a search component that enables you to configure the top results for
|
<!-- a search component that enables you to configure the top results for
|
||||||
a given query regardless of the normal lucene scoring.-->
|
a given query regardless of the normal lucene scoring.-->
|
||||||
<searchComponent name="elevator" class="solr.QueryElevationComponent" >
|
<searchComponent name="elevator" class="solr.QueryElevationComponent" >
|
||||||
|
|
|
@ -0,0 +1,50 @@
|
||||||
|
package org.apache.solr.common.params;
|
||||||
|
|
||||||
|
|
||||||
|
/**
 * Names of the request parameters understood by the
 * {@code org.apache.solr.handler.component.TermVectorComponent}.
 * <p/>
 * Every option shares the {@link #TV_PREFIX} namespace. The values are plain
 * constants; interface fields are implicitly {@code public static final}.
 **/
public interface TermVectorParams {

  /**
   * Prefix common to all term vector request parameters.
   */
  String TV_PREFIX = "tv.";

  /**
   * Return term frequency information.
   */
  String TF = TV_PREFIX + "tf";

  /**
   * Return term vector position information.
   */
  String POSITIONS = TV_PREFIX + "positions";

  /**
   * Return offset information, if available.
   */
  String OFFSETS = TV_PREFIX + "offsets";

  /**
   * Return IDF information. May be expensive.
   */
  String IDF = TV_PREFIX + "idf";

  /**
   * Return the TF-IDF calculation. May be expensive.
   * <p/>
   * The component computes this as the term frequency divided by the value it
   * reports for {@link #IDF}.
   */
  String TF_IDF = TV_PREFIX + "tf-idf";

  /**
   * Return all the options: TF, positions, offsets, idf and tf-idf.
   */
  String ALL = TV_PREFIX + "all";

  /**
   * The fields to get term vectors for.
   */
  String FIELDS = TV_PREFIX + "fl";

  /**
   * The doc ids (Lucene internal ids) of the docs to get the term vectors for.
   */
  String DOC_IDS = TV_PREFIX + "docIds";
}
|
|
@ -0,0 +1,307 @@
|
||||||
|
package org.apache.solr.handler.component;
|
||||||
|
|
||||||
|
import org.apache.lucene.document.Document;
|
||||||
|
import org.apache.lucene.document.SetBasedFieldSelector;
|
||||||
|
import org.apache.lucene.index.IndexReader;
|
||||||
|
import org.apache.lucene.index.Term;
|
||||||
|
import org.apache.lucene.index.TermEnum;
|
||||||
|
import org.apache.lucene.index.TermVectorMapper;
|
||||||
|
import org.apache.lucene.index.TermVectorOffsetInfo;
|
||||||
|
import org.apache.solr.common.SolrException;
|
||||||
|
import org.apache.solr.common.params.CommonParams;
|
||||||
|
import org.apache.solr.common.params.ModifiableSolrParams;
|
||||||
|
import org.apache.solr.common.params.SolrParams;
|
||||||
|
import org.apache.solr.common.params.TermVectorParams;
|
||||||
|
import org.apache.solr.common.util.NamedList;
|
||||||
|
import org.apache.solr.common.util.StrUtils;
|
||||||
|
import org.apache.solr.core.SolrCore;
|
||||||
|
import org.apache.solr.schema.IndexSchema;
|
||||||
|
import org.apache.solr.search.DocList;
|
||||||
|
import org.apache.solr.search.DocListAndSet;
|
||||||
|
import org.apache.solr.search.SolrIndexSearcher;
|
||||||
|
import org.apache.solr.util.RefCounted;
|
||||||
|
import org.apache.solr.util.plugin.SolrCoreAware;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.Arrays;
|
||||||
|
import java.util.Collection;
|
||||||
|
import java.util.Collections;
|
||||||
|
import java.util.HashMap;
|
||||||
|
import java.util.HashSet;
|
||||||
|
import java.util.Iterator;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Map;
|
||||||
|
import java.util.Set;
|
||||||
|
import java.util.logging.Logger;
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Return term vectors for the documents in a query result set.
|
||||||
|
* <p/>
|
||||||
|
* Info available:
|
||||||
|
* term, frequency, position, offset, IDF.
|
||||||
|
* <p/>
|
||||||
|
* <b>Note</b> Returning IDF can be expensive.
|
||||||
|
*/
|
||||||
|
public class TermVectorComponent extends SearchComponent implements SolrCoreAware {
|
||||||
|
private transient static Logger log = Logger.getLogger(TermVectorComponent.class.getName());
|
||||||
|
|
||||||
|
public static final String COMPONENT_NAME = "tv";
|
||||||
|
|
||||||
|
protected NamedList initParams;
|
||||||
|
public static final String TERM_VECTORS = "termVectors";
|
||||||
|
|
||||||
|
|
||||||
|
public void process(ResponseBuilder rb) throws IOException {
|
||||||
|
SolrParams params = rb.req.getParams();
|
||||||
|
if (!params.getBool(COMPONENT_NAME, false)) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
NamedList termVectors = new NamedList();
|
||||||
|
rb.rsp.add(TERM_VECTORS, termVectors);
|
||||||
|
//figure out what options we have, and try to get the appropriate vector
|
||||||
|
boolean termFreq = params.getBool(TermVectorParams.TF, false);
|
||||||
|
boolean positions = params.getBool(TermVectorParams.POSITIONS, false);
|
||||||
|
boolean offsets = params.getBool(TermVectorParams.OFFSETS, false);
|
||||||
|
boolean idf = params.getBool(TermVectorParams.IDF, false);
|
||||||
|
boolean tfIdf = params.getBool(TermVectorParams.TF_IDF, false);
|
||||||
|
//boolean cacheIdf = params.getBool(TermVectorParams.IDF, false);
|
||||||
|
|
||||||
|
boolean all = params.getBool(TermVectorParams.ALL, false);
|
||||||
|
if (all == true){
|
||||||
|
termFreq = true;
|
||||||
|
positions = true;
|
||||||
|
offsets = true;
|
||||||
|
idf = true;
|
||||||
|
tfIdf = true;
|
||||||
|
}
|
||||||
|
|
||||||
|
String[] fields = params.getParams(TermVectorParams.FIELDS);
|
||||||
|
if (fields == null) {
|
||||||
|
fields = params.getParams(CommonParams.FL);
|
||||||
|
}
|
||||||
|
DocListAndSet listAndSet = rb.getResults();
|
||||||
|
List<Integer> docIds = getInts(params.getParams(TermVectorParams.DOC_IDS));
|
||||||
|
Iterator<Integer> iter;
|
||||||
|
if (docIds != null && docIds.isEmpty() == false) {
|
||||||
|
iter = docIds.iterator();
|
||||||
|
} else {
|
||||||
|
DocList list = listAndSet.docList;
|
||||||
|
iter = list.iterator();
|
||||||
|
}
|
||||||
|
SolrCore core = rb.req.getCore();
|
||||||
|
RefCounted<SolrIndexSearcher> searcher = core.getSearcher();
|
||||||
|
try {
|
||||||
|
IndexReader reader = searcher.get().getReader();
|
||||||
|
TVMapper mapper = new TVMapper(fields, reader, termFreq, positions, offsets, idf, tfIdf);
|
||||||
|
IndexSchema schema = core.getSchema();
|
||||||
|
String uniqFieldName = schema.getUniqueKeyField().getName();
|
||||||
|
SetBasedFieldSelector fieldSelector = new SetBasedFieldSelector(Collections.singleton(uniqFieldName), Collections.emptySet());
|
||||||
|
while (iter.hasNext()) {
|
||||||
|
Integer docId = iter.next();
|
||||||
|
NamedList docNL = new NamedList();
|
||||||
|
termVectors.add("doc-" + docId, docNL);
|
||||||
|
mapper.docNL = docNL;
|
||||||
|
Document document = reader.document(docId, fieldSelector);
|
||||||
|
String uniqId = document.get(uniqFieldName);
|
||||||
|
docNL.add("uniqueKey", uniqId);
|
||||||
|
reader.getTermFreqVector(docId, mapper);
|
||||||
|
}
|
||||||
|
termVectors.add("uniqueKeyFieldName", uniqFieldName);
|
||||||
|
} finally {
|
||||||
|
searcher.decref();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private List<Integer> getInts(String[] vals) {
|
||||||
|
List<Integer> result = null;
|
||||||
|
if (vals != null && vals.length > 0) {
|
||||||
|
result = new ArrayList<Integer>(vals.length);
|
||||||
|
for (int i = 0; i < vals.length; i++) {
|
||||||
|
try {
|
||||||
|
result.add(new Integer(vals[i]));
|
||||||
|
} catch (NumberFormatException e) {
|
||||||
|
throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, e.getMessage(), e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public int distributedProcess(ResponseBuilder rb) throws IOException {
|
||||||
|
int result = ResponseBuilder.STAGE_DONE;
|
||||||
|
if (rb.stage == ResponseBuilder.STAGE_GET_FIELDS) {
|
||||||
|
//Go ask each shard for it's vectors
|
||||||
|
// for each shard, collect the documents for that shard.
|
||||||
|
HashMap<String, Collection<ShardDoc>> shardMap = new HashMap<String, Collection<ShardDoc>>();
|
||||||
|
for (ShardDoc sdoc : rb.resultIds.values()) {
|
||||||
|
Collection<ShardDoc> shardDocs = shardMap.get(sdoc.shard);
|
||||||
|
if (shardDocs == null) {
|
||||||
|
shardDocs = new ArrayList<ShardDoc>();
|
||||||
|
shardMap.put(sdoc.shard, shardDocs);
|
||||||
|
}
|
||||||
|
shardDocs.add(sdoc);
|
||||||
|
}
|
||||||
|
// Now create a request for each shard to retrieve the stored fields
|
||||||
|
for (Collection<ShardDoc> shardDocs : shardMap.values()) {
|
||||||
|
ShardRequest sreq = new ShardRequest();
|
||||||
|
sreq.purpose = ShardRequest.PURPOSE_GET_FIELDS;
|
||||||
|
|
||||||
|
sreq.shards = new String[]{shardDocs.iterator().next().shard};
|
||||||
|
|
||||||
|
sreq.params = new ModifiableSolrParams();
|
||||||
|
|
||||||
|
// add original params
|
||||||
|
sreq.params.add(rb.req.getParams());
|
||||||
|
sreq.params.remove(CommonParams.Q);//remove the query
|
||||||
|
ArrayList<String> ids = new ArrayList<String>(shardDocs.size());
|
||||||
|
for (ShardDoc shardDoc : shardDocs) {
|
||||||
|
ids.add(shardDoc.id.toString());
|
||||||
|
}
|
||||||
|
sreq.params.add(TermVectorParams.DOC_IDS, StrUtils.join(ids, ','));
|
||||||
|
|
||||||
|
rb.addRequest(this, sreq);
|
||||||
|
}
|
||||||
|
result = ResponseBuilder.STAGE_DONE;
|
||||||
|
}
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
private class TVMapper extends TermVectorMapper {
|
||||||
|
private NamedList docNL;
|
||||||
|
private IndexReader reader;
|
||||||
|
private Set<String> fields;
|
||||||
|
private boolean termFreq, positions, offsets, idf, tfIdf;
|
||||||
|
//internal vars not passed in by construction
|
||||||
|
private boolean map, useOffsets, usePositions;
|
||||||
|
//private Map<String, Integer> idfCache;
|
||||||
|
private NamedList fieldNL;
|
||||||
|
private Term currentTerm;
|
||||||
|
|
||||||
|
public TVMapper(String[] fields, IndexReader reader, boolean termFreq, boolean positions, boolean offsets, boolean idf, boolean tfIdf) {
|
||||||
|
|
||||||
|
this.reader = reader;
|
||||||
|
this.fields = fields != null ? new HashSet<String>(Arrays.asList(fields)) : Collections.<String>emptySet();
|
||||||
|
this.termFreq = termFreq;
|
||||||
|
this.positions = positions;
|
||||||
|
this.offsets = offsets;
|
||||||
|
this.idf = idf;
|
||||||
|
this.tfIdf = tfIdf;
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
public void map(String term, int frequency, TermVectorOffsetInfo[] offsets, int[] positions) {
|
||||||
|
if (map == true && fieldNL != null) {
|
||||||
|
NamedList termInfo = new NamedList();
|
||||||
|
fieldNL.add(term, termInfo);
|
||||||
|
if (termFreq == true) {
|
||||||
|
termInfo.add("freq", frequency);
|
||||||
|
}
|
||||||
|
if (useOffsets == true) {
|
||||||
|
NamedList theOffsets = new NamedList();
|
||||||
|
termInfo.add("offsets", theOffsets);
|
||||||
|
for (int i = 0; i < offsets.length; i++) {
|
||||||
|
TermVectorOffsetInfo offset = offsets[i];
|
||||||
|
theOffsets.add("start", offset.getStartOffset());
|
||||||
|
theOffsets.add("end", offset.getEndOffset());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (usePositions == true) {
|
||||||
|
NamedList positionsNL = new NamedList();
|
||||||
|
for (int i = 0; i < positions.length; i++) {
|
||||||
|
positionsNL.add("position", positions[i]);
|
||||||
|
}
|
||||||
|
termInfo.add("positions", positionsNL);
|
||||||
|
}
|
||||||
|
if (idf == true) {
|
||||||
|
termInfo.add("idf", getIdf(term));
|
||||||
|
}
|
||||||
|
if (tfIdf == true){
|
||||||
|
double tfIdfVal = ((double) frequency) / getIdf(term);
|
||||||
|
termInfo.add("tf-idf", tfIdfVal);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private int getIdf(String term) {
|
||||||
|
int result = 1;
|
||||||
|
currentTerm = currentTerm.createTerm(term);
|
||||||
|
try {
|
||||||
|
TermEnum termEnum = reader.terms(currentTerm);
|
||||||
|
if (termEnum != null && termEnum.term().equals(currentTerm)) {
|
||||||
|
result = termEnum.docFreq();
|
||||||
|
}
|
||||||
|
} catch (IOException e) {
|
||||||
|
throw new RuntimeException(e);
|
||||||
|
}
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setExpectations(String field, int numTerms, boolean storeOffsets, boolean storePositions) {
|
||||||
|
|
||||||
|
if (idf == true && reader != null) {
|
||||||
|
this.currentTerm = new Term(field);
|
||||||
|
}
|
||||||
|
useOffsets = storeOffsets && offsets;
|
||||||
|
usePositions = storePositions && positions;
|
||||||
|
if (fields.isEmpty() || fields.contains(field)) {
|
||||||
|
map = true;
|
||||||
|
fieldNL = new NamedList();
|
||||||
|
docNL.add(field, fieldNL);
|
||||||
|
} else {
|
||||||
|
map = false;
|
||||||
|
fieldNL = null;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public void prepare(ResponseBuilder rb) throws IOException {
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
//////////////////////// NamedListInitializedPlugin methods //////////////////////
|
||||||
|
@Override
|
||||||
|
public void init(NamedList args) {
|
||||||
|
super.init(args);
|
||||||
|
this.initParams = args;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void inform(SolrCore core) {
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
public String getVersion() {
|
||||||
|
return "$Revision$";
|
||||||
|
}
|
||||||
|
|
||||||
|
public String getSourceId() {
|
||||||
|
return "$Id:$";
|
||||||
|
}
|
||||||
|
|
||||||
|
public String getSource() {
|
||||||
|
return "$Revision:$";
|
||||||
|
}
|
||||||
|
|
||||||
|
public String getDescription() {
|
||||||
|
return "A Component for working with Term Vectors";
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,211 @@
|
||||||
|
package org.apache.solr.handler.component;
|
||||||
|
|
||||||
|
import org.apache.solr.util.AbstractSolrTestCase;
|
||||||
|
import org.apache.solr.core.SolrCore;
|
||||||
|
import org.apache.solr.common.params.ModifiableSolrParams;
|
||||||
|
import org.apache.solr.common.params.CommonParams;
|
||||||
|
import org.apache.solr.common.params.TermVectorParams;
|
||||||
|
import org.apache.solr.common.util.NamedList;
|
||||||
|
import org.apache.solr.common.util.SimpleOrderedMap;
|
||||||
|
import org.apache.solr.request.SolrRequestHandler;
|
||||||
|
import org.apache.solr.request.SolrQueryResponse;
|
||||||
|
import org.apache.solr.request.LocalSolrQueryRequest;
|
||||||
|
|
||||||
|
import java.util.HashMap;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.Arrays;
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
*
|
||||||
|
*
|
||||||
|
**/
|
||||||
|
public class TermVectorComponentTest extends AbstractSolrTestCase {
|
||||||
|
@Override
|
||||||
|
public String getSchemaFile() {
|
||||||
|
return "schema.xml";
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public String getSolrConfigFile() {
|
||||||
|
return "solrconfig.xml";
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void setUp() throws Exception {
|
||||||
|
super.setUp();
|
||||||
|
assertU(adoc("id", "0", "test_posofftv", "This is a title and another title"));
|
||||||
|
assertU(adoc("id", "1", "test_posofftv",
|
||||||
|
"The quick reb fox jumped over the lazy brown dogs."));
|
||||||
|
assertU(adoc("id", "2", "test_posofftv", "This is a document"));
|
||||||
|
assertU(adoc("id", "3", "test_posofftv", "another document"));
|
||||||
|
//bunch of docs that are variants on blue
|
||||||
|
assertU(adoc("id", "4", "test_posofftv", "blue"));
|
||||||
|
assertU(adoc("id", "5", "test_posofftv", "blud"));
|
||||||
|
assertU(adoc("id", "6", "test_posofftv", "boue"));
|
||||||
|
assertU(adoc("id", "7", "test_posofftv", "glue"));
|
||||||
|
assertU(adoc("id", "8", "test_posofftv", "blee"));
|
||||||
|
assertU(adoc("id", "9", "test_posofftv", "blah"));
|
||||||
|
|
||||||
|
assertU("commit", commit());
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testBasics() throws Exception {
|
||||||
|
SolrCore core = h.getCore();
|
||||||
|
SearchComponent tvComp = core.getSearchComponent("tvComponent");
|
||||||
|
assertTrue("tvComp is null and it shouldn't be", tvComp != null);
|
||||||
|
ModifiableSolrParams params = new ModifiableSolrParams();
|
||||||
|
params.add(CommonParams.Q, "id:0");
|
||||||
|
params.add(CommonParams.QT, "tvrh");
|
||||||
|
params.add(TermVectorParams.TF, "true");
|
||||||
|
params.add(TermVectorComponent.COMPONENT_NAME, "true");
|
||||||
|
SolrRequestHandler handler = core.getRequestHandler("tvrh");
|
||||||
|
SolrQueryResponse rsp;
|
||||||
|
rsp = new SolrQueryResponse();
|
||||||
|
rsp.add("responseHeader", new SimpleOrderedMap());
|
||||||
|
handler.handleRequest(new LocalSolrQueryRequest(core, params), rsp);
|
||||||
|
NamedList values = rsp.getValues();
|
||||||
|
NamedList termVectors = (NamedList) values.get(TermVectorComponent.TERM_VECTORS);
|
||||||
|
assertTrue("termVectors is null and it shouldn't be", termVectors != null);
|
||||||
|
System.out.println("TVs:" + termVectors);
|
||||||
|
NamedList doc = (NamedList) termVectors.getVal(0);
|
||||||
|
assertTrue("doc is null and it shouldn't be", doc != null);
|
||||||
|
assertTrue(doc.size() + " does not equal: " + 2, doc.size() == 2);
|
||||||
|
NamedList field = (NamedList) doc.get("test_posofftv");
|
||||||
|
assertTrue("field is null and it shouldn't be", field != null);
|
||||||
|
assertTrue(field.size() + " does not equal: " + 2, field.size() == 2);
|
||||||
|
NamedList titl = (NamedList) field.get("titl");
|
||||||
|
assertTrue("titl is null and it shouldn't be", titl != null);
|
||||||
|
assertTrue(titl.get("freq") + " does not equal: " + 2, ((Integer) titl.get("freq")) == 2);
|
||||||
|
|
||||||
|
String uniqueKeyFieldName = (String) termVectors.getVal(1);
|
||||||
|
assertTrue("uniqueKeyFieldName is null and it shouldn't be", uniqueKeyFieldName != null);
|
||||||
|
assertTrue(uniqueKeyFieldName + " is not equal to " + "id", uniqueKeyFieldName.equals("id") == true);
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testOptions() throws Exception {
|
||||||
|
SolrCore core = h.getCore();
|
||||||
|
SearchComponent tvComp = core.getSearchComponent("tvComponent");
|
||||||
|
assertTrue("tvComp is null and it shouldn't be", tvComp != null);
|
||||||
|
ModifiableSolrParams params = new ModifiableSolrParams();
|
||||||
|
params.add(CommonParams.Q, "id:0");
|
||||||
|
params.add(CommonParams.QT, "tvrh");
|
||||||
|
params.add(TermVectorParams.TF, "true");
|
||||||
|
params.add(TermVectorParams.IDF, "true");
|
||||||
|
params.add(TermVectorParams.OFFSETS, "true");
|
||||||
|
params.add(TermVectorParams.POSITIONS, "true");
|
||||||
|
params.add(TermVectorParams.TF_IDF, "true");
|
||||||
|
params.add(TermVectorComponent.COMPONENT_NAME, "true");
|
||||||
|
|
||||||
|
SolrRequestHandler handler = core.getRequestHandler("tvrh");
|
||||||
|
SolrQueryResponse rsp;
|
||||||
|
rsp = new SolrQueryResponse();
|
||||||
|
rsp.add("responseHeader", new SimpleOrderedMap());
|
||||||
|
handler.handleRequest(new LocalSolrQueryRequest(core, params), rsp);
|
||||||
|
NamedList values = rsp.getValues();
|
||||||
|
NamedList termVectors = (NamedList) values.get(TermVectorComponent.TERM_VECTORS);
|
||||||
|
assertTrue("termVectors is null and it shouldn't be", termVectors != null);
|
||||||
|
System.out.println("TVs: " + termVectors);
|
||||||
|
NamedList doc = (NamedList) termVectors.getVal(0);
|
||||||
|
assertTrue("doc is null and it shouldn't be", doc != null);
|
||||||
|
assertTrue(doc.size() + " does not equal: " + 2, doc.size() == 2);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public void testNoFields() throws Exception {
|
||||||
|
SolrCore core = h.getCore();
|
||||||
|
SearchComponent tvComp = core.getSearchComponent("tvComponent");
|
||||||
|
assertTrue("tvComp is null and it shouldn't be", tvComp != null);
|
||||||
|
ModifiableSolrParams params = new ModifiableSolrParams();
|
||||||
|
params.add(CommonParams.Q, "id:0");
|
||||||
|
params.add(CommonParams.QT, "tvrh");
|
||||||
|
params.add(TermVectorParams.TF, "true");
|
||||||
|
//Pass in a field that doesn't exist on the doc, thus, no vectors should be returned
|
||||||
|
params.add(TermVectorParams.FIELDS, "foo");
|
||||||
|
params.add(TermVectorComponent.COMPONENT_NAME, "true");
|
||||||
|
SolrRequestHandler handler = core.getRequestHandler("tvrh");
|
||||||
|
SolrQueryResponse rsp;
|
||||||
|
rsp = new SolrQueryResponse();
|
||||||
|
rsp.add("responseHeader", new SimpleOrderedMap());
|
||||||
|
handler.handleRequest(new LocalSolrQueryRequest(core, params), rsp);
|
||||||
|
NamedList values = rsp.getValues();
|
||||||
|
NamedList termVectors = (NamedList) values.get(TermVectorComponent.TERM_VECTORS);
|
||||||
|
assertTrue("termVectors is null and it shouldn't be", termVectors != null);
|
||||||
|
NamedList doc = (NamedList) termVectors.getVal(0);
|
||||||
|
assertTrue("doc is null and it shouldn't be", doc != null);
|
||||||
|
assertTrue(doc.size() + " does not equal: " + 1, doc.size() == 1);
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testDistributed() throws Exception {
|
||||||
|
SolrCore core = h.getCore();
|
||||||
|
TermVectorComponent tvComp = (TermVectorComponent) core.getSearchComponent("tvComponent");
|
||||||
|
assertTrue("tvComp is null and it shouldn't be", tvComp != null);
|
||||||
|
ModifiableSolrParams params = new ModifiableSolrParams();
|
||||||
|
ResponseBuilder rb = new ResponseBuilder();
|
||||||
|
rb.stage = ResponseBuilder.STAGE_GET_FIELDS;
|
||||||
|
rb.shards = new String[]{"localhost:0", "localhost:1", "localhost:2", "localhost:3"};//we don't actually call these, since we are going to invoke distributedProcess directly
|
||||||
|
rb.resultIds = new HashMap<Object, ShardDoc>();
|
||||||
|
rb.components = new ArrayList<SearchComponent>();
|
||||||
|
rb.components.add(tvComp);
|
||||||
|
params.add(CommonParams.Q, "id:0");
|
||||||
|
params.add(CommonParams.QT, "tvrh");
|
||||||
|
params.add(TermVectorParams.TF, "true");
|
||||||
|
params.add(TermVectorParams.IDF, "true");
|
||||||
|
params.add(TermVectorParams.OFFSETS, "true");
|
||||||
|
params.add(TermVectorParams.POSITIONS, "true");
|
||||||
|
params.add(TermVectorComponent.COMPONENT_NAME, "true");
|
||||||
|
rb.req = new LocalSolrQueryRequest(core, params);
|
||||||
|
rb.outgoing = new ArrayList<ShardRequest>();
|
||||||
|
//one doc per shard, but make sure there are enough docs to go around
|
||||||
|
for (int i = 0; i < rb.shards.length; i++){
|
||||||
|
ShardDoc doc = new ShardDoc();
|
||||||
|
doc.id = i; //must be a valid doc that was indexed.
|
||||||
|
doc.score = 1 - (i / (float)rb.shards.length);
|
||||||
|
doc.positionInResponse = i;
|
||||||
|
doc.shard = rb.shards[i];
|
||||||
|
doc.orderInShard = 0;
|
||||||
|
rb.resultIds.put(doc.id, doc);
|
||||||
|
}
|
||||||
|
|
||||||
|
int result = tvComp.distributedProcess(rb);
|
||||||
|
assertTrue(result + " does not equal: " + ResponseBuilder.STAGE_DONE, result == ResponseBuilder.STAGE_DONE);
|
||||||
|
//one outgoing per shard
|
||||||
|
assertTrue("rb.outgoing Size: " + rb.outgoing.size() + " is not: " + rb.shards.length, rb.outgoing.size() == rb.shards.length);
|
||||||
|
for (ShardRequest request : rb.outgoing) {
|
||||||
|
ModifiableSolrParams solrParams = request.params;
|
||||||
|
System.out.println("Shard: " + Arrays.asList(request.shards) + " Params: " + solrParams);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
/*
|
||||||
|
* <field name="test_basictv" type="text" termVectors="true"/>
|
||||||
|
<field name="test_notv" type="text" termVectors="false"/>
|
||||||
|
<field name="test_postv" type="text" termVectors="true" termPositions="true"/>
|
||||||
|
<field name="test_offtv" type="text" termVectors="true" termOffsets="true"/>
|
||||||
|
<field name="test_posofftv" type="text" termVectors="true"
|
||||||
|
termPositions="true" termOffsets="true"/>
|
||||||
|
*
|
||||||
|
* */
|
|
@ -369,6 +369,18 @@
|
||||||
</arr>
|
</arr>
|
||||||
</requestHandler>
|
</requestHandler>
|
||||||
|
|
||||||
|
|
||||||
|
<searchComponent name="tvComponent" class="org.apache.solr.handler.component.TermVectorComponent"/>
|
||||||
|
|
||||||
|
<requestHandler name="tvrh" class="org.apache.solr.handler.component.SearchHandler">
|
||||||
|
<lst name="defaults">
|
||||||
|
|
||||||
|
</lst>
|
||||||
|
<arr name="last-components">
|
||||||
|
<str>tvComponent</str>
|
||||||
|
</arr>
|
||||||
|
</requestHandler>
|
||||||
|
|
||||||
<highlighting>
|
<highlighting>
|
||||||
<!-- Configure the standard fragmenter -->
|
<!-- Configure the standard fragmenter -->
|
||||||
<fragmenter name="gap" class="org.apache.solr.highlight.GapFragmenter" default="true">
|
<fragmenter name="gap" class="org.apache.solr.highlight.GapFragmenter" default="true">
|
||||||
|
|
Loading…
Reference in New Issue