From 153a59999e6d55944c80674999a5f141b3030a3f Mon Sep 17 00:00:00 2001 From: Grant Ingersoll Date: Thu, 23 Oct 2008 15:49:18 +0000 Subject: [PATCH] SOLR-651: Added in TermVectorComponent git-svn-id: https://svn.apache.org/repos/asf/lucene/solr/trunk@707399 13f79535-47bb-0310-9956-ffa450edef68 --- CHANGES.txt | 3 + example/solr/conf/schema.xml | 4 +- example/solr/conf/solrconfig.xml | 16 +- .../solr/common/params/TermVectorParams.java | 50 +++ .../component/TermVectorComponent.java | 307 ++++++++++++++++++ .../component/TermVectorComponentTest.java | 211 ++++++++++++ src/test/test-files/solr/conf/solrconfig.xml | 12 + 7 files changed, 601 insertions(+), 2 deletions(-) create mode 100644 src/java/org/apache/solr/common/params/TermVectorParams.java create mode 100644 src/java/org/apache/solr/handler/component/TermVectorComponent.java create mode 100644 src/test/org/apache/solr/handler/component/TermVectorComponentTest.java diff --git a/CHANGES.txt b/CHANGES.txt index a136acc48db..460da0d33e9 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -70,6 +70,9 @@ New Features 10. SOLR-746: Added "omitHeader" request parameter to omit the header from the response. (Noble Paul via shalin) +11. SOLR-651: Added TermVectorComponent for serving up term vector information, plus IDF. + See http://wiki.apache.org/solr/TermVectorComponent (gsingers, Vaijanath N. Rao, Noble Paul) + Optimizations ---------------------- 1. SOLR-374: Use IndexReader.reopen to save resources by re-using parts of the diff --git a/example/solr/conf/schema.xml b/example/solr/conf/schema.xml index 873a186b359..e82e93488ba 100755 --- a/example/solr/conf/schema.xml +++ b/example/solr/conf/schema.xml @@ -281,6 +281,8 @@ termVectors: [false] set to true to store the term vector for a given field. When using MoreLikeThis, fields used for similarity should be stored for best performance. + termPositions: Store position information with the term vector. This will increase storage costs. + termOffsets: Store offset information with the term vector. This will increase storage costs. --> @@ -290,7 +292,7 @@ - + diff --git a/example/solr/conf/solrconfig.xml b/example/solr/conf/solrconfig.xml index 59d0779a337..cc11b1e27fe 100755 --- a/example/solr/conf/solrconfig.xml +++ b/example/solr/conf/solrconfig.xml @@ -574,7 +574,8 @@ - + @@ -589,6 +590,19 @@ + + + + + true + + + tvComponent + + + + diff --git a/src/java/org/apache/solr/common/params/TermVectorParams.java b/src/java/org/apache/solr/common/params/TermVectorParams.java new file mode 100644 index 00000000000..a8401c3086d --- /dev/null +++ b/src/java/org/apache/solr/common/params/TermVectorParams.java @@ -0,0 +1,50 @@ +package org.apache.solr.common.params; + + +/** + * + * + **/ +public interface TermVectorParams { + + public static final String TV_PREFIX = "tv."; + + /** + * Return Term Frequency info + * */ + public static final String TF = TV_PREFIX + "tf"; + /** + * Return Term Vector position information + * + * */ + public static final String POSITIONS = TV_PREFIX + "positions"; + /** + * Return offset information, if available + * */ + public static final String OFFSETS = TV_PREFIX + "offsets"; + /** + * Return IDF information. May be expensive + * */ + public static final String IDF = TV_PREFIX + "idf"; + + /** + * Return TF-IDF calculation, i.e. (tf / idf). May be expensive. + */ + public static final String TF_IDF = TV_PREFIX + "tf-idf"; + + + /** + * Return all the options: TF, positions, offsets, idf + */ + public static final String ALL = TV_PREFIX + "all"; + + /** + * The fields to get term vectors for + */ + public static final String FIELDS = TV_PREFIX + "fl"; + + /** + * The Doc Ids (Lucene internal ids) of the docs to get the term vectors for + */ + public static final String DOC_IDS = TV_PREFIX + "docIds"; +} diff --git a/src/java/org/apache/solr/handler/component/TermVectorComponent.java b/src/java/org/apache/solr/handler/component/TermVectorComponent.java new file mode 100644 index 00000000000..65f90de3abb --- /dev/null +++ b/src/java/org/apache/solr/handler/component/TermVectorComponent.java @@ -0,0 +1,307 @@ +package org.apache.solr.handler.component; + +import org.apache.lucene.document.Document; +import org.apache.lucene.document.SetBasedFieldSelector; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.Term; +import org.apache.lucene.index.TermEnum; +import org.apache.lucene.index.TermVectorMapper; +import org.apache.lucene.index.TermVectorOffsetInfo; +import org.apache.solr.common.SolrException; +import org.apache.solr.common.params.CommonParams; +import org.apache.solr.common.params.ModifiableSolrParams; +import org.apache.solr.common.params.SolrParams; +import org.apache.solr.common.params.TermVectorParams; +import org.apache.solr.common.util.NamedList; +import org.apache.solr.common.util.StrUtils; +import org.apache.solr.core.SolrCore; +import org.apache.solr.schema.IndexSchema; +import org.apache.solr.search.DocList; +import org.apache.solr.search.DocListAndSet; +import org.apache.solr.search.SolrIndexSearcher; +import org.apache.solr.util.RefCounted; +import org.apache.solr.util.plugin.SolrCoreAware; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collection; +import java.util.Collections; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Iterator; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.logging.Logger; +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +/** + * Return term vectors for the documents in a query result set. + *

+ * Info available: + * term, frequency, position, offset, IDF. + *

+ * Note Returning IDF can be expensive. + */ +public class TermVectorComponent extends SearchComponent implements SolrCoreAware { + private transient static Logger log = Logger.getLogger(TermVectorComponent.class.getName()); + + public static final String COMPONENT_NAME = "tv"; + + protected NamedList initParams; + public static final String TERM_VECTORS = "termVectors"; + + + public void process(ResponseBuilder rb) throws IOException { + SolrParams params = rb.req.getParams(); + if (!params.getBool(COMPONENT_NAME, false)) { + return; + } + + NamedList termVectors = new NamedList(); + rb.rsp.add(TERM_VECTORS, termVectors); + //figure out what options we have, and try to get the appropriate vector + boolean termFreq = params.getBool(TermVectorParams.TF, false); + boolean positions = params.getBool(TermVectorParams.POSITIONS, false); + boolean offsets = params.getBool(TermVectorParams.OFFSETS, false); + boolean idf = params.getBool(TermVectorParams.IDF, false); + boolean tfIdf = params.getBool(TermVectorParams.TF_IDF, false); + //boolean cacheIdf = params.getBool(TermVectorParams.IDF, false); + + boolean all = params.getBool(TermVectorParams.ALL, false); + if (all == true){ + termFreq = true; + positions = true; + offsets = true; + idf = true; + tfIdf = true; + } + + String[] fields = params.getParams(TermVectorParams.FIELDS); + if (fields == null) { + fields = params.getParams(CommonParams.FL); + } + DocListAndSet listAndSet = rb.getResults(); + List docIds = getInts(params.getParams(TermVectorParams.DOC_IDS)); + Iterator iter; + if (docIds != null && docIds.isEmpty() == false) { + iter = docIds.iterator(); + } else { + DocList list = listAndSet.docList; + iter = list.iterator(); + } + SolrCore core = rb.req.getCore(); + RefCounted searcher = core.getSearcher(); + try { + IndexReader reader = searcher.get().getReader(); + TVMapper mapper = new TVMapper(fields, reader, termFreq, positions, offsets, idf, tfIdf); + IndexSchema schema = core.getSchema(); + String uniqFieldName = schema.getUniqueKeyField().getName(); + SetBasedFieldSelector fieldSelector = new SetBasedFieldSelector(Collections.singleton(uniqFieldName), Collections.emptySet()); + while (iter.hasNext()) { + Integer docId = iter.next(); + NamedList docNL = new NamedList(); + termVectors.add("doc-" + docId, docNL); + mapper.docNL = docNL; + Document document = reader.document(docId, fieldSelector); + String uniqId = document.get(uniqFieldName); + docNL.add("uniqueKey", uniqId); + reader.getTermFreqVector(docId, mapper); + } + termVectors.add("uniqueKeyFieldName", uniqFieldName); + } finally { + searcher.decref(); + } + } + + private List getInts(String[] vals) { + List result = null; + if (vals != null && vals.length > 0) { + result = new ArrayList(vals.length); + for (int i = 0; i < vals.length; i++) { + try { + result.add(new Integer(vals[i])); + } catch (NumberFormatException e) { + throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, e.getMessage(), e); + } + } + } + return result; + } + + @Override + public int distributedProcess(ResponseBuilder rb) throws IOException { + int result = ResponseBuilder.STAGE_DONE; + if (rb.stage == ResponseBuilder.STAGE_GET_FIELDS) { + //Go ask each shard for it's vectors + // for each shard, collect the documents for that shard. + HashMap> shardMap = new HashMap>(); + for (ShardDoc sdoc : rb.resultIds.values()) { + Collection shardDocs = shardMap.get(sdoc.shard); + if (shardDocs == null) { + shardDocs = new ArrayList(); + shardMap.put(sdoc.shard, shardDocs); + } + shardDocs.add(sdoc); + } + // Now create a request for each shard to retrieve the stored fields + for (Collection shardDocs : shardMap.values()) { + ShardRequest sreq = new ShardRequest(); + sreq.purpose = ShardRequest.PURPOSE_GET_FIELDS; + + sreq.shards = new String[]{shardDocs.iterator().next().shard}; + + sreq.params = new ModifiableSolrParams(); + + // add original params + sreq.params.add(rb.req.getParams()); + sreq.params.remove(CommonParams.Q);//remove the query + ArrayList ids = new ArrayList(shardDocs.size()); + for (ShardDoc shardDoc : shardDocs) { + ids.add(shardDoc.id.toString()); + } + sreq.params.add(TermVectorParams.DOC_IDS, StrUtils.join(ids, ',')); + + rb.addRequest(this, sreq); + } + result = ResponseBuilder.STAGE_DONE; + } + return result; + } + + private class TVMapper extends TermVectorMapper { + private NamedList docNL; + private IndexReader reader; + private Set fields; + private boolean termFreq, positions, offsets, idf, tfIdf; + //internal vars not passed in by construction + private boolean map, useOffsets, usePositions; + //private Map idfCache; + private NamedList fieldNL; + private Term currentTerm; + + public TVMapper(String[] fields, IndexReader reader, boolean termFreq, boolean positions, boolean offsets, boolean idf, boolean tfIdf) { + + this.reader = reader; + this.fields = fields != null ? new HashSet(Arrays.asList(fields)) : Collections.emptySet(); + this.termFreq = termFreq; + this.positions = positions; + this.offsets = offsets; + this.idf = idf; + this.tfIdf = tfIdf; + + } + + public void map(String term, int frequency, TermVectorOffsetInfo[] offsets, int[] positions) { + if (map == true && fieldNL != null) { + NamedList termInfo = new NamedList(); + fieldNL.add(term, termInfo); + if (termFreq == true) { + termInfo.add("freq", frequency); + } + if (useOffsets == true) { + NamedList theOffsets = new NamedList(); + termInfo.add("offsets", theOffsets); + for (int i = 0; i < offsets.length; i++) { + TermVectorOffsetInfo offset = offsets[i]; + theOffsets.add("start", offset.getStartOffset()); + theOffsets.add("end", offset.getEndOffset()); + } + } + if (usePositions == true) { + NamedList positionsNL = new NamedList(); + for (int i = 0; i < positions.length; i++) { + positionsNL.add("position", positions[i]); + } + termInfo.add("positions", positionsNL); + } + if (idf == true) { + termInfo.add("idf", getIdf(term)); + } + if (tfIdf == true){ + double tfIdfVal = ((double) frequency) / getIdf(term); + termInfo.add("tf-idf", tfIdfVal); + } + } + } + + private int getIdf(String term) { + int result = 1; + currentTerm = currentTerm.createTerm(term); + try { + TermEnum termEnum = reader.terms(currentTerm); + if (termEnum != null && termEnum.term().equals(currentTerm)) { + result = termEnum.docFreq(); + } + } catch (IOException e) { + throw new RuntimeException(e); + } + return result; + } + + public void setExpectations(String field, int numTerms, boolean storeOffsets, boolean storePositions) { + + if (idf == true && reader != null) { + this.currentTerm = new Term(field); + } + useOffsets = storeOffsets && offsets; + usePositions = storePositions && positions; + if (fields.isEmpty() || fields.contains(field)) { + map = true; + fieldNL = new NamedList(); + docNL.add(field, fieldNL); + } else { + map = false; + fieldNL = null; + } + } + } + + public void prepare(ResponseBuilder rb) throws IOException { + + } + + //////////////////////// NamedListInitializedPlugin methods ////////////////////// + @Override + public void init(NamedList args) { + super.init(args); + this.initParams = args; + } + + public void inform(SolrCore core) { + + } + + public String getVersion() { + return "$Revision$"; + } + + public String getSourceId() { + return "$Id:$"; + } + + public String getSource() { + return "$Revision:$"; + } + + public String getDescription() { + return "A Component for working with Term Vectors"; + } +} diff --git a/src/test/org/apache/solr/handler/component/TermVectorComponentTest.java b/src/test/org/apache/solr/handler/component/TermVectorComponentTest.java new file mode 100644 index 00000000000..469b3ec7082 --- /dev/null +++ b/src/test/org/apache/solr/handler/component/TermVectorComponentTest.java @@ -0,0 +1,211 @@ +package org.apache.solr.handler.component; + +import org.apache.solr.util.AbstractSolrTestCase; +import org.apache.solr.core.SolrCore; +import org.apache.solr.common.params.ModifiableSolrParams; +import org.apache.solr.common.params.CommonParams; +import org.apache.solr.common.params.TermVectorParams; +import org.apache.solr.common.util.NamedList; +import org.apache.solr.common.util.SimpleOrderedMap; +import org.apache.solr.request.SolrRequestHandler; +import org.apache.solr.request.SolrQueryResponse; +import org.apache.solr.request.LocalSolrQueryRequest; + +import java.util.HashMap; +import java.util.ArrayList; +import java.util.Arrays; +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +/** + * + * + **/ +public class TermVectorComponentTest extends AbstractSolrTestCase { + @Override + public String getSchemaFile() { + return "schema.xml"; + } + + @Override + public String getSolrConfigFile() { + return "solrconfig.xml"; + } + + @Override + public void setUp() throws Exception { + super.setUp(); + assertU(adoc("id", "0", "test_posofftv", "This is a title and another title")); + assertU(adoc("id", "1", "test_posofftv", + "The quick reb fox jumped over the lazy brown dogs.")); + assertU(adoc("id", "2", "test_posofftv", "This is a document")); + assertU(adoc("id", "3", "test_posofftv", "another document")); + //bunch of docs that are variants on blue + assertU(adoc("id", "4", "test_posofftv", "blue")); + assertU(adoc("id", "5", "test_posofftv", "blud")); + assertU(adoc("id", "6", "test_posofftv", "boue")); + assertU(adoc("id", "7", "test_posofftv", "glue")); + assertU(adoc("id", "8", "test_posofftv", "blee")); + assertU(adoc("id", "9", "test_posofftv", "blah")); + + assertU("commit", commit()); + } + + public void testBasics() throws Exception { + SolrCore core = h.getCore(); + SearchComponent tvComp = core.getSearchComponent("tvComponent"); + assertTrue("tvComp is null and it shouldn't be", tvComp != null); + ModifiableSolrParams params = new ModifiableSolrParams(); + params.add(CommonParams.Q, "id:0"); + params.add(CommonParams.QT, "tvrh"); + params.add(TermVectorParams.TF, "true"); + params.add(TermVectorComponent.COMPONENT_NAME, "true"); + SolrRequestHandler handler = core.getRequestHandler("tvrh"); + SolrQueryResponse rsp; + rsp = new SolrQueryResponse(); + rsp.add("responseHeader", new SimpleOrderedMap()); + handler.handleRequest(new LocalSolrQueryRequest(core, params), rsp); + NamedList values = rsp.getValues(); + NamedList termVectors = (NamedList) values.get(TermVectorComponent.TERM_VECTORS); + assertTrue("termVectors is null and it shouldn't be", termVectors != null); + System.out.println("TVs:" + termVectors); + NamedList doc = (NamedList) termVectors.getVal(0); + assertTrue("doc is null and it shouldn't be", doc != null); + assertTrue(doc.size() + " does not equal: " + 2, doc.size() == 2); + NamedList field = (NamedList) doc.get("test_posofftv"); + assertTrue("field is null and it shouldn't be", field != null); + assertTrue(field.size() + " does not equal: " + 2, field.size() == 2); + NamedList titl = (NamedList) field.get("titl"); + assertTrue("titl is null and it shouldn't be", titl != null); + assertTrue(titl.get("freq") + " does not equal: " + 2, ((Integer) titl.get("freq")) == 2); + + String uniqueKeyFieldName = (String) termVectors.getVal(1); + assertTrue("uniqueKeyFieldName is null and it shouldn't be", uniqueKeyFieldName != null); + assertTrue(uniqueKeyFieldName + " is not equal to " + "id", uniqueKeyFieldName.equals("id") == true); + + } + + public void testOptions() throws Exception { + SolrCore core = h.getCore(); + SearchComponent tvComp = core.getSearchComponent("tvComponent"); + assertTrue("tvComp is null and it shouldn't be", tvComp != null); + ModifiableSolrParams params = new ModifiableSolrParams(); + params.add(CommonParams.Q, "id:0"); + params.add(CommonParams.QT, "tvrh"); + params.add(TermVectorParams.TF, "true"); + params.add(TermVectorParams.IDF, "true"); + params.add(TermVectorParams.OFFSETS, "true"); + params.add(TermVectorParams.POSITIONS, "true"); + params.add(TermVectorParams.TF_IDF, "true"); + params.add(TermVectorComponent.COMPONENT_NAME, "true"); + + SolrRequestHandler handler = core.getRequestHandler("tvrh"); + SolrQueryResponse rsp; + rsp = new SolrQueryResponse(); + rsp.add("responseHeader", new SimpleOrderedMap()); + handler.handleRequest(new LocalSolrQueryRequest(core, params), rsp); + NamedList values = rsp.getValues(); + NamedList termVectors = (NamedList) values.get(TermVectorComponent.TERM_VECTORS); + assertTrue("termVectors is null and it shouldn't be", termVectors != null); + System.out.println("TVs: " + termVectors); + NamedList doc = (NamedList) termVectors.getVal(0); + assertTrue("doc is null and it shouldn't be", doc != null); + assertTrue(doc.size() + " does not equal: " + 2, doc.size() == 2); + } + + + public void testNoFields() throws Exception { + SolrCore core = h.getCore(); + SearchComponent tvComp = core.getSearchComponent("tvComponent"); + assertTrue("tvComp is null and it shouldn't be", tvComp != null); + ModifiableSolrParams params = new ModifiableSolrParams(); + params.add(CommonParams.Q, "id:0"); + params.add(CommonParams.QT, "tvrh"); + params.add(TermVectorParams.TF, "true"); + //Pass in a field that doesn't exist on the doc, thus, no vectors should be returned + params.add(TermVectorParams.FIELDS, "foo"); + params.add(TermVectorComponent.COMPONENT_NAME, "true"); + SolrRequestHandler handler = core.getRequestHandler("tvrh"); + SolrQueryResponse rsp; + rsp = new SolrQueryResponse(); + rsp.add("responseHeader", new SimpleOrderedMap()); + handler.handleRequest(new LocalSolrQueryRequest(core, params), rsp); + NamedList values = rsp.getValues(); + NamedList termVectors = (NamedList) values.get(TermVectorComponent.TERM_VECTORS); + assertTrue("termVectors is null and it shouldn't be", termVectors != null); + NamedList doc = (NamedList) termVectors.getVal(0); + assertTrue("doc is null and it shouldn't be", doc != null); + assertTrue(doc.size() + " does not equal: " + 1, doc.size() == 1); + } + + public void testDistributed() throws Exception { + SolrCore core = h.getCore(); + TermVectorComponent tvComp = (TermVectorComponent) core.getSearchComponent("tvComponent"); + assertTrue("tvComp is null and it shouldn't be", tvComp != null); + ModifiableSolrParams params = new ModifiableSolrParams(); + ResponseBuilder rb = new ResponseBuilder(); + rb.stage = ResponseBuilder.STAGE_GET_FIELDS; + rb.shards = new String[]{"localhost:0", "localhost:1", "localhost:2", "localhost:3"};//we don't actually call these, since we are going to invoke distributedProcess directly + rb.resultIds = new HashMap(); + rb.components = new ArrayList(); + rb.components.add(tvComp); + params.add(CommonParams.Q, "id:0"); + params.add(CommonParams.QT, "tvrh"); + params.add(TermVectorParams.TF, "true"); + params.add(TermVectorParams.IDF, "true"); + params.add(TermVectorParams.OFFSETS, "true"); + params.add(TermVectorParams.POSITIONS, "true"); + params.add(TermVectorComponent.COMPONENT_NAME, "true"); + rb.req = new LocalSolrQueryRequest(core, params); + rb.outgoing = new ArrayList(); + //one doc per shard, but make sure there are enough docs to go around + for (int i = 0; i < rb.shards.length; i++){ + ShardDoc doc = new ShardDoc(); + doc.id = i; //must be a valid doc that was indexed. + doc.score = 1 - (i / (float)rb.shards.length); + doc.positionInResponse = i; + doc.shard = rb.shards[i]; + doc.orderInShard = 0; + rb.resultIds.put(doc.id, doc); + } + + int result = tvComp.distributedProcess(rb); + assertTrue(result + " does not equal: " + ResponseBuilder.STAGE_DONE, result == ResponseBuilder.STAGE_DONE); + //one outgoing per shard + assertTrue("rb.outgoing Size: " + rb.outgoing.size() + " is not: " + rb.shards.length, rb.outgoing.size() == rb.shards.length); + for (ShardRequest request : rb.outgoing) { + ModifiableSolrParams solrParams = request.params; + System.out.println("Shard: " + Arrays.asList(request.shards) + " Params: " + solrParams); + } + } + +} + + + + + +/* +* + + + + +* +* */ diff --git a/src/test/test-files/solr/conf/solrconfig.xml b/src/test/test-files/solr/conf/solrconfig.xml index 8f6a1e3dea0..bfed2ca4130 100644 --- a/src/test/test-files/solr/conf/solrconfig.xml +++ b/src/test/test-files/solr/conf/solrconfig.xml @@ -369,6 +369,18 @@ + + + + + + + + + tvComponent + + +