SOLR-1556: added per field capabilities to TermVectorComponent, also error msgs

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@960204 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Grant Ingersoll 2010-07-03 11:44:27 +00:00
parent f8056de0b0
commit 5ba4331613
3 changed files with 334 additions and 91 deletions

View File

@ -188,6 +188,12 @@ New Features
* SOLR-1974: Add LimitTokenCountFilterFactory. (koji)
* SOLR-1966: QueryElevationComponent can now return just the included results in the elevation file (gsingers, yonik)
* SOLR-1556: TermVectorComponent now supports per field overrides. Also, it now throws an error
if passed in fields do not exist, and it returns warnings
for fields whose requested term vector options (termVectors, offsets, positions)
do not align with the schema declaration. (gsingers)
Optimizations
----------------------

View File

@ -1,14 +1,16 @@
package org.apache.solr.handler.component;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Fieldable;
import org.apache.lucene.document.SetBasedFieldSelector;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.MultiFields;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermVectorMapper;
import org.apache.lucene.index.TermVectorOffsetInfo;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.BytesRef;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.params.CommonParams;
@ -19,9 +21,11 @@ import org.apache.solr.common.util.NamedList;
import org.apache.solr.common.util.StrUtils;
import org.apache.solr.core.SolrCore;
import org.apache.solr.schema.IndexSchema;
import org.apache.solr.schema.SchemaField;
import org.apache.solr.search.DocList;
import org.apache.solr.search.DocListAndSet;
import org.apache.solr.search.SolrIndexSearcher;
import org.apache.solr.util.SolrPluginUtils;
import org.apache.solr.util.plugin.SolrCoreAware;
import java.io.IOException;
@ -33,6 +37,7 @@ import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
/**
@ -78,27 +83,90 @@ public class TermVectorComponent extends SearchComponent implements SolrCoreAwar
NamedList termVectors = new NamedList();
rb.rsp.add(TERM_VECTORS, termVectors);
FieldOptions allFields = new FieldOptions();
//figure out what options we have, and try to get the appropriate vector
boolean termFreq = params.getBool(TermVectorParams.TF, false);
boolean positions = params.getBool(TermVectorParams.POSITIONS, false);
boolean offsets = params.getBool(TermVectorParams.OFFSETS, false);
boolean docFreq = params.getBool(TermVectorParams.DF, false);
boolean tfIdf = params.getBool(TermVectorParams.TF_IDF, false);
allFields.termFreq = params.getBool(TermVectorParams.TF, false);
allFields.positions = params.getBool(TermVectorParams.POSITIONS, false);
allFields.offsets = params.getBool(TermVectorParams.OFFSETS, false);
allFields.docFreq = params.getBool(TermVectorParams.DF, false);
allFields.tfIdf = params.getBool(TermVectorParams.TF_IDF, false);
//boolean cacheIdf = params.getBool(TermVectorParams.IDF, false);
//short cut to all values.
boolean all = params.getBool(TermVectorParams.ALL, false);
if (all == true){
termFreq = true;
positions = true;
offsets = true;
docFreq = true;
tfIdf = true;
if (all == true) {
allFields.termFreq = true;
allFields.positions = true;
allFields.offsets = true;
allFields.docFreq = true;
allFields.tfIdf = true;
}
String[] fields = params.getParams(TermVectorParams.FIELDS);
if (fields == null) {
fields = params.getParams(CommonParams.FL);
String fldLst = params.get(TermVectorParams.FIELDS);
if (fldLst == null) {
fldLst = params.get(CommonParams.FL);
}
//use this to validate our fields
IndexSchema schema = rb.req.getSchema();
//Build up our per field mapping
Map<String, FieldOptions> fieldOptions = new HashMap<String, FieldOptions>();
NamedList warnings = new NamedList();
List<String> noTV = new ArrayList<String>();
List<String> noPos = new ArrayList<String>();
List<String> noOff = new ArrayList<String>();
//we have specific fields to retrieve
if (fldLst != null) {
String [] fields = SolrPluginUtils.split(fldLst);
for (String field : fields) {
SchemaField sf = schema.getFieldOrNull(field);
if (sf != null) {
if (sf.storeTermVector()) {
FieldOptions option = fieldOptions.get(field);
if (option == null) {
option = new FieldOptions();
option.fieldName = field;
fieldOptions.put(field, option);
}
//get the per field mappings
option.termFreq = params.getFieldBool(field, TermVectorParams.TF, allFields.termFreq);
option.docFreq = params.getFieldBool(field, TermVectorParams.DF, allFields.docFreq);
option.tfIdf = params.getFieldBool(field, TermVectorParams.TF_IDF, allFields.tfIdf);
//Validate these are even an option
option.positions = params.getFieldBool(field, TermVectorParams.POSITIONS, allFields.positions);
if (option.positions == true && sf.storeTermPositions() == false){
noPos.add(field);
}
option.offsets = params.getFieldBool(field, TermVectorParams.OFFSETS, allFields.offsets);
if (option.offsets == true && sf.storeTermOffsets() == false){
noOff.add(field);
}
} else {//field doesn't have term vectors
noTV.add(field);
}
} else {
//field doesn't exist
throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "undefined field: " + field);
}
}
} //else, deal with all fields
boolean hasWarnings = false;
if (noTV.isEmpty() == false) {
warnings.add("noTermVectors", noTV);
hasWarnings = true;
}
if (noPos.isEmpty() == false) {
warnings.add("noPositions", noPos);
hasWarnings = true;
}
if (noOff.isEmpty() == false) {
warnings.add("noOffsets", noOff);
hasWarnings = true;
}
if (hasWarnings == true) {
termVectors.add("warnings", warnings);
}
DocListAndSet listAndSet = rb.getResults();
List<Integer> docIds = getInts(params.getParams(TermVectorParams.DOC_IDS));
Iterator<Integer> iter;
@ -112,22 +180,43 @@ public class TermVectorComponent extends SearchComponent implements SolrCoreAwar
IndexReader reader = searcher.getReader();
//the TVMapper is a TermVectorMapper which can be used to optimize loading of Term Vectors
TVMapper mapper = new TVMapper(fields, reader, termFreq, positions, offsets, docFreq, tfIdf);
IndexSchema schema = rb.req.getSchema();
String uniqFieldName = schema.getUniqueKeyField().getName();
//Only load the id field
SchemaField keyField = schema.getUniqueKeyField();
String uniqFieldName = null;
if (keyField != null) {
uniqFieldName = keyField.getName();
}
//Only load the id field to get the uniqueKey of that field
SetBasedFieldSelector fieldSelector = new SetBasedFieldSelector(Collections.singleton(uniqFieldName), Collections.<String>emptySet());
TVMapper mapper = new TVMapper(reader);
mapper.fieldOptions = allFields; //this will only stay set if fieldOptions.isEmpty() (in other words, only if the user didn't set any fields)
while (iter.hasNext()) {
Integer docId = iter.next();
NamedList docNL = new NamedList();
termVectors.add("doc-" + docId, docNL);
mapper.docNL = docNL;
Document document = reader.document(docId, fieldSelector);
String uniqId = document.get(uniqFieldName);
docNL.add("uniqueKey", uniqId);
reader.getTermFreqVector(docId, mapper);
termVectors.add("doc-" + docId, docNL);
if (keyField != null) {
Document document = reader.document(docId, fieldSelector);
Fieldable uniqId = document.getField(uniqFieldName);
String uniqVal = null;
if (uniqId != null) {
uniqVal = keyField.getType().storedToReadable(uniqId);
}
if (uniqVal != null) {
docNL.add("uniqueKey", uniqVal);
termVectors.add("uniqueKeyFieldName", uniqFieldName);
}
}
if (fieldOptions.isEmpty() == false) {
for (Map.Entry<String, FieldOptions> entry : fieldOptions.entrySet()) {
mapper.fieldOptions = entry.getValue();
reader.getTermFreqVector(docId, entry.getKey(), mapper);
}
} else {
//deal with all fields by using the allFieldMapper
reader.getTermFreqVector(docId, mapper);
}
}
termVectors.add("uniqueKeyFieldName", uniqFieldName);
}
private List<Integer> getInts(String[] vals) {
@ -186,43 +275,27 @@ public class TermVectorComponent extends SearchComponent implements SolrCoreAwar
}
private static class TVMapper extends TermVectorMapper {
private NamedList docNL;
private IndexReader reader;
private Set<String> fields;
private boolean termFreq, positions, offsets, docFreq, tfIdf;
private NamedList docNL;
//needs to be set for each new field
FieldOptions fieldOptions;
//internal vars not passed in by construction
private boolean map, useOffsets, usePositions;
private boolean useOffsets, usePositions;
//private Map<String, Integer> idfCache;
private NamedList fieldNL;
private Term currentTerm;
/**
*
* @param fields
* @param reader
* @param termFreq
* @param positions true if the TVM should try to get position info from the Term Vector, assuming it is present
* @param offsets true if the TVM should try to get offset info from the Term Vector, assuming it is present
* @param docFreq
* @param tfIdf
*/
public TVMapper(String[] fields, IndexReader reader, boolean termFreq, boolean positions, boolean offsets, boolean docFreq, boolean tfIdf) {
public TVMapper(IndexReader reader) {
this.reader = reader;
this.fields = fields != null ? new HashSet<String>(Arrays.asList(fields)) : Collections.<String>emptySet();
this.termFreq = termFreq;
this.positions = positions;
this.offsets = offsets;
this.docFreq = docFreq;
this.tfIdf = tfIdf;
}
public void map(String term, int frequency, TermVectorOffsetInfo[] offsets, int[] positions) {
if (map == true && fieldNL != null) {
NamedList termInfo = new NamedList();
NamedList termInfo = new NamedList();
fieldNL.add(term, termInfo);
if (termFreq == true) {
if (fieldOptions.termFreq == true) {
termInfo.add("tf", frequency);
}
if (useOffsets == true) {
@ -237,18 +310,17 @@ public class TermVectorComponent extends SearchComponent implements SolrCoreAwar
if (usePositions == true) {
NamedList positionsNL = new NamedList();
for (int i = 0; i < positions.length; i++) {
positionsNL.add("position", positions[i]);
positionsNL.add("position", positions[i]);
}
termInfo.add("positions", positionsNL);
}
if (docFreq == true) {
if (fieldOptions.docFreq == true) {
termInfo.add("df", getDocFreq(term));
}
if (tfIdf == true){
if (fieldOptions.tfIdf == true) {
double tfIdfVal = ((double) frequency) / getDocFreq(term);
termInfo.add("tf-idf", tfIdfVal);
}
}
}
private int getDocFreq(String term) {
@ -270,29 +342,23 @@ public class TermVectorComponent extends SearchComponent implements SolrCoreAwar
public void setExpectations(String field, int numTerms, boolean storeOffsets, boolean storePositions) {
if (docFreq == true && reader != null) {
if (fieldOptions.docFreq == true && reader != null) {
this.currentTerm = new Term(field);
}
useOffsets = storeOffsets && offsets;
usePositions = storePositions && positions;
if (fields.isEmpty() || fields.contains(field)) {
map = true;
fieldNL = new NamedList();
docNL.add(field, fieldNL);
} else {
map = false;
fieldNL = null;
}
useOffsets = storeOffsets && fieldOptions.offsets;
usePositions = storePositions && fieldOptions.positions;
fieldNL = new NamedList();
docNL.add(field, fieldNL);
}
@Override
public boolean isIgnoringPositions() {
return this.positions == false; // if we are not interested in positions, then return true telling Lucene to skip loading them
return fieldOptions.positions == false; // if we are not interested in positions, then return true telling Lucene to skip loading them
}
@Override
public boolean isIgnoringOffsets() {
return this.offsets == false; // if we are not interested in offsets, then return true telling Lucene to skip loading them
return fieldOptions.offsets == false; // if we are not interested in offsets, then return true telling Lucene to skip loading them
}
}
@ -301,6 +367,7 @@ public class TermVectorComponent extends SearchComponent implements SolrCoreAwar
}
//////////////////////// NamedListInitializedPlugin methods //////////////////////
@Override
public void init(NamedList args) {
super.init(args);
@ -327,3 +394,8 @@ public class TermVectorComponent extends SearchComponent implements SolrCoreAwar
return "A Component for working with Term Vectors";
}
}
/**
 * Mutable bag of term vector output switches, used either for one named field
 * or (with {@code fieldName} left unset) as the request-wide defaults.
 */
class FieldOptions {
  /** Field these options belong to; stays {@code null} unless a caller assigns it. */
  String fieldName;

  /** When set, the per-term "tf" entry is emitted. */
  boolean termFreq;

  /** When set, positions are requested from the term vector. */
  boolean positions;

  /** When set, offsets are requested from the term vector. */
  boolean offsets;

  /** When set, the per-term "df" entry is emitted. */
  boolean docFreq;

  /** When set, the per-term "tf-idf" entry is emitted. */
  boolean tfIdf;
}

View File

@ -1,7 +1,6 @@
package org.apache.solr.handler.component;
import org.apache.solr.SolrTestCaseJ4;
import org.apache.solr.util.AbstractSolrTestCase;
import org.apache.solr.core.SolrCore;
import org.apache.solr.common.params.ModifiableSolrParams;
import org.apache.solr.common.params.CommonParams;
@ -19,6 +18,7 @@ import static org.junit.Assert.*;
import java.util.HashMap;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
@ -46,18 +46,77 @@ public class TermVectorComponentTest extends SolrTestCaseJ4 {
public static void beforeClass() throws Exception {
initCore("solrconfig.xml","schema.xml");
assertNull(h.validateUpdate(adoc("id", "0", "test_posofftv", "This is a title and another title")));
assertNull(h.validateUpdate(adoc("id", "1", "test_posofftv",
"The quick reb fox jumped over the lazy brown dogs.")));
assertNull(h.validateUpdate(adoc("id", "2", "test_posofftv", "This is a document")));
assertNull(h.validateUpdate(adoc("id", "3", "test_posofftv", "another document")));
assertNull(h.validateUpdate(adoc("id", "0",
"test_posofftv", "This is a title and another title",
"test_basictv", "This is a title and another title",
"test_notv", "This is a title and another title",
"test_postv", "This is a title and another title",
"test_offtv", "This is a title and another title"
)));
assertNull(h.validateUpdate(adoc("id", "1",
"test_posofftv", "The quick reb fox jumped over the lazy brown dogs.",
"test_basictv", "The quick reb fox jumped over the lazy brown dogs.",
"test_notv", "The quick reb fox jumped over the lazy brown dogs.",
"test_postv", "The quick reb fox jumped over the lazy brown dogs.",
"test_offtv", "The quick reb fox jumped over the lazy brown dogs."
)));
assertNull(h.validateUpdate(adoc("id", "2",
"test_posofftv", "This is a document",
"test_basictv", "This is a document",
"test_notv", "This is a document",
"test_postv", "This is a document",
"test_offtv", "This is a document"
)));
assertNull(h.validateUpdate(adoc("id", "3",
"test_posofftv", "another document",
"test_basictv", "another document",
"test_notv", "another document",
"test_postv", "another document",
"test_offtv", "another document"
)));
//bunch of docs that are variants on blue
assertNull(h.validateUpdate(adoc("id", "4", "test_posofftv", "blue")));
assertNull(h.validateUpdate(adoc("id", "5", "test_posofftv", "blud")));
assertNull(h.validateUpdate(adoc("id", "6", "test_posofftv", "boue")));
assertNull(h.validateUpdate(adoc("id", "7", "test_posofftv", "glue")));
assertNull(h.validateUpdate(adoc("id", "8", "test_posofftv", "blee")));
assertNull(h.validateUpdate(adoc("id", "9", "test_posofftv", "blah")));
assertNull(h.validateUpdate(adoc("id", "4",
"test_posofftv", "blue",
"test_basictv", "blue",
"test_notv", "blue",
"test_postv", "blue",
"test_offtv", "blue"
)));
assertNull(h.validateUpdate(adoc("id", "5",
"test_posofftv", "blud",
"test_basictv", "blud",
"test_notv", "blud",
"test_postv", "blud",
"test_offtv", "blud"
)));
assertNull(h.validateUpdate(adoc("id", "6",
"test_posofftv", "boue",
"test_basictv", "boue",
"test_notv", "boue",
"test_postv", "boue",
"test_offtv", "boue"
)));
assertNull(h.validateUpdate(adoc("id", "7",
"test_posofftv", "glue",
"test_basictv", "glue",
"test_notv", "glue",
"test_postv", "glue",
"test_offtv", "glue"
)));
assertNull(h.validateUpdate(adoc("id", "8",
"test_posofftv", "blee",
"test_basictv", "blee",
"test_notv", "blee",
"test_postv", "blee",
"test_offtv", "blee"
)));
assertNull(h.validateUpdate(adoc("id", "9",
"test_posofftv", "blah",
"test_basictv", "blah",
"test_notv", "blah",
"test_postv", "blah",
"test_offtv", "blah"
)));
assertNull(h.validateUpdate(commit()));
}
@ -80,10 +139,10 @@ public class TermVectorComponentTest extends SolrTestCaseJ4 {
NamedList values = rsp.getValues();
NamedList termVectors = (NamedList) values.get(TermVectorComponent.TERM_VECTORS);
assertTrue("termVectors is null and it shouldn't be", termVectors != null);
// System.out.println("TVs:" + termVectors);
System.out.println("TVs:" + termVectors);
NamedList doc = (NamedList) termVectors.getVal(0);
assertTrue("doc is null and it shouldn't be", doc != null);
assertTrue(doc.size() + " does not equal: " + 2, doc.size() == 2);
assertEquals(doc.size(), 5);
NamedList field = (NamedList) doc.get("test_posofftv");
assertTrue("field is null and it shouldn't be", field != null);
assertTrue(field.size() + " does not equal: " + 2, field.size() == 2);
@ -127,7 +186,7 @@ public class TermVectorComponentTest extends SolrTestCaseJ4 {
// System.out.println("TVs: " + termVectors);
NamedList doc = (NamedList) termVectors.getVal(0);
assertTrue("doc is null and it shouldn't be", doc != null);
assertTrue(doc.size() + " does not equal: " + 2, doc.size() == 2);
assertEquals(doc.size(), 5);
NamedList offtv = (NamedList) doc.get("test_posofftv");
assertTrue("offtv is null and it shouldn't be", offtv != null);
assertTrue("offtv Size: " + offtv.size() + " is not: " + 2, offtv.size() == 2);
@ -144,8 +203,114 @@ public class TermVectorComponentTest extends SolrTestCaseJ4 {
Double tfIdf = (Double) another.get("tf-idf");
assertTrue("tfIdf is null and it shouldn't be", tfIdf != null);
assertTrue(tfIdf + " does not equal: " + 0.5, tfIdf == 0.5);
}
/*
<field name="test_basictv" type="text" termVectors="true"/>
<field name="test_notv" type="text" termVectors="false"/>
<field name="test_postv" type="text" termVectors="true" termPositions="true"/>
<field name="test_offtv" type="text" termVectors="true" termOffsets="true"/>
<field name="test_posofftv" type="text" termVectors="true"
termPositions="true" termOffsets="true"/>
*/
/**
 * Verifies that per-field parameters (of the form {@code f.FIELD.PARAM}) override the
 * request-wide term vector settings, and that the component reports warnings for fields
 * whose requested options are not available.
 */
@Test
public void testPerField() throws Exception {
  SolrCore core = h.getCore();
  SearchComponent tvComp = core.getSearchComponent("tvComponent");
  assertTrue("tvComp is null and it shouldn't be", tvComp != null);

  ModifiableSolrParams params = new ModifiableSolrParams();
  params.add(CommonParams.Q, "id:0");
  params.add(CommonParams.QT, "tvrh");
  params.add(TermVectorParams.FIELDS, "test_basictv,test_notv,test_postv,test_offtv,test_posofftv");
  //request-wide settings: everything on
  params.add(TermVectorParams.TF, "true");
  params.add(TermVectorParams.DF, "true");
  params.add(TermVectorParams.OFFSETS, "true");
  params.add(TermVectorParams.POSITIONS, "true");
  params.add(TermVectorParams.TF_IDF, "true");
  params.add(TermVectorComponent.COMPONENT_NAME, "true");
  //per field overrides
  params.add("f.test_posofftv." + TermVectorParams.POSITIONS, "false");
  params.add("f.test_offtv." + TermVectorParams.OFFSETS, "false");
  params.add("f.test_basictv." + TermVectorParams.DF, "false");
  params.add("f.test_basictv." + TermVectorParams.TF, "false");
  params.add("f.test_basictv." + TermVectorParams.TF_IDF, "false");

  SolrRequestHandler handler = core.getRequestHandler("tvrh");
  SolrQueryResponse rsp = new SolrQueryResponse();
  rsp.add("responseHeader", new SimpleOrderedMap());
  handler.handleRequest(new LocalSolrQueryRequest(core, params), rsp);
  NamedList values = rsp.getValues();
  NamedList termVectors = (NamedList) values.get(TermVectorComponent.TERM_VECTORS);
  assertTrue("termVectors is null and it shouldn't be", termVectors != null);
  NamedList doc = (NamedList) termVectors.get("doc-0");
  assertTrue("doc is null and it shouldn't be", doc != null);
  assertEquals(5, doc.size());

  NamedList vec;
  NamedList another;
  NamedList offsets;
  NamedList pos;
  Integer df;
  Double val;

  vec = (NamedList) doc.get("test_posofftv");
  assertNotNull(vec);
  assertEquals(2, vec.size());
  another = (NamedList) vec.get("anoth");
  offsets = (NamedList) another.get("offsets");
  assertNotNull(offsets);
  assertTrue(offsets.size() > 0);
  pos = (NamedList) another.get("positions");
  //positions should be null, since we turned them off for this field
  assertNull(pos);
  df = (Integer) another.get("df");
  assertNotNull(df);
  assertEquals(2, df.intValue());
  val = (Double) another.get("tf-idf");
  assertTrue("tfIdf is null and it shouldn't be", val != null);
  assertTrue(val + " does not equal: " + 0.5, val == 0.5);

  //Try out the other fields, too
  vec = (NamedList) doc.get("test_offtv");
  assertNotNull(vec);
  assertEquals(2, vec.size());
  another = (NamedList) vec.get("anoth");
  offsets = (NamedList) another.get("offsets");
  //offsets should be null, since we turned them off for this field
  assertNull(offsets);
  pos = (NamedList) another.get("positions");
  //positions should be null, since this field is not declared with termPositions
  assertNull(vec.toString(), pos);
  df = (Integer) another.get("df");
  assertNotNull(df);
  assertEquals(2, df.intValue());
  val = (Double) another.get("tf-idf");
  assertTrue("tfIdf is null and it shouldn't be", val != null);
  assertTrue(val + " does not equal: " + 0.5, val == 0.5);

  vec = (NamedList) doc.get("test_basictv");
  assertNotNull(vec);
  assertEquals(2, vec.size());
  another = (NamedList) vec.get("anoth");
  offsets = (NamedList) another.get("offsets");
  assertNull(offsets);
  pos = (NamedList) another.get("positions");
  assertNull(pos);
  df = (Integer) another.get("df");
  assertNull(df);
  val = (Double) another.get("tf-idf");
  assertNull(val);
  //no cast here: a stray "tf" entry would not be a Double, and the cast
  //would throw ClassCastException instead of failing the assertion
  assertNull(another.get("tf"));

  //Now validate we have error messages
  NamedList warnings = (NamedList) termVectors.get("warnings");
  assertNotNull(warnings);
  List<String> theList;
  theList = (List<String>) warnings.get("noTermVectors");
  assertNotNull(theList);
  assertEquals(1, theList.size());
  theList = (List<String>) warnings.get("noPositions");
  assertNotNull(theList);
  assertEquals(2, theList.size());
  theList = (List<String>) warnings.get("noOffsets");
  assertNotNull(theList);
  assertEquals(2, theList.size());
}
@Test
@ -165,14 +330,14 @@ public class TermVectorComponentTest extends SolrTestCaseJ4 {
rsp = new SolrQueryResponse();
rsp.add("responseHeader", new SimpleOrderedMap());
handler.handleRequest(new LocalSolrQueryRequest(core, params), rsp);
NamedList values = rsp.getValues();
NamedList termVectors = (NamedList) values.get(TermVectorComponent.TERM_VECTORS);
assertTrue("termVectors is null and it shouldn't be", termVectors != null);
NamedList doc = (NamedList) termVectors.getVal(0);
assertTrue("doc is null and it shouldn't be", doc != null);
assertTrue(doc.size() + " does not equal: " + 1, doc.size() == 1);
Exception exception = rsp.getException();
assertNotNull(exception);
}
@Test
public void testDistributed() throws Exception {
SolrCore core = h.getCore();