SOLR-11770: NPE in tvrh if no field is specified and document doesn't contain any fields with term vectors

This commit is contained in:
Erick Erickson 2018-08-06 20:04:59 -07:00
parent 896fd0ebd5
commit 96e8392921
7 changed files with 304 additions and 217 deletions

View File

@ -212,6 +212,8 @@ Bug Fixes
This would fix the NPE when using the search stream with partitionKeys. (Varun Thacker)
* SOLR-11770: NPE in tvrh if no field is specified and document doesn't contain any fields with term vectors
Optimizations
----------------------

View File

@ -17,7 +17,6 @@
package org.apache.solr.handler.component;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
@ -28,15 +27,16 @@ import java.util.List;
import java.util.Map;
import java.util.Set;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.StoredField;
import org.apache.lucene.index.Fields;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.StoredFieldVisitor;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.BytesRef;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.params.CommonParams;
import org.apache.solr.common.params.SolrParams;
@ -44,11 +44,14 @@ import org.apache.solr.common.params.TermVectorParams;
import org.apache.solr.common.util.Base64;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.core.SolrCore;
import org.apache.solr.response.DocsStreamer;
import org.apache.solr.response.RetrieveFieldsOptimizer;
import org.apache.solr.schema.IndexSchema;
import org.apache.solr.schema.SchemaField;
import org.apache.solr.search.DocList;
import org.apache.solr.search.DocListAndSet;
import org.apache.solr.search.ReturnFields;
import org.apache.solr.search.SolrDocumentFetcher;
import org.apache.solr.search.SolrIndexSearcher;
import org.apache.solr.search.SolrReturnFields;
import org.apache.solr.util.SolrPluginUtils;
@ -262,54 +265,49 @@ public class TermVectorComponent extends SearchComponent implements SolrCoreAwar
//Only load the id field to get the uniqueKey of that
//field
final String finalUniqFieldName = uniqFieldName;
final List<String> uniqValues = new ArrayList<>();
// TODO: is this required to be single-valued? if so, we should STOP
// once we find it...
final StoredFieldVisitor getUniqValue = new StoredFieldVisitor() {
@Override
public void stringField(FieldInfo fieldInfo, byte[] bytes) {
uniqValues.add(new String(bytes, StandardCharsets.UTF_8));
}
@Override
public void intField(FieldInfo fieldInfo, int value) {
uniqValues.add(Integer.toString(value));
}
@Override
public void longField(FieldInfo fieldInfo, long value) {
uniqValues.add(Long.toString(value));
}
@Override
public Status needsField(FieldInfo fieldInfo) {
return (fieldInfo.name.equals(finalUniqFieldName)) ? Status.YES : Status.NO;
}
};
SolrDocumentFetcher docFetcher = searcher.getDocFetcher();
SolrReturnFields srf = new SolrReturnFields(uniqFieldName, rb.req);
RetrieveFieldsOptimizer retrieveFieldsOptimizer = RetrieveFieldsOptimizer.create(docFetcher, srf);
while (iter.hasNext()) {
Integer docId = iter.next();
NamedList<Object> docNL = new NamedList<>();
if (keyField != null) {
reader.document(docId, getUniqValue);
String uniqVal = null;
if (uniqValues.size() != 0) {
uniqVal = uniqValues.get(0);
uniqValues.clear();
SolrDocument sdoc = null;
try {
if (retrieveFieldsOptimizer.returnStoredFields()) {
Document doc = docFetcher.doc(docId, retrieveFieldsOptimizer.getStoredFields());
// make sure to use the schema from the searcher and not the request (cross-core)
sdoc = DocsStreamer.convertLuceneDocToSolrDoc(doc, searcher.getSchema(), srf);
} else {
// no need to get stored fields of the document, see SOLR-5968
sdoc = new SolrDocument();
}
// decorate the document with non-stored docValues fields
if (retrieveFieldsOptimizer.returnDVFields()) {
docFetcher.decorateDocValueFields(sdoc, docId, retrieveFieldsOptimizer.getDvFields());
}
} catch (IOException e) {
throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Error reading document with docId " + docId, e);
}
Object val = sdoc.getFieldValue(uniqFieldName);
String uniqVal = "";
if (val instanceof StoredField) {
uniqVal = ((StoredField) val).stringValue();
} else {
uniqVal = val.toString();
}
docNL.add("uniqueKey", uniqVal);
termVectors.add(uniqVal, docNL);
}
} else {
// support for schemas w/o a unique key,
termVectors.add("doc-" + docId, docNL);
}
if ( null != fields ) {
if (null != fields) {
for (Map.Entry<String, FieldOptions> entry : fieldOptions.entrySet()) {
final String field = entry.getKey();
final Terms vector = reader.getTermVector(docId, field);
@ -321,6 +319,8 @@ public class TermVectorComponent extends SearchComponent implements SolrCoreAwar
} else {
// extract all fields
final Fields vectors = reader.getTermVectors(docId);
// There can be no documents with vectors
if (vectors != null) {
for (String field : vectors) {
Terms terms = vectors.terms(field);
if (terms != null) {
@ -331,6 +331,7 @@ public class TermVectorComponent extends SearchComponent implements SolrCoreAwar
}
}
}
}
private void mapOneVector(NamedList<Object> docNL, FieldOptions fieldOptions, IndexReader reader, int docID, TermsEnum termsEnum, String field) throws IOException {
NamedList<Object> fieldNL = new NamedList<>();

View File

@ -53,19 +53,19 @@ public class RetrieveFieldsOptimizer {
storedFields.clear();
}
boolean returnStoredFields() {
public boolean returnStoredFields() {
return !(storedFields != null && storedFields.isEmpty());
}
boolean returnDVFields() {
public boolean returnDVFields() {
return !dvFields.isEmpty();
}
Set<String> getStoredFields() {
public Set<String> getStoredFields() {
return storedFields;
}
Set<String> getDvFields() {
public Set<String> getDvFields() {
return dvFields;
}

View File

@ -73,6 +73,8 @@ public abstract class BaseEditorialTransformer extends DocTransformer {
ft.readableToIndexed(f.stringValue(), bytesRefBuilder);
}
return bytesRefBuilder.get();
} else if (obj instanceof String) { // Allows the idField to be stored=false, docValues=true
return new BytesRef(((String)obj));
}
throw new AssertionError("Expected an IndexableField but got: " + obj.getClass());
}

View File

@ -502,7 +502,7 @@
</fieldType>
<fieldType name="severityType" class="${solr.tests.EnumFieldType}" enumsConfig="enumsConfig.xml" enumName="severity"/>
<field name="id" type="string" indexed="true" stored="true" multiValued="false" required="false"/>
<field name="id" type="string" indexed="true" stored="${solr.tests.id.stored:true}" multiValued="false" required="false" docValues="${solr.tests.id.docValues:false}"/>
<field name="_root_" type="string" indexed="true" stored="true" multiValued="false" required="false"/>
<field name="signatureField" type="string" indexed="true" stored="false"/>

View File

@ -23,6 +23,7 @@ import java.io.PrintWriter;
import java.lang.invoke.MethodHandles;
import java.nio.charset.StandardCharsets;
import com.carrotsearch.randomizedtesting.rules.SystemPropertiesRestoreRule;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.util.BytesRef;
import org.apache.solr.SolrTestCaseJ4;
@ -36,14 +37,43 @@ import org.apache.solr.request.SolrQueryRequest;
import org.apache.solr.search.SolrIndexSearcher;
import org.apache.solr.util.FileUtils;
import org.junit.Before;
import org.junit.BeforeClass;
import org.junit.Rule;
import org.junit.Test;
import org.junit.rules.RuleChain;
import org.junit.rules.TestRule;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
public class QueryElevationComponentTest extends SolrTestCaseJ4 {
@Rule
public TestRule solrTestRules = RuleChain.outerRule(new SystemPropertiesRestoreRule());
private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
@BeforeClass
public static void beforeClass() throws Exception {
switch (random().nextInt(3)) {
case 0:
System.setProperty("solr.tests.id.stored", "true");
System.setProperty("solr.tests.id.docValues", "true");
break;
case 1:
System.setProperty("solr.tests.id.stored", "true");
System.setProperty("solr.tests.id.docValues", "false");
break;
case 2:
System.setProperty("solr.tests.id.stored", "false");
System.setProperty("solr.tests.id.docValues", "true");
break;
default:
fail("Bad random number generatged not between 0-2 iunclusive");
break;
}
}
@Before
@Override
public void setUp() throws Exception {

View File

@ -20,19 +20,54 @@ import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import com.carrotsearch.randomizedtesting.rules.SystemPropertiesRestoreRule;
import org.apache.solr.SolrTestCaseJ4;
import org.apache.solr.common.params.TermVectorParams;
import org.junit.BeforeClass;
import org.junit.Rule;
import org.junit.Test;
import org.junit.rules.RuleChain;
import org.junit.rules.TestRule;
/**
*
*
**/
public class TermVectorComponentTest extends SolrTestCaseJ4 {
@Rule
public TestRule solrTestRules = RuleChain.outerRule(new SystemPropertiesRestoreRule());
// ensure that we operate correctly with all valid combinations of the uniqueKey being
// stored and/or in docValues.
@BeforeClass
public static void beforeClass() throws Exception {
initCore("solrconfig.xml","schema.xml");
switch (random().nextInt(3)) {
case 0:
System.setProperty("solr.tests.id.stored", "true");
System.setProperty("solr.tests.id.docValues", "true");
break;
case 1:
System.setProperty("solr.tests.id.stored", "true");
System.setProperty("solr.tests.id.docValues", "false");
break;
case 2:
System.setProperty("solr.tests.id.stored", "false");
System.setProperty("solr.tests.id.docValues", "true");
break;
default:
fail("Bad random number generatged not between 0-2 iunclusive");
break;
}
initCore("solrconfig.xml", "schema.xml");
}
static String tv = "/tvrh";
@Test
public void testCanned() throws Exception {
clearIndex();
assertU(adoc("id", "0",
"test_posoffpaytv", "This is a title and another title",
"test_posofftv", "This is a title and another title",
@ -116,14 +151,16 @@ public class TermVectorComponentTest extends SolrTestCaseJ4 {
));
assertNull(h.validateUpdate(commit()));
doBasics();
doOptions();
doPerField();
doPayloads();
}
static String tv = "/tvrh";
@Test
public void testBasics() throws Exception {
assertJQ(req("json.nl","map", "qt",tv, "q", "id:0", TermVectorComponent.COMPONENT_NAME, "true", TermVectorParams.TF, "true")
,"/termVectors=={'0':{'uniqueKey':'0'," +
private void doBasics() throws Exception {
assertJQ(req("json.nl", "map", "qt", tv, "q", "id:0", TermVectorComponent.COMPONENT_NAME, "true", TermVectorParams.TF, "true")
, "/termVectors=={'0':{'uniqueKey':'0'," +
" 'test_basictv':{'anoth':{'tf':1},'titl':{'tf':2}}," +
" 'test_offtv':{'anoth':{'tf':1},'titl':{'tf':2}}," +
" 'test_posofftv':{'anoth':{'tf':1},'titl':{'tf':2}}," +
@ -131,38 +168,38 @@ public class TermVectorComponentTest extends SolrTestCaseJ4 {
" 'test_postv':{'anoth':{'tf':1},'titl':{'tf':2}}}}"
);
// tv.fl diff from fl
assertJQ(req("json.nl","map",
"qt",tv,
assertJQ(req("json.nl", "map",
"qt", tv,
"q", "id:0",
"fl", "*,score",
"tv.fl", "test_basictv,test_offtv",
TermVectorComponent.COMPONENT_NAME, "true",
TermVectorParams.TF, "true")
,"/termVectors=={'0':{'uniqueKey':'0'," +
, "/termVectors=={'0':{'uniqueKey':'0'," +
" 'test_basictv':{'anoth':{'tf':1},'titl':{'tf':2}}," +
" 'test_offtv':{'anoth':{'tf':1},'titl':{'tf':2}}}}"
);
// multi-valued tv.fl
assertJQ(req("json.nl","map",
"qt",tv,
assertJQ(req("json.nl", "map",
"qt", tv,
"q", "id:0",
"fl", "*,score",
"tv.fl", "test_basictv",
"tv.fl","test_offtv",
"tv.fl", "test_offtv",
TermVectorComponent.COMPONENT_NAME, "true",
TermVectorParams.TF, "true")
,"/termVectors=={'0':{'uniqueKey':'0'," +
, "/termVectors=={'0':{'uniqueKey':'0'," +
" 'test_basictv':{'anoth':{'tf':1},'titl':{'tf':2}}," +
" 'test_offtv':{'anoth':{'tf':1},'titl':{'tf':2}}}}"
);
// re-use fl glob
assertJQ(req("json.nl","map",
"qt",tv,
assertJQ(req("json.nl", "map",
"qt", tv,
"q", "id:0",
"fl", "*,score",
TermVectorComponent.COMPONENT_NAME, "true",
TermVectorParams.TF, "true")
,"/termVectors=={'0':{'uniqueKey':'0'," +
, "/termVectors=={'0':{'uniqueKey':'0'," +
" 'test_basictv':{'anoth':{'tf':1},'titl':{'tf':2}}," +
" 'test_offtv':{'anoth':{'tf':1},'titl':{'tf':2}}," +
" 'test_posofftv':{'anoth':{'tf':1},'titl':{'tf':2}}," +
@ -170,51 +207,50 @@ public class TermVectorComponentTest extends SolrTestCaseJ4 {
" 'test_postv':{'anoth':{'tf':1},'titl':{'tf':2}}}}"
);
// re-use fl, ignore things we can't handle
assertJQ(req("json.nl","map",
"qt",tv,
assertJQ(req("json.nl", "map",
"qt", tv,
"q", "id:0",
"fl", "score,test_basictv,[docid],test_postv,val:sum(3,4)",
TermVectorComponent.COMPONENT_NAME, "true",
TermVectorParams.TF, "true")
,"/termVectors=={'0':{'uniqueKey':'0'," +
, "/termVectors=={'0':{'uniqueKey':'0'," +
" 'test_basictv':{'anoth':{'tf':1},'titl':{'tf':2}}," +
" 'test_postv':{'anoth':{'tf':1},'titl':{'tf':2}}}}"
);
// re-use (multi-valued) fl, ignore things we can't handle
assertJQ(req("json.nl","map",
"qt",tv,
assertJQ(req("json.nl", "map",
"qt", tv,
"q", "id:0",
"fl", "score,test_basictv",
"fl", "[docid],test_postv,val:sum(3,4)",
TermVectorComponent.COMPONENT_NAME, "true",
TermVectorParams.TF, "true")
,"/termVectors=={'0':{'uniqueKey':'0'," +
, "/termVectors=={'0':{'uniqueKey':'0'," +
" 'test_basictv':{'anoth':{'tf':1},'titl':{'tf':2}}," +
" 'test_postv':{'anoth':{'tf':1},'titl':{'tf':2}}}}"
);
}
@Test
public void testOptions() throws Exception {
assertJQ(req("json.nl","map", "qt",tv, "q", "id:0", TermVectorComponent.COMPONENT_NAME, "true"
private void doOptions() throws Exception {
assertJQ(req("json.nl", "map", "qt", tv, "q", "id:0", TermVectorComponent.COMPONENT_NAME, "true"
, TermVectorParams.TF, "true", TermVectorParams.DF, "true", TermVectorParams.OFFSETS, "true", TermVectorParams.POSITIONS, "true", TermVectorParams.TF_IDF, "true")
,"/termVectors/0/test_posofftv/anoth=={'tf':1, 'offsets':{'start':20, 'end':27}, 'positions':{'position':5}, 'df':2, 'tf-idf':0.5}"
, "/termVectors/0/test_posofftv/anoth=={'tf':1, 'offsets':{'start':20, 'end':27}, 'positions':{'position':5}, 'df':2, 'tf-idf':0.5}"
);
assertJQ(req("json.nl","map", "qt",tv, "q", "id:0", TermVectorComponent.COMPONENT_NAME, "true"
assertJQ(req("json.nl", "map", "qt", tv, "q", "id:0", TermVectorComponent.COMPONENT_NAME, "true"
, TermVectorParams.ALL, "true")
,"/termVectors/0/test_posofftv/anoth=={'tf':1, 'offsets':{'start':20, 'end':27}, 'positions':{'position':5}, 'df':2, 'tf-idf':0.5}"
, "/termVectors/0/test_posofftv/anoth=={'tf':1, 'offsets':{'start':20, 'end':27}, 'positions':{'position':5}, 'df':2, 'tf-idf':0.5}"
);
// test each combination at random
final List<String> list = new ArrayList<>();
list.addAll(Arrays.asList("json.nl","map", "qt",tv, "q", "id:0", TermVectorComponent.COMPONENT_NAME, "true"));
String[][] options = new String[][] { { TermVectorParams.TF, "'tf':1" },
{ TermVectorParams.OFFSETS, "'offsets':{'start':20, 'end':27}" },
{ TermVectorParams.POSITIONS, "'positions':{'position':5}" },
{ TermVectorParams.DF, "'df':2" },
{ TermVectorParams.TF_IDF, "'tf-idf':0.5" } };
list.addAll(Arrays.asList("json.nl", "map", "qt", tv, "q", "id:0", TermVectorComponent.COMPONENT_NAME, "true"));
String[][] options = new String[][]{{TermVectorParams.TF, "'tf':1"},
{TermVectorParams.OFFSETS, "'offsets':{'start':20, 'end':27}"},
{TermVectorParams.POSITIONS, "'positions':{'position':5}"},
{TermVectorParams.DF, "'df':2"},
{TermVectorParams.TF_IDF, "'tf-idf':0.5"}};
StringBuilder expected = new StringBuilder("/termVectors/0/test_posofftv/anoth=={");
boolean first = true;
for (int i = 0; i < options.length; i++) {
@ -235,36 +271,52 @@ public class TermVectorComponentTest extends SolrTestCaseJ4 {
assertJQ(req(list.toArray(new String[0])), expected.toString());
}
@Test
public void testPerField() throws Exception {
assertJQ(req("json.nl","map", "qt",tv, "q", "id:0", TermVectorComponent.COMPONENT_NAME, "true"
,TermVectorParams.TF, "true", TermVectorParams.DF, "true", TermVectorParams.OFFSETS, "true", TermVectorParams.POSITIONS, "true", TermVectorParams.TF_IDF, "true"
,TermVectorParams.FIELDS, "test_basictv,test_notv,test_postv,test_offtv,test_posofftv,test_posoffpaytv"
,"f.test_posoffpaytv." + TermVectorParams.PAYLOADS, "false"
,"f.test_posofftv." + TermVectorParams.POSITIONS, "false"
,"f.test_offtv." + TermVectorParams.OFFSETS, "false"
,"f.test_basictv." + TermVectorParams.DF, "false"
,"f.test_basictv." + TermVectorParams.TF, "false"
,"f.test_basictv." + TermVectorParams.TF_IDF, "false"
private void doPerField() throws Exception {
assertJQ(req("json.nl", "map", "qt", tv, "q", "id:0", TermVectorComponent.COMPONENT_NAME, "true"
, TermVectorParams.TF, "true", TermVectorParams.DF, "true", TermVectorParams.OFFSETS, "true", TermVectorParams.POSITIONS, "true", TermVectorParams.TF_IDF, "true"
, TermVectorParams.FIELDS, "test_basictv,test_notv,test_postv,test_offtv,test_posofftv,test_posoffpaytv"
, "f.test_posoffpaytv." + TermVectorParams.PAYLOADS, "false"
, "f.test_posofftv." + TermVectorParams.POSITIONS, "false"
, "f.test_offtv." + TermVectorParams.OFFSETS, "false"
, "f.test_basictv." + TermVectorParams.DF, "false"
, "f.test_basictv." + TermVectorParams.TF, "false"
, "f.test_basictv." + TermVectorParams.TF_IDF, "false"
)
,"/termVectors/0/test_basictv=={'anoth':{},'titl':{}}"
,"/termVectors/0/test_postv/anoth=={'tf':1, 'positions':{'position':5}, 'df':2, 'tf-idf':0.5}"
,"/termVectors/0/test_offtv/anoth=={'tf':1, 'df':2, 'tf-idf':0.5}"
,"/termVectors/warnings=={ 'noTermVectors':['test_notv'], 'noPositions':['test_basictv', 'test_offtv'], 'noOffsets':['test_basictv', 'test_postv']}"
, "/termVectors/0/test_basictv=={'anoth':{},'titl':{}}"
, "/termVectors/0/test_postv/anoth=={'tf':1, 'positions':{'position':5}, 'df':2, 'tf-idf':0.5}"
, "/termVectors/0/test_offtv/anoth=={'tf':1, 'df':2, 'tf-idf':0.5}"
, "/termVectors/warnings=={ 'noTermVectors':['test_notv'], 'noPositions':['test_basictv', 'test_offtv'], 'noOffsets':['test_basictv', 'test_postv']}"
);
}
private void doPayloads() throws Exception {
// This field uses TokenOffsetPayloadTokenFilter, which
// stuffs start (20) and end offset (27) into the
// payload:
assertJQ(req("json.nl", "map", "qt", tv, "q", "id:0", TermVectorComponent.COMPONENT_NAME, "true"
, TermVectorParams.TF, "true", TermVectorParams.DF, "true", TermVectorParams.OFFSETS, "true", TermVectorParams.POSITIONS, "true", TermVectorParams.TF_IDF, "true",
TermVectorParams.PAYLOADS, "true")
, "/termVectors/0/test_posoffpaytv/anoth=={'tf':1, 'offsets':{'start':20, 'end':27}, 'positions':{'position':5}, 'payloads':{'payload': 'AAAAFAAAABs='}, 'df':2, 'tf-idf':0.5}"
);
}
@Test
public void testPayloads() throws Exception {
// This field uses TokenOffsetPayloadTokenFilter, which
// stuffs start (20) and end offset (27) into the
// payload:
assertJQ(req("json.nl","map", "qt",tv, "q", "id:0", TermVectorComponent.COMPONENT_NAME, "true"
, TermVectorParams.TF, "true", TermVectorParams.DF, "true", TermVectorParams.OFFSETS, "true", TermVectorParams.POSITIONS, "true", TermVectorParams.TF_IDF, "true",
TermVectorParams.PAYLOADS, "true")
,"/termVectors/0/test_posoffpaytv/anoth=={'tf':1, 'offsets':{'start':20, 'end':27}, 'positions':{'position':5}, 'payloads':{'payload': 'AAAAFAAAABs='}, 'df':2, 'tf-idf':0.5}"
public void testNoVectors() throws Exception {
clearIndex();
assertU(adoc("id", "0"));
assertU(adoc("id", "1"));
assertU(adoc("id", "2"));
assertU(adoc("id", "3"));
assertNull(h.validateUpdate(commit()));
// Kind of an odd test, but we just want to know if we don't generate an NPE when there is nothing to give back in the term vectors.
assertJQ(req("json.nl", "map", "qt", tv, "q", "id:0", TermVectorComponent.COMPONENT_NAME, "true", TermVectorParams.TF, "true")
, "/termVectors=={'0':{'uniqueKey':'0'}}}"
);
}
}