SOLR-11891: DocStreamer now respects the ReturnFields when populating a SolrDocument

This is an optimization that reduces the number of unnecessary fields a ResponseWriter will see if documentCache is used

This commit also includes fixes for SOLR-12107 & SOLR-12108 -- two bugs that were previously dependent on the
un-optimized behavior of DocStreamer in order to function properly.

- SOLR-12107: Fixed an error in [child] transformer that could occur if documentCache was not used
- SOLR-12108: Fixed the fallback behavior of [raw] and [xml] transformers when an incompatible 'wt' was specified,
  the field value was lost if documentCache was not used.
This commit is contained in:
Chris Hostetter 2018-03-19 12:19:31 -07:00
parent d6ed71b5c4
commit 8bd7e5c9d2
7 changed files with 219 additions and 27 deletions

View File

@ -63,6 +63,11 @@ Bug Fixes
* SOLR-12103: Raise CryptoKeys.DEFAULT_KEYPAIR_LENGTH from 1024 to 2048. (Mark Miller)
* SOLR-12107: Fixed an error in [child] transformer that could occur if documentCache was not used (hossman)
* SOLR-12108: Fixed the fallback behavior of [raw] and [xml] transformers when an incompatible 'wt' was
specified, the field value was lost if documentCache was not used. (hossman)
Optimizations
----------------------
@ -73,6 +78,9 @@ Optimizations
* SOLR-11731: LatLonPointSpatialField can now decode points from docValues when stored=false docValues=true,
albeit with maximum precision of 1.33cm (Karthik Ramachandran, David Smiley)
* SOLR-11891: DocStreamer now respects the ReturnFields when populating a SolrDocument, reducing the
number of unnecessary fields a ResponseWriter will see if documentCache is used (wei wang, hossman)
Other Changes
----------------------

View File

@ -50,6 +50,8 @@ import org.apache.solr.schema.TrieLongField;
import org.apache.solr.search.DocIterator;
import org.apache.solr.search.DocList;
import org.apache.solr.search.SolrDocumentFetcher;
import org.apache.solr.search.ReturnFields;
import org.apache.solr.search.SolrReturnFields;
/**
* This streams SolrDocuments from a DocList and applies transformer
@ -97,7 +99,8 @@ public class DocsStreamer implements Iterator<SolrDocument> {
if (retrieveFieldsOptimizer.returnStoredFields()) {
Document doc = docFetcher.doc(id, retrieveFieldsOptimizer.getStoredFields());
// make sure to use the schema from the searcher and not the request (cross-core)
sdoc = convertLuceneDocToSolrDoc(doc, rctx.getSearcher().getSchema());
sdoc = convertLuceneDocToSolrDoc(doc, rctx.getSearcher().getSchema(),
rctx.getReturnFields());
} else {
// no need to get stored fields of the document, see SOLR-5968
sdoc = new SolrDocument();
@ -127,28 +130,72 @@ public class DocsStreamer implements Iterator<SolrDocument> {
}
// TODO move to SolrDocumentFetcher ? Refactor to also call docFetcher.decorateDocValueFields(...) ?
/**
 * Converts the specified <code>Document</code> into a <code>SolrDocument</code>, keeping
 * <em>all</em> fields.
 * <p>
 * This method is less efficient than the 3 arg version because it may convert some fields that
 * are not needed.
 * </p>
 *
 * @deprecated use the 3 arg version for better performance
 * @see #convertLuceneDocToSolrDoc(Document,IndexSchema,ReturnFields)
 */
@Deprecated
public static SolrDocument convertLuceneDocToSolrDoc(Document doc, final IndexSchema schema) {
  // a default SolrReturnFields wants all fields, preserving the old behavior of this method
  return convertLuceneDocToSolrDoc(doc, schema, new SolrReturnFields());
}
/**
* Converts the specified <code>Document</code> into a <code>SolrDocument</code>.
* <p>
* The use of {@link ReturnFields} can be important even when it was already used to retrieve the
* {@link Document} from {@link SolrDocumentFetcher} because the Document may have been cached with
* more fields then are desired.
* </p>
*
* @param doc <code>Document</code> to be converted, must not be null
* @param schema <code>IndexSchema</code> containing the field/fieldType details for the index
* the <code>Document</code> came from, must not be null.
* @param fields <code>ReturnFields</code> instance that can be use to limit the set of fields
* that will be converted, must not be null
*/
public static SolrDocument convertLuceneDocToSolrDoc(Document doc,
final IndexSchema schema,
final ReturnFields fields) {
// TODO move to SolrDocumentFetcher ? Refactor to also call docFetcher.decorateDocValueFields(...) ?
assert null != doc;
assert null != schema;
assert null != fields;
// can't just use fields.wantsField(String)
// because that doesn't include extra fields needed by transformers
final Set<String> fieldNamesNeeded = fields.getLuceneFieldNames();
final SolrDocument out = new SolrDocument();
// NOTE: it would be tempting to try and optimize this to loop over fieldNamesNeeded
// when it's smaller then the IndexableField[] in the Document -- but that's actually *less* effecient
// since Document.getFields(String) does a full (internal) iteration over the full IndexableField[]
// see SOLR-11891
for (IndexableField f : doc.getFields()) {
// Make sure multivalued fields are represented as lists
Object existing = out.get(f.name());
if (existing == null) {
SchemaField sf = schema.getFieldOrNull(f.name());
if (sf != null && sf.multiValued()) {
List<Object> vals = new ArrayList<>();
vals.add(f);
out.setField(f.name(), vals);
final String fname = f.name();
if (null == fieldNamesNeeded || fieldNamesNeeded.contains(fname) ) {
// Make sure multivalued fields are represented as lists
Object existing = out.get(fname);
if (existing == null) {
SchemaField sf = schema.getFieldOrNull(fname);
if (sf != null && sf.multiValued()) {
List<Object> vals = new ArrayList<>();
vals.add(f);
out.setField(fname, vals);
} else {
out.setField(fname, f);
}
} else {
out.setField(f.name(), f);
out.addField(fname, f);
}
} else {
out.addField(f.name(), f);
}
}
return out;
}
// Iterator.remove is intentionally a no-op: this Iterator is a read-only view
// over the underlying DocList, so there is nothing sensible to remove.
@Override
public void remove() { //do nothing
}

View File

@ -152,7 +152,7 @@ public abstract class TextResponseWriter implements PushWriter {
} else if (val instanceof Date) {
writeDate(name, (Date) val);
} else if (val instanceof Document) {
SolrDocument doc = DocsStreamer.convertLuceneDocToSolrDoc((Document) val, schema);
SolrDocument doc = DocsStreamer.convertLuceneDocToSolrDoc((Document) val, schema, returnFields);
writeSolrDocument(name, doc, returnFields, 0);
} else if (val instanceof SolrDocument) {
writeSolrDocument(name, (SolrDocument) val, returnFields, 0);

View File

@ -41,6 +41,7 @@ import org.apache.solr.search.QParser;
import org.apache.solr.search.QueryWrapperFilter;
import org.apache.solr.search.SyntaxError;
import org.apache.solr.search.SolrDocumentFetcher;
import org.apache.solr.search.SolrReturnFields;
/**
*
@ -121,6 +122,12 @@ class ChildDocTransformer extends DocTransformer {
/** Returns the name this transformer was registered/created with. */
@Override
public String getName() {
  return name;
}
// Declares the fields this transformer needs fetched from the index even if the
// client's 'fl' did not request them.
@Override
public String[] getExtraRequestFields() {
// we always need the idField (of the parent) in order to fill out its children
return new String[] { idField.getName() };
}
@Override
public void transform(SolrDocument doc, int docid) {
@ -146,16 +153,16 @@ class ChildDocTransformer extends DocTransformer {
while(i.hasNext()) {
Integer childDocNum = i.next();
Document childDoc = context.getSearcher().doc(childDocNum);
SolrDocument solrChildDoc = DocsStreamer.convertLuceneDocToSolrDoc(childDoc, schema);
if (shouldDecorateWithDVs) {
docFetcher.decorateDocValueFields(solrChildDoc, childDocNum, dvFieldsToReturn);
}
// TODO: future enhancement...
// support an fl local param in the transformer, which is used to build
// a private ReturnFields instance that we use to prune unwanted field
// names from solrChildDoc
SolrDocument solrChildDoc = DocsStreamer.convertLuceneDocToSolrDoc(childDoc, schema,
new SolrReturnFields());
if (shouldDecorateWithDVs) {
docFetcher.decorateDocValueFields(solrChildDoc, childDocNum, dvFieldsToReturn);
}
doc.addChildDocument(solrChildDoc);
}
}

View File

@ -83,12 +83,31 @@ public class RawValueTransformerFactory extends TransformerFactory
return new RawTransformer( field, display );
}
if(field.equals(display)) {
return null; // nothing
if (field.equals(display)) {
// we have to ensure the field is returned
return new NoopFieldTransformer(field);
}
return new RenameFieldTransformer( field, display, false );
}
/**
* Trivial Impl that ensure that the specified field is requested as an "extra" field,
* but then does nothing during the transformation phase.
*/
private static final class NoopFieldTransformer extends DocTransformer {
final String field;
public NoopFieldTransformer(String field ) {
this.field = field;
}
public String getName() { return "noop"; }
public String[] getExtraRequestFields() {
return new String[] { field };
}
public void transform(SolrDocument doc, int docid) {
// No-Op
}
}
static class RawTransformer extends DocTransformer
{
final String field;

View File

@ -30,8 +30,13 @@ import org.apache.solr.response.transform.DocTransformer;
public abstract class ReturnFields {
/**
* Set of field names with their exact names from the lucene index.
* <p>
* Class such as ResponseWriters pass this to {@link SolrIndexSearcher#doc(int, Set)}.
* <p>
* NOTE: In some situations, this method may return <code>null</code> even if {@link #wantsAllFields()}
* is <code>false</code>. For example: When glob expressions are used ({@link #hasPatternMatching}),
* it is safer to request all field names than to attempt to resolve the globs against all possible
* dynamic field names in the index.
* </p>
* @return Set of field names or <code>null</code> (all fields).
*/
public abstract Set<String> getLuceneFieldNames();
@ -60,10 +65,17 @@ public abstract class ReturnFields {
*/
public abstract Map<String,String> getFieldRenames();
/** Returns <code>true</code> if the specified field should be returned. */
/**
* Returns <code>true</code> if the specified field should be returned <em>to the external client</em>
* -- either using its own name, or via an alias.
* This method returns <code>false</code> even if the specified name is needed as an "extra" field
* for use by transformers.
*/
public abstract boolean wantsField(String name);
/** Returns <code>true</code> if all fields should be returned. */
/**
* Returns <code>true</code> if all fields should be returned <em>to the external client</em>.
*/
public abstract boolean wantsAllFields();
/** Returns <code>true</code> if the score should be returned. */

View File

@ -17,13 +17,25 @@
package org.apache.solr.search;
import org.apache.lucene.util.TestUtil;
import org.apache.lucene.document.Document;
import static org.apache.lucene.document.Field.Store;
import org.apache.lucene.document.StringField;
import org.apache.solr.SolrTestCaseJ4;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.response.transform.*;
import static org.apache.solr.response.DocsStreamer.convertLuceneDocToSolrDoc;
import org.apache.solr.schema.IndexSchema;
import org.junit.BeforeClass;
import org.junit.Test;
import java.lang.reflect.Method;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
import java.util.Locale;
import java.util.List;
import java.util.Random;
public class ReturnFieldsTest extends SolrTestCaseJ4 {
@ -363,6 +375,93 @@ public class ReturnFieldsTest extends SolrTestCaseJ4 {
}
/**
* Whitebox verification that the conversion from lucene {@link Document} to {@link SolrDocument} respects
* the {@link ReturnFields} and doesn't unnecessarily convert Fields that aren't needed.
* <p>
* This is important because {@link SolrDocumentFetcher} may return additional fields
* (lazy or otherwise) if the document has been cached.
* </p>
*/
public void testWhiteboxSolrDocumentConversion() {
final IndexSchema schema = h.getCore().getLatestSchema();
SolrDocument docOut = null;
// a "mock" Document with a bunch of fields...
//
// (we can mock this with all StringField instances because convertLuceneDocToSolrDoc only
// uses the schema for multivalued-ness)
final Document docIn = new Document();
final StringBuilder allFieldNames = new StringBuilder();
docIn.add(new StringField("id","bar",Store.YES));
allFieldNames.append("id");
docIn.add(new StringField("store_rpt","42",Store.YES));
allFieldNames.append(",store_rpt");
docIn.add(new StringField("subword","bar",Store.YES)); // single value in multi-value field
allFieldNames.append(",subword");
docIn.add(new StringField("uniq","xxx",Store.YES));
docIn.add(new StringField("uniq","yyy",Store.YES)); // multi-value in multi-valued field
allFieldNames.append(",uniq");
for (int i = 0; i < 20; i++) {
final String foo = "foo_" + i + "_s1";
allFieldNames.append(",").append(foo);
docIn.add(new StringField(foo, "bar"+i, Store.YES));
}
// output should only have a single field
docOut = convertLuceneDocToSolrDoc(docIn, schema, new SolrReturnFields(req("fl","id")));
assertEquals(docOut.toString(), 1, docOut.size());
assertEquals(docOut.toString(),
Collections.singleton("id"),
docOut.getFieldNames());
assertTrue(docOut.toString(), docOut.get("id") instanceof StringField);
// output should only have the few specified fields
// behavior should ultimately be consistent for all of these ReturnField instances
// (aliasing, extra requested by transformer, or otherwise)
for (ReturnFields rf : Arrays.asList
(new SolrReturnFields(req("fl","id,subword,store_rpt,uniq,foo_2_s1")),
new SolrReturnFields(req("fl","id,xxx:[geo f=store_rpt],uniq,foo_2_s1,subword")),
new SolrReturnFields(req("fl","id,xxx:subword,uniq,yyy:foo_2_s1,[geo f=store_rpt]")))) {
docOut = convertLuceneDocToSolrDoc(docIn, schema, rf);
final String debug = rf.toString() + " => " +docOut.toString();
assertEquals(debug, 5, docOut.size());
assertEquals(debug,
new HashSet<String>(Arrays.asList("id","subword","uniq","foo_2_s1","store_rpt")),
docOut.getFieldNames());
assertTrue(debug, docOut.get("id") instanceof StringField);
assertTrue(debug, docOut.get("store_rpt") instanceof StringField);
assertTrue(debug, docOut.get("foo_2_s1") instanceof StringField);
assertTrue(debug, docOut.get("subword") instanceof List);
assertTrue(debug, docOut.get("uniq") instanceof List);
}
// all source fields should be in the output
// behavior should ultimately be consistent for all of these ReturnField instances
// (globbing or requesting more fields than the doc has)
for (ReturnFields rf : Arrays.asList
(new SolrReturnFields(),
new SolrReturnFields(req()),
new SolrReturnFields(req("fl","*")),
new SolrReturnFields(req("fl","*,score")),
new SolrReturnFields(req("fl","id,subword,uniq,foo_*,store_*")),
new SolrReturnFields(req("fl",allFieldNames+",bogus1,bogus2,bogus3")))) {
docOut = convertLuceneDocToSolrDoc(docIn, schema, rf);
final String debug = rf.toString() + " => " +docOut.toString();
assertEquals(debug, 24, docOut.size());
assertTrue(debug, docOut.get("id") instanceof StringField);
assertTrue(debug, docOut.get("store_rpt") instanceof StringField);
assertTrue(debug, docOut.get("subword") instanceof List);
assertTrue(debug, docOut.get("uniq") instanceof List);
for (int i = 0; i < 20; i++) {
assertTrue(debug, docOut.get("foo_" + i + "_s1") instanceof StringField);
}
}
}
public void testWhitespace() {
Random r = random();
final int iters = atLeast(30);