SOLR-10273: DocumentBuilder move longest field to last position

This commit is contained in:
David Smiley 2017-03-16 21:22:08 -04:00 committed by Shalin Shekhar Mangar
parent 1ffd7604ee
commit 8100cbfdf1
3 changed files with 130 additions and 1 deletions

View File

@ -301,6 +301,10 @@ Optimizations
* SOLR-10143: PointFields will create IndexOrDocValuesQuery when a field is both, indexed=true and docValues=true
(Tomás Fernández Löbbe)
* SOLR-10273: The field with the longest value (if it exceeds 4K) is moved to be last in the Lucene Document in order
to benefit from stored field optimizations in Lucene that can avoid reading it when it's not needed. If the field is
multi-valued, they all move together to the end to retain order. (David Smiley)
Other Changes
----------------------
* SOLR-9980: Expose configVersion in core admin status (Jessica Cheng Mallet via Tomás Fernández Löbbe)

View File

@ -16,12 +16,15 @@
*/
package org.apache.solr.update;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Set;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.NumericDocValuesField;
import org.apache.lucene.index.IndexableField;
import org.apache.lucene.util.BytesRef;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.SolrInputDocument;
import org.apache.solr.common.SolrInputField;
@ -33,10 +36,13 @@ import org.apache.solr.schema.SchemaField;
import com.google.common.collect.Sets;
/**
*
* Builds a Lucene {@link Document} from a {@link SolrInputDocument}.
*/
public class DocumentBuilder {
// accessible only for tests
static int MIN_LENGTH_TO_MOVE_LAST = Integer.getInteger("solr.docBuilder.minLengthToMoveLast", 4*1024); // internal setting
/**
* Add a field value to a given document.
* @param doc Document that the field needs to be added to
@ -227,6 +233,58 @@ public class DocumentBuilder {
}
}
}
if (!forInPlaceUpdate) {
moveLargestFieldLast(out);
}
return out;
}
/** Move the largest stored field last, because Lucene can avoid loading that one if it's not needed. */
private static void moveLargestFieldLast(Document doc) {
String largestField = null;
int largestFieldLen = -1;
boolean largestIsLast = true;
for (IndexableField field : doc) {
if (!field.fieldType().stored()) {
continue;
}
if (largestIsLast && !field.name().equals(largestField)) {
largestIsLast = false;
}
if (field.numericValue() != null) { // just ignore these as non-competitive (avoid toString'ing their number)
continue;
}
String strVal = field.stringValue();
if (strVal != null) {
if (strVal.length() > largestFieldLen) {
largestField = field.name();
largestFieldLen = strVal.length();
largestIsLast = true;
}
} else {
BytesRef bytesRef = field.binaryValue();
if (bytesRef != null && bytesRef.length > largestFieldLen) {
largestField = field.name();
largestFieldLen = bytesRef.length;
largestIsLast = true;
}
}
}
if (!largestIsLast && largestField != null && largestFieldLen > MIN_LENGTH_TO_MOVE_LAST) { // only bother if the value isn't tiny
LinkedList<IndexableField> addToEnd = new LinkedList<>();
Iterator<IndexableField> iterator = doc.iterator();
while (iterator.hasNext()) {
IndexableField field = iterator.next();
if (field.name().equals(largestField)) {
addToEnd.add(field);
iterator.remove(); // Document may not have "remove" but it's iterator allows mutation
}
}
for (IndexableField field : addToEnd) {
doc.add(field);
}
}
}
}

View File

@ -16,7 +16,14 @@
*/
package org.apache.solr.update;
import java.util.Iterator;
import java.util.List;
import java.util.stream.Collectors;
import java.util.stream.StreamSupport;
import com.carrotsearch.randomizedtesting.generators.RandomStrings;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexableField;
import org.apache.lucene.util.TestUtil;
import org.apache.solr.SolrTestCaseJ4;
import org.apache.solr.common.SolrDocument;
@ -25,6 +32,8 @@ import org.apache.solr.common.SolrInputDocument;
import org.apache.solr.common.SolrInputField;
import org.apache.solr.core.SolrCore;
import org.apache.solr.schema.FieldType;
import org.junit.After;
import org.junit.AfterClass;
import org.junit.BeforeClass;
import org.junit.Test;
@ -33,12 +42,23 @@ import org.junit.Test;
*
*/
public class DocumentBuilderTest extends SolrTestCaseJ4 {
static final int save_min_len = DocumentBuilder.MIN_LENGTH_TO_MOVE_LAST;
@BeforeClass
public static void beforeClass() throws Exception {
initCore("solrconfig.xml", "schema.xml");
}
@AfterClass
public static void afterClass() {
DocumentBuilder.MIN_LENGTH_TO_MOVE_LAST = save_min_len;
}
@After
public void afterTest() {
DocumentBuilder.MIN_LENGTH_TO_MOVE_LAST = save_min_len;
}
@Test
public void testBuildDocument() throws Exception
{
@ -222,7 +242,54 @@ public class DocumentBuilderTest extends SolrTestCaseJ4 {
sif2.setName("foo");
assertFalse(assertSolrInputFieldEquals(sif1, sif2));
}
public void testMoveLargestLast() {
SolrInputDocument inDoc = new SolrInputDocument();
String TEXT_FLD = "text"; // not stored. It won't be moved. This value is the longest, however.
inDoc.addField(TEXT_FLD,
"NOT STORED|" + RandomStrings.randomAsciiOfLength(random(), 4 * DocumentBuilder.MIN_LENGTH_TO_MOVE_LAST));
String CAT_FLD = "cat"; // stored, multiValued
inDoc.addField(CAT_FLD,
"STORED V1|");
// pretty long value
inDoc.addField(CAT_FLD,
"STORED V2|" + RandomStrings.randomAsciiOfLength(random(), 2 * DocumentBuilder.MIN_LENGTH_TO_MOVE_LAST));
inDoc.addField(CAT_FLD,
"STORED V3|" + RandomStrings.randomAsciiOfLength(random(), DocumentBuilder.MIN_LENGTH_TO_MOVE_LAST));
String SUBJECT_FLD = "subject"; // stored. This value is long, but not long enough.
inDoc.addField(SUBJECT_FLD,
"2ndplace|" + RandomStrings.randomAsciiOfLength(random(), DocumentBuilder.MIN_LENGTH_TO_MOVE_LAST));
Document outDoc = DocumentBuilder.toDocument(inDoc, h.getCore().getLatestSchema());
// filter outDoc by stored fields; convert to list.
List<IndexableField> storedFields = StreamSupport.stream(outDoc.spliterator(), false)
.filter(f -> f.fieldType().stored()).collect(Collectors.toList());
// clip to last 3. We expect these to be for CAT_FLD
storedFields = storedFields.subList(storedFields.size() - 3, storedFields.size());
Iterator<IndexableField> fieldIterator = storedFields.iterator();
IndexableField field;
// Test that we retained the particular value ordering, even though though the 2nd of three was longest
assertTrue(fieldIterator.hasNext());
field = fieldIterator.next();
assertEquals(CAT_FLD, field.name());
assertTrue(field.stringValue().startsWith("STORED V1|"));
assertTrue(fieldIterator.hasNext());
field = fieldIterator.next();
assertEquals(CAT_FLD, field.name());
assertTrue(field.stringValue().startsWith("STORED V2|"));
assertTrue(fieldIterator.hasNext());
field = fieldIterator.next();
assertEquals(CAT_FLD, field.name());
assertTrue(field.stringValue().startsWith("STORED V3|"));
}
}