diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt index 9a5299ccd29..dfe8d93393a 100644 --- a/solr/CHANGES.txt +++ b/solr/CHANGES.txt @@ -301,6 +301,10 @@ Optimizations * SOLR-10143: PointFields will create IndexOrDocValuesQuery when a field is both, indexed=true and docValues=true (Tomás Fernández Löbbe) +* SOLR-10273: The field with the longest value (if it exceeds 4K) is moved to be last in the Lucene Document in order + to benefit from stored field optimizations in Lucene that can avoid reading it when it's not needed. If the field is + multi-valued, they all move together to the end to retain order. (David Smiley) + Other Changes ---------------------- * SOLR-9980: Expose configVersion in core admin status (Jessica Cheng Mallet via Tomás Fernández Löbbe) diff --git a/solr/core/src/java/org/apache/solr/update/DocumentBuilder.java b/solr/core/src/java/org/apache/solr/update/DocumentBuilder.java index e3d20116fae..b97af3bcb05 100644 --- a/solr/core/src/java/org/apache/solr/update/DocumentBuilder.java +++ b/solr/core/src/java/org/apache/solr/update/DocumentBuilder.java @@ -16,12 +16,15 @@ */ package org.apache.solr.update; +import java.util.Iterator; +import java.util.LinkedList; import java.util.List; import java.util.Set; import org.apache.lucene.document.Document; import org.apache.lucene.document.NumericDocValuesField; import org.apache.lucene.index.IndexableField; +import org.apache.lucene.util.BytesRef; import org.apache.solr.common.SolrException; import org.apache.solr.common.SolrInputDocument; import org.apache.solr.common.SolrInputField; @@ -33,10 +36,13 @@ import org.apache.solr.schema.SchemaField; import com.google.common.collect.Sets; /** - * + * Builds a Lucene {@link Document} from a {@link SolrInputDocument}. */ public class DocumentBuilder { + // accessible only for tests + static int MIN_LENGTH_TO_MOVE_LAST = Integer.getInteger("solr.docBuilder.minLengthToMoveLast", 4*1024); // internal setting + /** * Add a field value to a given document. * @param doc Document that the field needs to be added to @@ -227,6 +233,58 @@ public class DocumentBuilder { } } } + + if (!forInPlaceUpdate) { + moveLargestFieldLast(out); + } + return out; } + + /** Move the largest stored field last, because Lucene can avoid loading that one if it's not needed. */ + private static void moveLargestFieldLast(Document doc) { + String largestField = null; + int largestFieldLen = -1; + boolean largestIsLast = true; + for (IndexableField field : doc) { + if (!field.fieldType().stored()) { + continue; + } + if (largestIsLast && !field.name().equals(largestField)) { + largestIsLast = false; + } + if (field.numericValue() != null) { // just ignore these as non-competitive (avoid toString'ing their number) + continue; + } + String strVal = field.stringValue(); + if (strVal != null) { + if (strVal.length() > largestFieldLen) { + largestField = field.name(); + largestFieldLen = strVal.length(); + largestIsLast = true; + } + } else { + BytesRef bytesRef = field.binaryValue(); + if (bytesRef != null && bytesRef.length > largestFieldLen) { + largestField = field.name(); + largestFieldLen = bytesRef.length; + largestIsLast = true; + } + } + } + if (!largestIsLast && largestField != null && largestFieldLen > MIN_LENGTH_TO_MOVE_LAST) { // only bother if the value isn't tiny + LinkedList addToEnd = new LinkedList<>(); + Iterator iterator = doc.iterator(); + while (iterator.hasNext()) { + IndexableField field = iterator.next(); + if (field.name().equals(largestField)) { + addToEnd.add(field); + iterator.remove(); // Document may not have "remove" but it's iterator allows mutation + } + } + for (IndexableField field : addToEnd) { + doc.add(field); + } + } + } } diff --git a/solr/core/src/test/org/apache/solr/update/DocumentBuilderTest.java b/solr/core/src/test/org/apache/solr/update/DocumentBuilderTest.java index 2a78d6b7923..03dd17cd7ef 100644 --- a/solr/core/src/test/org/apache/solr/update/DocumentBuilderTest.java +++ b/solr/core/src/test/org/apache/solr/update/DocumentBuilderTest.java @@ -16,7 +16,14 @@ */ package org.apache.solr.update; +import java.util.Iterator; +import java.util.List; +import java.util.stream.Collectors; +import java.util.stream.StreamSupport; + +import com.carrotsearch.randomizedtesting.generators.RandomStrings; import org.apache.lucene.document.Document; +import org.apache.lucene.index.IndexableField; import org.apache.lucene.util.TestUtil; import org.apache.solr.SolrTestCaseJ4; import org.apache.solr.common.SolrDocument; @@ -25,6 +32,8 @@ import org.apache.solr.common.SolrInputDocument; import org.apache.solr.common.SolrInputField; import org.apache.solr.core.SolrCore; import org.apache.solr.schema.FieldType; +import org.junit.After; +import org.junit.AfterClass; import org.junit.BeforeClass; import org.junit.Test; @@ -33,12 +42,23 @@ import org.junit.Test; * */ public class DocumentBuilderTest extends SolrTestCaseJ4 { + static final int save_min_len = DocumentBuilder.MIN_LENGTH_TO_MOVE_LAST; @BeforeClass public static void beforeClass() throws Exception { initCore("solrconfig.xml", "schema.xml"); } + @AfterClass + public static void afterClass() { + DocumentBuilder.MIN_LENGTH_TO_MOVE_LAST = save_min_len; + } + + @After + public void afterTest() { + DocumentBuilder.MIN_LENGTH_TO_MOVE_LAST = save_min_len; + } + @Test public void testBuildDocument() throws Exception { @@ -222,7 +242,54 @@ public class DocumentBuilderTest extends SolrTestCaseJ4 { sif2.setName("foo"); assertFalse(assertSolrInputFieldEquals(sif1, sif2)); + } + public void testMoveLargestLast() { + SolrInputDocument inDoc = new SolrInputDocument(); + String TEXT_FLD = "text"; // not stored. It won't be moved. This value is the longest, however. + inDoc.addField(TEXT_FLD, + "NOT STORED|" + RandomStrings.randomAsciiOfLength(random(), 4 * DocumentBuilder.MIN_LENGTH_TO_MOVE_LAST)); + + String CAT_FLD = "cat"; // stored, multiValued + inDoc.addField(CAT_FLD, + "STORED V1|"); + // pretty long value + inDoc.addField(CAT_FLD, + "STORED V2|" + RandomStrings.randomAsciiOfLength(random(), 2 * DocumentBuilder.MIN_LENGTH_TO_MOVE_LAST)); + inDoc.addField(CAT_FLD, + "STORED V3|" + RandomStrings.randomAsciiOfLength(random(), DocumentBuilder.MIN_LENGTH_TO_MOVE_LAST)); + + String SUBJECT_FLD = "subject"; // stored. This value is long, but not long enough. + inDoc.addField(SUBJECT_FLD, + "2ndplace|" + RandomStrings.randomAsciiOfLength(random(), DocumentBuilder.MIN_LENGTH_TO_MOVE_LAST)); + + Document outDoc = DocumentBuilder.toDocument(inDoc, h.getCore().getLatestSchema()); + + // filter outDoc by stored fields; convert to list. + List storedFields = StreamSupport.stream(outDoc.spliterator(), false) + .filter(f -> f.fieldType().stored()).collect(Collectors.toList()); + // clip to last 3. We expect these to be for CAT_FLD + storedFields = storedFields.subList(storedFields.size() - 3, storedFields.size()); + + Iterator fieldIterator = storedFields.iterator(); + IndexableField field; + + // Test that we retained the particular value ordering, even though though the 2nd of three was longest + + assertTrue(fieldIterator.hasNext()); + field = fieldIterator.next(); + assertEquals(CAT_FLD, field.name()); + assertTrue(field.stringValue().startsWith("STORED V1|")); + + assertTrue(fieldIterator.hasNext()); + field = fieldIterator.next(); + assertEquals(CAT_FLD, field.name()); + assertTrue(field.stringValue().startsWith("STORED V2|")); + + assertTrue(fieldIterator.hasNext()); + field = fieldIterator.next(); + assertEquals(CAT_FLD, field.name()); + assertTrue(field.stringValue().startsWith("STORED V3|")); } }