mirror of https://github.com/apache/lucene.git
SOLR-10273: DocumentBuilder move longest field to last position
This commit is contained in:
parent
1ffd7604ee
commit
8100cbfdf1
|
@ -301,6 +301,10 @@ Optimizations
|
|||
* SOLR-10143: PointFields will create IndexOrDocValuesQuery when a field is both, indexed=true and docValues=true
|
||||
(Tomás Fernández Löbbe)
|
||||
|
||||
* SOLR-10273: The field with the longest value (if it exceeds 4K) is moved to be last in the Lucene Document in order
|
||||
to benefit from stored field optimizations in Lucene that can avoid reading it when it's not needed. If the field is
|
||||
multi-valued, they all move together to the end to retain order. (David Smiley)
|
||||
|
||||
Other Changes
|
||||
----------------------
|
||||
* SOLR-9980: Expose configVersion in core admin status (Jessica Cheng Mallet via Tomás Fernández Löbbe)
|
||||
|
|
|
@ -16,12 +16,15 @@
|
|||
*/
|
||||
package org.apache.solr.update;
|
||||
|
||||
import java.util.Iterator;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
|
||||
import org.apache.lucene.document.Document;
|
||||
import org.apache.lucene.document.NumericDocValuesField;
|
||||
import org.apache.lucene.index.IndexableField;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.solr.common.SolrException;
|
||||
import org.apache.solr.common.SolrInputDocument;
|
||||
import org.apache.solr.common.SolrInputField;
|
||||
|
@ -33,10 +36,13 @@ import org.apache.solr.schema.SchemaField;
|
|||
import com.google.common.collect.Sets;
|
||||
|
||||
/**
|
||||
*
|
||||
* Builds a Lucene {@link Document} from a {@link SolrInputDocument}.
|
||||
*/
|
||||
public class DocumentBuilder {
|
||||
|
||||
// accessible only for tests
|
||||
static int MIN_LENGTH_TO_MOVE_LAST = Integer.getInteger("solr.docBuilder.minLengthToMoveLast", 4*1024); // internal setting
|
||||
|
||||
/**
|
||||
* Add a field value to a given document.
|
||||
* @param doc Document that the field needs to be added to
|
||||
|
@ -227,6 +233,58 @@ public class DocumentBuilder {
|
|||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (!forInPlaceUpdate) {
|
||||
moveLargestFieldLast(out);
|
||||
}
|
||||
|
||||
return out;
|
||||
}
|
||||
|
||||
/** Move the largest stored field last, because Lucene can avoid loading that one if it's not needed. */
|
||||
private static void moveLargestFieldLast(Document doc) {
|
||||
String largestField = null;
|
||||
int largestFieldLen = -1;
|
||||
boolean largestIsLast = true;
|
||||
for (IndexableField field : doc) {
|
||||
if (!field.fieldType().stored()) {
|
||||
continue;
|
||||
}
|
||||
if (largestIsLast && !field.name().equals(largestField)) {
|
||||
largestIsLast = false;
|
||||
}
|
||||
if (field.numericValue() != null) { // just ignore these as non-competitive (avoid toString'ing their number)
|
||||
continue;
|
||||
}
|
||||
String strVal = field.stringValue();
|
||||
if (strVal != null) {
|
||||
if (strVal.length() > largestFieldLen) {
|
||||
largestField = field.name();
|
||||
largestFieldLen = strVal.length();
|
||||
largestIsLast = true;
|
||||
}
|
||||
} else {
|
||||
BytesRef bytesRef = field.binaryValue();
|
||||
if (bytesRef != null && bytesRef.length > largestFieldLen) {
|
||||
largestField = field.name();
|
||||
largestFieldLen = bytesRef.length;
|
||||
largestIsLast = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (!largestIsLast && largestField != null && largestFieldLen > MIN_LENGTH_TO_MOVE_LAST) { // only bother if the value isn't tiny
|
||||
LinkedList<IndexableField> addToEnd = new LinkedList<>();
|
||||
Iterator<IndexableField> iterator = doc.iterator();
|
||||
while (iterator.hasNext()) {
|
||||
IndexableField field = iterator.next();
|
||||
if (field.name().equals(largestField)) {
|
||||
addToEnd.add(field);
|
||||
iterator.remove(); // Document may not have "remove" but it's iterator allows mutation
|
||||
}
|
||||
}
|
||||
for (IndexableField field : addToEnd) {
|
||||
doc.add(field);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -16,7 +16,14 @@
|
|||
*/
|
||||
package org.apache.solr.update;
|
||||
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
import java.util.stream.Collectors;
|
||||
import java.util.stream.StreamSupport;
|
||||
|
||||
import com.carrotsearch.randomizedtesting.generators.RandomStrings;
|
||||
import org.apache.lucene.document.Document;
|
||||
import org.apache.lucene.index.IndexableField;
|
||||
import org.apache.lucene.util.TestUtil;
|
||||
import org.apache.solr.SolrTestCaseJ4;
|
||||
import org.apache.solr.common.SolrDocument;
|
||||
|
@ -25,6 +32,8 @@ import org.apache.solr.common.SolrInputDocument;
|
|||
import org.apache.solr.common.SolrInputField;
|
||||
import org.apache.solr.core.SolrCore;
|
||||
import org.apache.solr.schema.FieldType;
|
||||
import org.junit.After;
|
||||
import org.junit.AfterClass;
|
||||
import org.junit.BeforeClass;
|
||||
import org.junit.Test;
|
||||
|
||||
|
@ -33,12 +42,23 @@ import org.junit.Test;
|
|||
*
|
||||
*/
|
||||
public class DocumentBuilderTest extends SolrTestCaseJ4 {
|
||||
static final int save_min_len = DocumentBuilder.MIN_LENGTH_TO_MOVE_LAST;
|
||||
|
||||
@BeforeClass
|
||||
public static void beforeClass() throws Exception {
|
||||
initCore("solrconfig.xml", "schema.xml");
|
||||
}
|
||||
|
||||
@AfterClass
|
||||
public static void afterClass() {
|
||||
DocumentBuilder.MIN_LENGTH_TO_MOVE_LAST = save_min_len;
|
||||
}
|
||||
|
||||
@After
|
||||
public void afterTest() {
|
||||
DocumentBuilder.MIN_LENGTH_TO_MOVE_LAST = save_min_len;
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testBuildDocument() throws Exception
|
||||
{
|
||||
|
@ -222,7 +242,54 @@ public class DocumentBuilderTest extends SolrTestCaseJ4 {
|
|||
sif2.setName("foo");
|
||||
assertFalse(assertSolrInputFieldEquals(sif1, sif2));
|
||||
|
||||
}
|
||||
|
||||
public void testMoveLargestLast() {
|
||||
SolrInputDocument inDoc = new SolrInputDocument();
|
||||
String TEXT_FLD = "text"; // not stored. It won't be moved. This value is the longest, however.
|
||||
inDoc.addField(TEXT_FLD,
|
||||
"NOT STORED|" + RandomStrings.randomAsciiOfLength(random(), 4 * DocumentBuilder.MIN_LENGTH_TO_MOVE_LAST));
|
||||
|
||||
String CAT_FLD = "cat"; // stored, multiValued
|
||||
inDoc.addField(CAT_FLD,
|
||||
"STORED V1|");
|
||||
// pretty long value
|
||||
inDoc.addField(CAT_FLD,
|
||||
"STORED V2|" + RandomStrings.randomAsciiOfLength(random(), 2 * DocumentBuilder.MIN_LENGTH_TO_MOVE_LAST));
|
||||
inDoc.addField(CAT_FLD,
|
||||
"STORED V3|" + RandomStrings.randomAsciiOfLength(random(), DocumentBuilder.MIN_LENGTH_TO_MOVE_LAST));
|
||||
|
||||
String SUBJECT_FLD = "subject"; // stored. This value is long, but not long enough.
|
||||
inDoc.addField(SUBJECT_FLD,
|
||||
"2ndplace|" + RandomStrings.randomAsciiOfLength(random(), DocumentBuilder.MIN_LENGTH_TO_MOVE_LAST));
|
||||
|
||||
Document outDoc = DocumentBuilder.toDocument(inDoc, h.getCore().getLatestSchema());
|
||||
|
||||
// filter outDoc by stored fields; convert to list.
|
||||
List<IndexableField> storedFields = StreamSupport.stream(outDoc.spliterator(), false)
|
||||
.filter(f -> f.fieldType().stored()).collect(Collectors.toList());
|
||||
// clip to last 3. We expect these to be for CAT_FLD
|
||||
storedFields = storedFields.subList(storedFields.size() - 3, storedFields.size());
|
||||
|
||||
Iterator<IndexableField> fieldIterator = storedFields.iterator();
|
||||
IndexableField field;
|
||||
|
||||
// Test that we retained the particular value ordering, even though though the 2nd of three was longest
|
||||
|
||||
assertTrue(fieldIterator.hasNext());
|
||||
field = fieldIterator.next();
|
||||
assertEquals(CAT_FLD, field.name());
|
||||
assertTrue(field.stringValue().startsWith("STORED V1|"));
|
||||
|
||||
assertTrue(fieldIterator.hasNext());
|
||||
field = fieldIterator.next();
|
||||
assertEquals(CAT_FLD, field.name());
|
||||
assertTrue(field.stringValue().startsWith("STORED V2|"));
|
||||
|
||||
assertTrue(fieldIterator.hasNext());
|
||||
field = fieldIterator.next();
|
||||
assertEquals(CAT_FLD, field.name());
|
||||
assertTrue(field.stringValue().startsWith("STORED V3|"));
|
||||
}
|
||||
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue