Speed up string sort with custom missing value
Today if the user supplies a custom missing value for a string sort, we do it in an extremely slow way, not using ordinals but dereferencing bytes for every document. Ordinals are only used if the missing value is _first or _last. Instead, use ordinals with custom missing values too. Closes #7005
This commit is contained in:
parent
f130d60b72
commit
d8cd755445
|
@ -74,39 +74,38 @@ public class BytesRefFieldComparatorSource extends IndexFieldData.XFieldComparat
|
|||
final boolean sortMissingLast = sortMissingLast(missingValue) ^ reversed;
|
||||
final BytesRef missingBytes = (BytesRef) missingObject(missingValue, reversed);
|
||||
if (indexFieldData instanceof IndexOrdinalsFieldData) {
|
||||
// The ordinal-based comparator only supports sorting missing values first or last so when
|
||||
// a missing value is provided we fall back to the (slow) value-based comparator
|
||||
// TODO: handle arbitrary missing values via a selector
|
||||
if (sortMissingFirst(missingValue) || sortMissingLast(missingValue)) {
|
||||
return new FieldComparator.TermOrdValComparator(numHits, null, sortMissingLast) {
|
||||
|
||||
@Override
|
||||
protected SortedDocValues getSortedDocValues(AtomicReaderContext context, String field) throws IOException {
|
||||
final RandomAccessOrds values = ((IndexOrdinalsFieldData) indexFieldData).load(context).getOrdinalsValues();
|
||||
final SortedDocValues selectedValues;
|
||||
if (nested == null) {
|
||||
selectedValues = sortMode.select(values);
|
||||
} else {
|
||||
final FixedBitSet rootDocs = nested.rootDocs(context);
|
||||
final FixedBitSet innerDocs = nested.innerDocs(context);
|
||||
selectedValues = sortMode.select(values, rootDocs, innerDocs);
|
||||
}
|
||||
return new FieldComparator.TermOrdValComparator(numHits, null, sortMissingLast) {
|
||||
|
||||
@Override
|
||||
protected SortedDocValues getSortedDocValues(AtomicReaderContext context, String field) throws IOException {
|
||||
final RandomAccessOrds values = ((IndexOrdinalsFieldData) indexFieldData).load(context).getOrdinalsValues();
|
||||
final SortedDocValues selectedValues;
|
||||
if (nested == null) {
|
||||
selectedValues = sortMode.select(values);
|
||||
} else {
|
||||
final FixedBitSet rootDocs = nested.rootDocs(context);
|
||||
final FixedBitSet innerDocs = nested.innerDocs(context);
|
||||
selectedValues = sortMode.select(values, rootDocs, innerDocs);
|
||||
}
|
||||
if (sortMissingFirst(missingValue) || sortMissingLast(missingValue)) {
|
||||
return selectedValues;
|
||||
} else {
|
||||
return new ReplaceMissing(selectedValues, missingBytes);
|
||||
}
|
||||
|
||||
public BytesRef value(int slot) {
|
||||
// TODO: When serializing the response to the coordinating node, we lose the information about
|
||||
// whether the comparator sorts missing docs first or last. We should fix it and let
|
||||
// TopDocs.merge deal with it (it knows how to)
|
||||
BytesRef value = super.value(slot);
|
||||
if (value == null) {
|
||||
value = missingBytes;
|
||||
}
|
||||
return value;
|
||||
}
|
||||
|
||||
public BytesRef value(int slot) {
|
||||
// TODO: When serializing the response to the coordinating node, we lose the information about
|
||||
// whether the comparator sorts missing docs first or last. We should fix it and let
|
||||
// TopDocs.merge deal with it (it knows how to)
|
||||
BytesRef value = super.value(slot);
|
||||
if (value == null) {
|
||||
value = missingBytes;
|
||||
}
|
||||
|
||||
};
|
||||
}
|
||||
return value;
|
||||
}
|
||||
|
||||
};
|
||||
}
|
||||
|
||||
final BytesRef nullPlaceHolder = new BytesRef();
|
||||
|
@ -153,6 +152,65 @@ public class BytesRefFieldComparatorSource extends IndexFieldData.XFieldComparat
|
|||
|
||||
};
|
||||
}
|
||||
|
||||
/**
|
||||
* A view of a SortedDocValues where missing values
|
||||
* are replaced with the specified term
|
||||
*/
|
||||
// TODO: move this out if we need it for other reasons
|
||||
static class ReplaceMissing extends SortedDocValues {
|
||||
final SortedDocValues in;
|
||||
final int substituteOrd;
|
||||
final BytesRef substituteTerm;
|
||||
final boolean exists;
|
||||
|
||||
ReplaceMissing(SortedDocValues in, BytesRef term) {
|
||||
this.in = in;
|
||||
this.substituteTerm = term;
|
||||
int sub = in.lookupTerm(term);
|
||||
if (sub < 0) {
|
||||
substituteOrd = -sub-1;
|
||||
exists = false;
|
||||
} else {
|
||||
substituteOrd = sub;
|
||||
exists = true;
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public int getOrd(int docID) {
|
||||
int ord = in.getOrd(docID);
|
||||
if (ord < 0) {
|
||||
return substituteOrd;
|
||||
} else if (exists == false && ord >= substituteOrd) {
|
||||
return ord + 1;
|
||||
} else {
|
||||
return ord;
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public int getValueCount() {
|
||||
if (exists) {
|
||||
return in.getValueCount();
|
||||
} else {
|
||||
return in.getValueCount() + 1;
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public BytesRef lookupOrd(int ord) {
|
||||
if (ord == substituteOrd) {
|
||||
return substituteTerm;
|
||||
} else if (exists == false && ord > substituteOrd) {
|
||||
return in.lookupOrd(ord-1);
|
||||
} else {
|
||||
return in.lookupOrd(ord);
|
||||
}
|
||||
}
|
||||
|
||||
// we let termsenum etc fall back to the default implementation
|
||||
}
|
||||
|
||||
static {
|
||||
assert Lucene.VERSION == Version.LUCENE_4_9 : "The comparator below is a raw copy of Lucene's, remove it when upgrading to 4.10";
|
||||
|
|
|
@ -0,0 +1,110 @@
|
|||
/*
|
||||
* Licensed to Elasticsearch under one or more contributor
|
||||
* license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright
|
||||
* ownership. Elasticsearch licenses this file to you under
|
||||
* the Apache License, Version 2.0 (the "License"); you may
|
||||
* not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing,
|
||||
* software distributed under the License is distributed on an
|
||||
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
* KIND, either express or implied. See the License for the
|
||||
* specific language governing permissions and limitations
|
||||
* under the License.
|
||||
*/
|
||||
|
||||
package org.elasticsearch.index.fielddata.fieldcomparator;
|
||||
|
||||
import org.apache.lucene.document.Document;
|
||||
import org.apache.lucene.document.SortedDocValuesField;
|
||||
import org.apache.lucene.index.*;
|
||||
import org.apache.lucene.store.Directory;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.LuceneTestCase.SuppressCodecs;
|
||||
import org.elasticsearch.test.ElasticsearchLuceneTestCase;
|
||||
|
||||
@SuppressCodecs({ "Lucene3x", "Lucene40", "Lucene41", "Lucene42" }) // these codecs dont support missing values
|
||||
public class TestReplaceMissing extends ElasticsearchLuceneTestCase {
|
||||
|
||||
public void test() throws Exception {
|
||||
Directory dir = newDirectory();
|
||||
IndexWriterConfig iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, null);
|
||||
iwc.setMergePolicy(newLogMergePolicy());
|
||||
IndexWriter iw = new IndexWriter(dir, iwc);
|
||||
|
||||
Document doc = new Document();
|
||||
doc.add(new SortedDocValuesField("field", new BytesRef("cat")));
|
||||
iw.addDocument(doc);
|
||||
|
||||
doc = new Document();
|
||||
iw.addDocument(doc);
|
||||
|
||||
doc = new Document();
|
||||
doc.add(new SortedDocValuesField("field", new BytesRef("dog")));
|
||||
iw.addDocument(doc);
|
||||
iw.forceMerge(1);
|
||||
iw.close();
|
||||
|
||||
DirectoryReader reader = DirectoryReader.open(dir);
|
||||
AtomicReader ar = getOnlySegmentReader(reader);
|
||||
SortedDocValues raw = ar.getSortedDocValues("field");
|
||||
assertEquals(2, raw.getValueCount());
|
||||
|
||||
// existing values
|
||||
SortedDocValues dv = new BytesRefFieldComparatorSource.ReplaceMissing(raw, new BytesRef("cat"));
|
||||
assertEquals(2, dv.getValueCount());
|
||||
assertEquals("cat", dv.lookupOrd(0).utf8ToString());
|
||||
assertEquals("dog", dv.lookupOrd(1).utf8ToString());
|
||||
|
||||
assertEquals(0, dv.getOrd(0));
|
||||
assertEquals(0, dv.getOrd(1));
|
||||
assertEquals(1, dv.getOrd(2));
|
||||
|
||||
dv = new BytesRefFieldComparatorSource.ReplaceMissing(raw, new BytesRef("dog"));
|
||||
assertEquals(2, dv.getValueCount());
|
||||
assertEquals("cat", dv.lookupOrd(0).utf8ToString());
|
||||
assertEquals("dog", dv.lookupOrd(1).utf8ToString());
|
||||
|
||||
assertEquals(0, dv.getOrd(0));
|
||||
assertEquals(1, dv.getOrd(1));
|
||||
assertEquals(1, dv.getOrd(2));
|
||||
|
||||
// non-existing values
|
||||
dv = new BytesRefFieldComparatorSource.ReplaceMissing(raw, new BytesRef("apple"));
|
||||
assertEquals(3, dv.getValueCount());
|
||||
assertEquals("apple", dv.lookupOrd(0).utf8ToString());
|
||||
assertEquals("cat", dv.lookupOrd(1).utf8ToString());
|
||||
assertEquals("dog", dv.lookupOrd(2).utf8ToString());
|
||||
|
||||
assertEquals(1, dv.getOrd(0));
|
||||
assertEquals(0, dv.getOrd(1));
|
||||
assertEquals(2, dv.getOrd(2));
|
||||
|
||||
dv = new BytesRefFieldComparatorSource.ReplaceMissing(raw, new BytesRef("company"));
|
||||
assertEquals(3, dv.getValueCount());
|
||||
assertEquals("cat", dv.lookupOrd(0).utf8ToString());
|
||||
assertEquals("company", dv.lookupOrd(1).utf8ToString());
|
||||
assertEquals("dog", dv.lookupOrd(2).utf8ToString());
|
||||
|
||||
assertEquals(0, dv.getOrd(0));
|
||||
assertEquals(1, dv.getOrd(1));
|
||||
assertEquals(2, dv.getOrd(2));
|
||||
|
||||
dv = new BytesRefFieldComparatorSource.ReplaceMissing(raw, new BytesRef("ebay"));
|
||||
assertEquals(3, dv.getValueCount());
|
||||
assertEquals("cat", dv.lookupOrd(0).utf8ToString());
|
||||
assertEquals("dog", dv.lookupOrd(1).utf8ToString());
|
||||
assertEquals("ebay", dv.lookupOrd(2).utf8ToString());
|
||||
|
||||
assertEquals(0, dv.getOrd(0));
|
||||
assertEquals(2, dv.getOrd(1));
|
||||
assertEquals(1, dv.getOrd(2));
|
||||
|
||||
reader.close();
|
||||
dir.close();
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue