Speed up string sort with custom missing value

Today if the user supplies a custom missing value for a string sort,
we do it in an extremely slow way, not using ordinals but dereferencing
bytes for every document. Ordinals are only used if the missing value
is _first or _last.

Instead, use ordinals with custom missing values too.

Closes #7005
This commit is contained in:
Robert Muir 2014-07-24 06:27:59 -04:00
parent f130d60b72
commit d8cd755445
2 changed files with 198 additions and 30 deletions

View File

@ -74,10 +74,6 @@ public class BytesRefFieldComparatorSource extends IndexFieldData.XFieldComparat
final boolean sortMissingLast = sortMissingLast(missingValue) ^ reversed;
final BytesRef missingBytes = (BytesRef) missingObject(missingValue, reversed);
if (indexFieldData instanceof IndexOrdinalsFieldData) {
// The ordinal-based comparator only supports sorting missing values first or last so when
// a missing value is provided we fall back to the (slow) value-based comparator
// TODO: handle arbitrary missing values via a selector
if (sortMissingFirst(missingValue) || sortMissingLast(missingValue)) {
return new FieldComparator.TermOrdValComparator(numHits, null, sortMissingLast) {
@Override
@ -91,7 +87,11 @@ public class BytesRefFieldComparatorSource extends IndexFieldData.XFieldComparat
final FixedBitSet innerDocs = nested.innerDocs(context);
selectedValues = sortMode.select(values, rootDocs, innerDocs);
}
if (sortMissingFirst(missingValue) || sortMissingLast(missingValue)) {
return selectedValues;
} else {
return new ReplaceMissing(selectedValues, missingBytes);
}
}
public BytesRef value(int slot) {
@ -107,7 +107,6 @@ public class BytesRefFieldComparatorSource extends IndexFieldData.XFieldComparat
};
}
}
final BytesRef nullPlaceHolder = new BytesRef();
final BytesRef nonNullMissingBytes = missingBytes == null ? nullPlaceHolder : missingBytes;
@ -154,6 +153,65 @@ public class BytesRefFieldComparatorSource extends IndexFieldData.XFieldComparat
};
}
/**
* A view of a SortedDocValues where missing values
* are replaced with the specified term
*/
// TODO: move this out if we need it for other reasons
static class ReplaceMissing extends SortedDocValues {
final SortedDocValues in;
final int substituteOrd;
final BytesRef substituteTerm;
final boolean exists;
ReplaceMissing(SortedDocValues in, BytesRef term) {
this.in = in;
this.substituteTerm = term;
int sub = in.lookupTerm(term);
if (sub < 0) {
substituteOrd = -sub-1;
exists = false;
} else {
substituteOrd = sub;
exists = true;
}
}
@Override
public int getOrd(int docID) {
int ord = in.getOrd(docID);
if (ord < 0) {
return substituteOrd;
} else if (exists == false && ord >= substituteOrd) {
return ord + 1;
} else {
return ord;
}
}
@Override
public int getValueCount() {
if (exists) {
return in.getValueCount();
} else {
return in.getValueCount() + 1;
}
}
@Override
public BytesRef lookupOrd(int ord) {
if (ord == substituteOrd) {
return substituteTerm;
} else if (exists == false && ord > substituteOrd) {
return in.lookupOrd(ord-1);
} else {
return in.lookupOrd(ord);
}
}
// we let termsenum etc fall back to the default implementation
}
static {
assert Lucene.VERSION == Version.LUCENE_4_9 : "The comparator below is a raw copy of Lucene's, remove it when upgrading to 4.10";
}

View File

@ -0,0 +1,110 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.index.fielddata.fieldcomparator;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.SortedDocValuesField;
import org.apache.lucene.index.*;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.LuceneTestCase.SuppressCodecs;
import org.elasticsearch.test.ElasticsearchLuceneTestCase;
@SuppressCodecs({ "Lucene3x", "Lucene40", "Lucene41", "Lucene42" }) // these codecs dont support missing values
public class TestReplaceMissing extends ElasticsearchLuceneTestCase {
public void test() throws Exception {
Directory dir = newDirectory();
IndexWriterConfig iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, null);
iwc.setMergePolicy(newLogMergePolicy());
IndexWriter iw = new IndexWriter(dir, iwc);
Document doc = new Document();
doc.add(new SortedDocValuesField("field", new BytesRef("cat")));
iw.addDocument(doc);
doc = new Document();
iw.addDocument(doc);
doc = new Document();
doc.add(new SortedDocValuesField("field", new BytesRef("dog")));
iw.addDocument(doc);
iw.forceMerge(1);
iw.close();
DirectoryReader reader = DirectoryReader.open(dir);
AtomicReader ar = getOnlySegmentReader(reader);
SortedDocValues raw = ar.getSortedDocValues("field");
assertEquals(2, raw.getValueCount());
// existing values
SortedDocValues dv = new BytesRefFieldComparatorSource.ReplaceMissing(raw, new BytesRef("cat"));
assertEquals(2, dv.getValueCount());
assertEquals("cat", dv.lookupOrd(0).utf8ToString());
assertEquals("dog", dv.lookupOrd(1).utf8ToString());
assertEquals(0, dv.getOrd(0));
assertEquals(0, dv.getOrd(1));
assertEquals(1, dv.getOrd(2));
dv = new BytesRefFieldComparatorSource.ReplaceMissing(raw, new BytesRef("dog"));
assertEquals(2, dv.getValueCount());
assertEquals("cat", dv.lookupOrd(0).utf8ToString());
assertEquals("dog", dv.lookupOrd(1).utf8ToString());
assertEquals(0, dv.getOrd(0));
assertEquals(1, dv.getOrd(1));
assertEquals(1, dv.getOrd(2));
// non-existing values
dv = new BytesRefFieldComparatorSource.ReplaceMissing(raw, new BytesRef("apple"));
assertEquals(3, dv.getValueCount());
assertEquals("apple", dv.lookupOrd(0).utf8ToString());
assertEquals("cat", dv.lookupOrd(1).utf8ToString());
assertEquals("dog", dv.lookupOrd(2).utf8ToString());
assertEquals(1, dv.getOrd(0));
assertEquals(0, dv.getOrd(1));
assertEquals(2, dv.getOrd(2));
dv = new BytesRefFieldComparatorSource.ReplaceMissing(raw, new BytesRef("company"));
assertEquals(3, dv.getValueCount());
assertEquals("cat", dv.lookupOrd(0).utf8ToString());
assertEquals("company", dv.lookupOrd(1).utf8ToString());
assertEquals("dog", dv.lookupOrd(2).utf8ToString());
assertEquals(0, dv.getOrd(0));
assertEquals(1, dv.getOrd(1));
assertEquals(2, dv.getOrd(2));
dv = new BytesRefFieldComparatorSource.ReplaceMissing(raw, new BytesRef("ebay"));
assertEquals(3, dv.getValueCount());
assertEquals("cat", dv.lookupOrd(0).utf8ToString());
assertEquals("dog", dv.lookupOrd(1).utf8ToString());
assertEquals("ebay", dv.lookupOrd(2).utf8ToString());
assertEquals(0, dv.getOrd(0));
assertEquals(2, dv.getOrd(1));
assertEquals(1, dv.getOrd(2));
reader.close();
dir.close();
}
}