From d8cd75544536a211fa5c66e190aa4313213f0308 Mon Sep 17 00:00:00 2001 From: Robert Muir Date: Thu, 24 Jul 2014 06:27:59 -0400 Subject: [PATCH] Speed up string sort with custom missing value Today if the user supplies a custom missing value for a string sort, we do it in an extremely slow way, not using ordinals but dereferencing bytes for every document. Ordinals are only used if the missing value is _first or _last. Instead, use ordinals with custom missing values too. Closes #7005 --- .../BytesRefFieldComparatorSource.java | 118 +++++++++++++----- .../fieldcomparator/TestReplaceMissing.java | 110 ++++++++++++++++ 2 files changed, 198 insertions(+), 30 deletions(-) create mode 100644 src/test/java/org/elasticsearch/index/fielddata/fieldcomparator/TestReplaceMissing.java diff --git a/src/main/java/org/elasticsearch/index/fielddata/fieldcomparator/BytesRefFieldComparatorSource.java b/src/main/java/org/elasticsearch/index/fielddata/fieldcomparator/BytesRefFieldComparatorSource.java index a4d2c273f50..940169f7813 100644 --- a/src/main/java/org/elasticsearch/index/fielddata/fieldcomparator/BytesRefFieldComparatorSource.java +++ b/src/main/java/org/elasticsearch/index/fielddata/fieldcomparator/BytesRefFieldComparatorSource.java @@ -74,39 +74,38 @@ public class BytesRefFieldComparatorSource extends IndexFieldData.XFieldComparat final boolean sortMissingLast = sortMissingLast(missingValue) ^ reversed; final BytesRef missingBytes = (BytesRef) missingObject(missingValue, reversed); if (indexFieldData instanceof IndexOrdinalsFieldData) { - // The ordinal-based comparator only supports sorting missing values first or last so when - // a missing value is provided we fall back to the (slow) value-based comparator - // TODO: handle arbitrary missing values via a selector - if (sortMissingFirst(missingValue) || sortMissingLast(missingValue)) { - return new FieldComparator.TermOrdValComparator(numHits, null, sortMissingLast) { - - @Override - protected SortedDocValues getSortedDocValues(AtomicReaderContext context, String field) throws IOException { - final RandomAccessOrds values = ((IndexOrdinalsFieldData) indexFieldData).load(context).getOrdinalsValues(); - final SortedDocValues selectedValues; - if (nested == null) { - selectedValues = sortMode.select(values); - } else { - final FixedBitSet rootDocs = nested.rootDocs(context); - final FixedBitSet innerDocs = nested.innerDocs(context); - selectedValues = sortMode.select(values, rootDocs, innerDocs); - } + return new FieldComparator.TermOrdValComparator(numHits, null, sortMissingLast) { + + @Override + protected SortedDocValues getSortedDocValues(AtomicReaderContext context, String field) throws IOException { + final RandomAccessOrds values = ((IndexOrdinalsFieldData) indexFieldData).load(context).getOrdinalsValues(); + final SortedDocValues selectedValues; + if (nested == null) { + selectedValues = sortMode.select(values); + } else { + final FixedBitSet rootDocs = nested.rootDocs(context); + final FixedBitSet innerDocs = nested.innerDocs(context); + selectedValues = sortMode.select(values, rootDocs, innerDocs); + } + if (sortMissingFirst(missingValue) || sortMissingLast(missingValue)) { return selectedValues; + } else { + return new ReplaceMissing(selectedValues, missingBytes); } - - public BytesRef value(int slot) { - // TODO: When serializing the response to the coordinating node, we lose the information about - // whether the comparator sorts missing docs first or last. We should fix it and let - // TopDocs.merge deal with it (it knows how to) - BytesRef value = super.value(slot); - if (value == null) { - value = missingBytes; - } - return value; + } + + public BytesRef value(int slot) { + // TODO: When serializing the response to the coordinating node, we lose the information about + // whether the comparator sorts missing docs first or last. We should fix it and let + // TopDocs.merge deal with it (it knows how to) + BytesRef value = super.value(slot); + if (value == null) { + value = missingBytes; } - - }; - } + return value; + } + + }; } final BytesRef nullPlaceHolder = new BytesRef(); @@ -153,6 +152,65 @@ public class BytesRefFieldComparatorSource extends IndexFieldData.XFieldComparat }; } + + /** + * A view of a SortedDocValues where missing values + * are replaced with the specified term + */ + // TODO: move this out if we need it for other reasons + static class ReplaceMissing extends SortedDocValues { + final SortedDocValues in; + final int substituteOrd; + final BytesRef substituteTerm; + final boolean exists; + + ReplaceMissing(SortedDocValues in, BytesRef term) { + this.in = in; + this.substituteTerm = term; + int sub = in.lookupTerm(term); + if (sub < 0) { + substituteOrd = -sub-1; + exists = false; + } else { + substituteOrd = sub; + exists = true; + } + } + + @Override + public int getOrd(int docID) { + int ord = in.getOrd(docID); + if (ord < 0) { + return substituteOrd; + } else if (exists == false && ord >= substituteOrd) { + return ord + 1; + } else { + return ord; + } + } + + @Override + public int getValueCount() { + if (exists) { + return in.getValueCount(); + } else { + return in.getValueCount() + 1; + } + } + + @Override + public BytesRef lookupOrd(int ord) { + if (ord == substituteOrd) { + return substituteTerm; + } else if (exists == false && ord > substituteOrd) { + return in.lookupOrd(ord-1); + } else { + return in.lookupOrd(ord); + } + } + + // we let termsenum etc fall back to the default implementation + } static { assert Lucene.VERSION == Version.LUCENE_4_9 : "The comparator below is a raw copy of Lucene's, remove it when upgrading to 4.10"; diff --git a/src/test/java/org/elasticsearch/index/fielddata/fieldcomparator/TestReplaceMissing.java b/src/test/java/org/elasticsearch/index/fielddata/fieldcomparator/TestReplaceMissing.java new file mode 100644 index 00000000000..3241d3fb3bd --- /dev/null +++ b/src/test/java/org/elasticsearch/index/fielddata/fieldcomparator/TestReplaceMissing.java @@ -0,0 +1,110 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.elasticsearch.index.fielddata.fieldcomparator; + +import org.apache.lucene.document.Document; +import org.apache.lucene.document.SortedDocValuesField; +import org.apache.lucene.index.*; +import org.apache.lucene.store.Directory; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.LuceneTestCase.SuppressCodecs; +import org.elasticsearch.test.ElasticsearchLuceneTestCase; + +@SuppressCodecs({ "Lucene3x", "Lucene40", "Lucene41", "Lucene42" }) // these codecs dont support missing values +public class TestReplaceMissing extends ElasticsearchLuceneTestCase { + + public void test() throws Exception { + Directory dir = newDirectory(); + IndexWriterConfig iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, null); + iwc.setMergePolicy(newLogMergePolicy()); + IndexWriter iw = new IndexWriter(dir, iwc); + + Document doc = new Document(); + doc.add(new SortedDocValuesField("field", new BytesRef("cat"))); + iw.addDocument(doc); + + doc = new Document(); + iw.addDocument(doc); + + doc = new Document(); + doc.add(new SortedDocValuesField("field", new BytesRef("dog"))); + iw.addDocument(doc); + iw.forceMerge(1); + iw.close(); + + DirectoryReader reader = DirectoryReader.open(dir); + AtomicReader ar = getOnlySegmentReader(reader); + SortedDocValues raw = ar.getSortedDocValues("field"); + assertEquals(2, raw.getValueCount()); + + // existing values + SortedDocValues dv = new BytesRefFieldComparatorSource.ReplaceMissing(raw, new BytesRef("cat")); + assertEquals(2, dv.getValueCount()); + assertEquals("cat", dv.lookupOrd(0).utf8ToString()); + assertEquals("dog", dv.lookupOrd(1).utf8ToString()); + + assertEquals(0, dv.getOrd(0)); + assertEquals(0, dv.getOrd(1)); + assertEquals(1, dv.getOrd(2)); + + dv = new BytesRefFieldComparatorSource.ReplaceMissing(raw, new BytesRef("dog")); + assertEquals(2, dv.getValueCount()); + assertEquals("cat", dv.lookupOrd(0).utf8ToString()); + assertEquals("dog", dv.lookupOrd(1).utf8ToString()); + + assertEquals(0, dv.getOrd(0)); + assertEquals(1, dv.getOrd(1)); + assertEquals(1, dv.getOrd(2)); + + // non-existing values + dv = new BytesRefFieldComparatorSource.ReplaceMissing(raw, new BytesRef("apple")); + assertEquals(3, dv.getValueCount()); + assertEquals("apple", dv.lookupOrd(0).utf8ToString()); + assertEquals("cat", dv.lookupOrd(1).utf8ToString()); + assertEquals("dog", dv.lookupOrd(2).utf8ToString()); + + assertEquals(1, dv.getOrd(0)); + assertEquals(0, dv.getOrd(1)); + assertEquals(2, dv.getOrd(2)); + + dv = new BytesRefFieldComparatorSource.ReplaceMissing(raw, new BytesRef("company")); + assertEquals(3, dv.getValueCount()); + assertEquals("cat", dv.lookupOrd(0).utf8ToString()); + assertEquals("company", dv.lookupOrd(1).utf8ToString()); + assertEquals("dog", dv.lookupOrd(2).utf8ToString()); + + assertEquals(0, dv.getOrd(0)); + assertEquals(1, dv.getOrd(1)); + assertEquals(2, dv.getOrd(2)); + + dv = new BytesRefFieldComparatorSource.ReplaceMissing(raw, new BytesRef("ebay")); + assertEquals(3, dv.getValueCount()); + assertEquals("cat", dv.lookupOrd(0).utf8ToString()); + assertEquals("dog", dv.lookupOrd(1).utf8ToString()); + assertEquals("ebay", dv.lookupOrd(2).utf8ToString()); + + assertEquals(0, dv.getOrd(0)); + assertEquals(2, dv.getOrd(1)); + assertEquals(1, dv.getOrd(2)); + + reader.close(); + dir.close(); + } +}