mirror of https://github.com/apache/lucene.git
buffer up per-doc sortedbytes ordinals in packedints too
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene4547@1440377 13f79535-47bb-0310-9956-ffa450edef68
parent 3ecb5d8831
commit 7143779b04
--- a/SortedDocValuesWriter.java
+++ b/SortedDocValuesWriter.java
@@ -31,20 +31,19 @@ import org.apache.lucene.util.BytesRefHash.DirectBytesStartArray;
 import org.apache.lucene.util.BytesRefHash;
 import org.apache.lucene.util.Counter;
 import org.apache.lucene.util.RamUsageEstimator;
+import org.apache.lucene.util.packed.AppendingLongBuffer;
 
 
 /** Buffers up pending byte[] per doc, deref and sorting via
  * int ord, then flushes when segment flushes. */
 class SortedDocValuesWriter extends DocValuesWriter {
   final BytesRefHash hash;
-  private int[] pending = new int[DEFAULT_PENDING_SIZE];
-  private int pendingIndex = 0;
+  private AppendingLongBuffer pending;
   private final Counter iwBytesUsed;
-  private long bytesUsed;
+  private long bytesUsed; // this currently only tracks differences in 'pending'
   private final FieldInfo fieldInfo;
 
   private static final BytesRef EMPTY = new BytesRef(BytesRef.EMPTY_BYTES);
-  private static final int DEFAULT_PENDING_SIZE = 16;
 
   public SortedDocValuesWriter(FieldInfo fieldInfo, Counter iwBytesUsed) {
     this.fieldInfo = fieldInfo;
@@ -54,11 +53,13 @@ class SortedDocValuesWriter extends DocValuesWriter {
                   new ByteBlockPool.DirectTrackingAllocator(iwBytesUsed)),
                   BytesRefHash.DEFAULT_CAPACITY,
                   new DirectBytesStartArray(BytesRefHash.DEFAULT_CAPACITY, iwBytesUsed));
-    iwBytesUsed.addAndGet(RamUsageEstimator.NUM_BYTES_ARRAY_HEADER + RamUsageEstimator.NUM_BYTES_INT * DEFAULT_PENDING_SIZE);
+    pending = new AppendingLongBuffer();
+    bytesUsed = pending.ramBytesUsed();
+    iwBytesUsed.addAndGet(bytesUsed);
   }
 
   public void addValue(int docID, BytesRef value) {
-    if (docID < pendingIndex) {
+    if (docID < pending.size()) {
       throw new IllegalArgumentException("DocValuesField \"" + fieldInfo.name + "\" appears more than once in this document (only one value is allowed per field)");
     }
     if (value == null) {
@@ -69,7 +70,7 @@ class SortedDocValuesWriter extends DocValuesWriter {
     }
 
     // Fill in any holes:
-    while(pendingIndex < docID) {
+    while(pending.size() < docID) {
       addOneValue(EMPTY);
     }
 
@@ -78,7 +79,8 @@ class SortedDocValuesWriter extends DocValuesWriter {
 
   @Override
   public void finish(int maxDoc) {
-    if (pendingIndex < maxDoc) {
+    // nocommit: WTF.. why is this not a while but an if?
+    if (pending.size() < maxDoc) {
       addOneValue(EMPTY);
     }
   }
@@ -89,12 +91,14 @@ class SortedDocValuesWriter extends DocValuesWriter {
       ord = -ord-1;
     }
 
-    if (pendingIndex <= pending.length) {
-      int pendingLen = pending.length;
-      pending = ArrayUtil.grow(pending, pendingIndex+1);
-      iwBytesUsed.addAndGet((pending.length - pendingLen) * RamUsageEstimator.NUM_BYTES_INT);
-    }
-    pending[pendingIndex++] = ord;
+    pending.add(ord);
+    updateBytesUsed();
+  }
+
+  private void updateBytesUsed() {
+    final long newBytesUsed = pending.ramBytesUsed();
+    iwBytesUsed.addAndGet(newBytesUsed - bytesUsed);
+    bytesUsed = newBytesUsed;
   }
 
   @Override
@@ -102,7 +106,7 @@ class SortedDocValuesWriter extends DocValuesWriter {
     final int maxDoc = state.segmentInfo.getDocCount();
 
     final int emptyOrd;
-    if (pendingIndex < maxDoc) {
+    if (pending.size() < maxDoc) {
       // Make sure we added EMPTY value before sorting:
       int ord = hash.add(EMPTY);
       if (ord < 0) {
@@ -125,8 +129,6 @@ class SortedDocValuesWriter extends DocValuesWriter {
       ordMap[sortedValues[ord]] = ord;
     }
 
-    final int bufferedDocCount = pendingIndex;
-
     dvConsumer.addSortedField(fieldInfo,
 
                               // ord -> value
@@ -141,7 +143,7 @@ class SortedDocValuesWriter extends DocValuesWriter {
                               new Iterable<Number>() {
                                 @Override
                                 public Iterator<Number> iterator() {
-                                  return new OrdsIterator(ordMap, bufferedDocCount, maxDoc, emptyOrd);
+                                  return new OrdsIterator(ordMap, maxDoc, emptyOrd);
                                 }
                               });
   }
@@ -185,15 +187,15 @@ class SortedDocValuesWriter extends DocValuesWriter {
 
   // iterates over the ords for each doc we have in ram
   private class OrdsIterator implements Iterator<Number> {
+    final AppendingLongBuffer.Iterator iter = pending.iterator();
     final int ordMap[];
-    final int size;
+    final int size = pending.size();
     final int maxDoc;
     final int emptyOrd; // nocommit
     int docUpto;
 
-    OrdsIterator(int ordMap[], int size, int maxDoc, int emptyOrd) {
+    OrdsIterator(int ordMap[], int maxDoc, int emptyOrd) {
       this.ordMap = ordMap;
-      this.size = size;
       this.maxDoc = maxDoc;
       this.emptyOrd = emptyOrd;
     }
@@ -210,7 +212,7 @@ class SortedDocValuesWriter extends DocValuesWriter {
       }
       int ord;
       if (docUpto < size) {
-        ord = pending[docUpto];
+        ord = (int) iter.next();
       } else {
         ord = emptyOrd;
       }
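The substance of the diff above is swapping the writer's growable int[] (plus manual ArrayUtil.grow and per-growth byte accounting) for Lucene's packed AppendingLongBuffer, so the per-document ordinal stream is buffered in packed ints. A minimal sketch of that buffering pattern follows; the class name and sample ords are illustrative, and it assumes only the AppendingLongBuffer methods the diff itself exercises (add, size, ramBytesUsed, iterator/next).

import org.apache.lucene.util.packed.AppendingLongBuffer;

// Sketch of the buffering pattern adopted above: append one ord per document
// into a packed buffer, track its heap usage, then stream the ords back out
// in doc order, as the new OrdsIterator does at flush time.
public class OrdBufferSketch {
  public static void main(String[] args) {
    AppendingLongBuffer pending = new AppendingLongBuffer();

    // One ord per doc; small non-negative values pack into a few bits each,
    // versus a fixed 4 bytes per doc in the old int[].
    for (int ord : new int[] {3, 0, 2, 2, 1}) {
      pending.add(ord);
    }

    System.out.println("buffered docs: " + pending.size());
    System.out.println("heap bytes: " + pending.ramBytesUsed());

    // Sequential read-back in doc order.
    AppendingLongBuffer.Iterator iter = pending.iterator();
    for (int doc = 0; doc < pending.size(); doc++) {
      System.out.println("doc " + doc + " -> ord " + (int) iter.next());
    }
  }
}

The bytes-used bookkeeping gets simpler too: rather than charging the IndexWriter on every array growth, the writer just diffs pending.ramBytesUsed() against its last reading, which is what the new updateBytesUsed() does.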
--- /dev/null
+++ b/Test2BSortedDocValues.java
@@ -0,0 +1,97 @@
+package org.apache.lucene.index;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.analysis.MockAnalyzer;
+import org.apache.lucene.document.BinaryDocValuesField;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.SortedDocValuesField;
+import org.apache.lucene.store.BaseDirectoryWrapper;
+import org.apache.lucene.store.ByteArrayDataInput;
+import org.apache.lucene.store.ByteArrayDataOutput;
+import org.apache.lucene.store.MockDirectoryWrapper;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.LuceneTestCase;
+import org.apache.lucene.util.TimeUnits;
+import org.apache.lucene.util._TestUtil;
+import org.junit.Ignore;
+
+import com.carrotsearch.randomizedtesting.annotations.TimeoutSuite;
+
+@TimeoutSuite(millis = 80 * TimeUnits.HOUR)
+@Ignore("takes ?? minutes")
+public class Test2BSortedDocValues extends LuceneTestCase {
+
+  // indexes Integer.MAX_VALUE docs with a fixed binary field
+  public void testFixedSorted() throws Exception {
+    BaseDirectoryWrapper dir = newFSDirectory(_TestUtil.getTempDir("2BFixedSorted"));
+    if (dir instanceof MockDirectoryWrapper) {
+      ((MockDirectoryWrapper)dir).setThrottling(MockDirectoryWrapper.Throttling.NEVER);
+    }
+
+    IndexWriter w = new IndexWriter(dir,
+        new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random()))
+         .setMaxBufferedDocs(IndexWriterConfig.DISABLE_AUTO_FLUSH)
+         .setRAMBufferSizeMB(256.0)
+         .setMergeScheduler(new ConcurrentMergeScheduler())
+         .setMergePolicy(newLogMergePolicy(false, 10))
+         .setOpenMode(IndexWriterConfig.OpenMode.CREATE));
+
+    Document doc = new Document();
+    byte bytes[] = new byte[2];
+    BytesRef data = new BytesRef(bytes);
+    SortedDocValuesField dvField = new SortedDocValuesField("dv", data);
+    doc.add(dvField);
+
+    for (int i = 0; i < Integer.MAX_VALUE; i++) {
+      bytes[0] = (byte)(i >> 8);
+      bytes[1] = (byte) i;
+      w.addDocument(doc);
+      if (i % 100000 == 0) {
+        System.out.println("indexed: " + i);
+        System.out.flush();
+      }
+    }
+
+    w.forceMerge(1);
+    w.close();
+
+    System.out.println("verifying...");
+    System.out.flush();
+
+    DirectoryReader r = DirectoryReader.open(dir);
+    int expectedValue = 0;
+    for (AtomicReaderContext context : r.leaves()) {
+      AtomicReader reader = context.reader();
+      BytesRef scratch = new BytesRef();
+      BinaryDocValues dv = reader.getSortedDocValues("dv");
+      for (int i = 0; i < reader.maxDoc(); i++) {
+        bytes[0] = (byte)(expectedValue >> 8);
+        bytes[1] = (byte) expectedValue;
+        dv.get(i, scratch);
+        assertEquals(data, scratch);
+        expectedValue++;
+      }
+    }
+
+    r.close();
+    dir.close();
+  }
+
+  // TODO: variable, and also Test2BOrds
+}
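A note on the new test's value encoding: each document stores the big-endian low 16 bits of its doc id, so the roughly 2.1 billion documents share only 65536 distinct values; that is exactly the dedup-heavy shape where one small packed ord per document beats a 4-byte int per document. A standalone sketch of that encoding (class and method names are mine, not part of the commit):

import java.util.Arrays;

// The 2-byte encoding used by testFixedSorted: doc i stores the big-endian
// low 16 bits of i, so values repeat every 65536 docs and the BytesRefHash
// dedups them down to at most 65536 ords.
public class TwoByteEncodingSketch {
  static byte[] encode(int i) {
    return new byte[] { (byte) (i >> 8), (byte) i };
  }

  public static void main(String[] args) {
    System.out.println(Arrays.toString(encode(258)));            // [1, 2]
    System.out.println(Arrays.equals(encode(0), encode(65536))); // true: same value, same ord
  }
}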